# Exercise sheet 3 Problem 2

In [232]:
# Read data from the SPECT dataset.
# The header line holds four whitespace-separated integers m, n, k, F.
# As used in this notebook: m = number of items, n = number of attributes
# per item, k = number of best groups listed in each output file.
# NOTE(review): F is presumably the parameter of the subgroup search
# (F = 1, 2, 3 below) -- confirm against the C++ tool.
# The `with` block guarantees the file handle is closed once D is built.
with open("SPECT-all-data") as SPECT:
    m, n, k, F = map(int, SPECT.readline().split())

    # Read in the database of items.
    # D is a list of (attribute list, target attribute) pairs; the item
    # with 1-based id i is stored at index i - 1.
    D = []
    for i in range(1, m + 1):
        # Re-encode each comma-separated integer field as a sign:
        # odd values (1) become +1, even values (0) become -1.
        line_data = [(-1)**(int(w) + 1) for w in SPECT.readline().split(',')]
        # First n fields are the attributes, the last field is the label.
        D.append((line_data[:n], line_data[-1]))

# Population proportion of positive labels:
p0 = sum((1 for d in D if d[1] == 1)) / m

The best subgroups for F = 1, 2, 3 were computed with a separate C++ program.
The next cell defines a helper that reads that program's output files.

In [233]:
def get_best_groups(path, num_groups=None):
    '''Read an output file and return the hypotheses it lists.

    Exactly ``num_groups`` lines are read from the start of the file.  Each
    line is split on whitespace; the first token is skipped (presumably a
    rank or score -- TODO confirm against the program that writes the file)
    and every remaining token is converted to a signed integer:

    * a token starting with ``x`` (e.g. ``x14``) gives the number after the
      ``x`` (``14``);
    * any other token (e.g. ``!x11``) has its first two characters dropped
      and the remaining number negated (``-11``).

    Parameters
    ----------
    path : str
        Path of the output file to read.
    num_groups : int, optional
        Number of lines to read.  Defaults to the module-level ``k``.

    Returns
    -------
    list of tuple of int
        One tuple per line read.  A line past the end of the file (or a
        blank line) yields an empty tuple.
    '''
    if num_groups is None:
        num_groups = k

    def parse_literal(token):
        # 'x<j>' -> +j ; otherwise (e.g. '!x<j>') -> -j
        return int(token[1:]) if token[0] == 'x' else -int(token[2:])

    best_groups = []
    # Context manager so the handle is closed even if parsing fails.
    with open(path) as results:
        for _line_no in range(num_groups):
            tokens = results.readline().split()
            best_groups.append(tuple(parse_literal(tok) for tok in tokens[1:]))
    return best_groups

Best hypotheses for F = 1, 2, 3:

In [234]:
# Load the best groups from the three output files (one per F value).
Fs = ["F1", "F2", "F3"]
best_groups = [get_best_groups(name) for name in Fs]

# List the k hypotheses of every file, numbered from 1.
for i, groups in enumerate(best_groups[:3], start=1):
    print("F =", i)
    for j, hyp in enumerate(groups[:k]):
        print(" ", j + 1, ":", hyp)

F = 1
  1 : (1, 14, 22)
  2 : (14, 22)
  3 : (1, -11, 14, 22)
  4 : (-11, 14, 22)
  5 : (1, 4, -11, 14)
  6 : (1, 22)
  7 : (1, 14, 17, 22)
  8 : (14, 17, 22)
  9 : (1, 9, 14, 22)
  10 : (1, 4, 14, 22)
F = 2
  1 : (1, 14, 22)
  2 : (14, 22)
  3 : (1, 22)
  4 : (1, 14)
  5 : (-22,)
  6 : (22,)
  7 : (1, 9, 14, 22)
  8 : (-19, -22)
  9 : (1, 4, 14)
  10 : (1, -11, 14, 22)
F = 3
  1 : (1, 14, 22)
  2 : (14, 22)
  3 : (1, 22)
  4 : (22,)
  5 : (1, 4, 14)
  6 : (1, 9, 14, 22)
  7 : (1, 14)
  8 : (1, 4, 14, 22)
  9 : (9, 14, 22)
  10 : (4, 14, 22)


In [235]:
import math

def ext(hyp):
    '''Yield the extension of a hypothesis.

    This is a generator over the indices of those items in D that
    satisfy hyp.

    A hypothesis is a tuple of integers from {-n, ... , n}, read in the
    natural way: (1, -2, 4) stands for x1!x2x4.
    '''
    for idx, item in enumerate(D):
        # An item is rejected as soon as one literal has the opposite sign
        # to the item's value for that attribute (negative product).
        violated = any(lit * item[0][abs(lit) - 1] < 0 for lit in hyp)
        if not violated:
            yield idx
            
def V(hyp):
    '''Calculate the z-score for hyp.

    With N the size of the extension of hyp, p the proportion of items in
    the extension whose target attribute equals 1, and p0 the module-level
    population proportion, the score is

        z = (p - p0) / sqrt(p0 * (1 - p0) / N)

    Returns
    -------
    float
        The z-score, or NaN when it is undefined: the extension is empty
        (N == 0) or the population variance p0 * (1 - p0) is zero.  NaN
        compares False against any threshold, so such a hypothesis is never
        reported as significant.
    '''
    extension = list(ext(hyp))
    size = len(extension)
    # Guard the two divisions below instead of raising ZeroDivisionError.
    if size == 0 or p0 * (1 - p0) == 0:
        return float('nan')
    num_positive = sum(1 for idx in extension if D[idx][1] == 1)
    p = num_positive / size
    return (p - p0) / math.sqrt(p0 * (1 - p0) / size)

Calculate z-scores for each discovered group, for each F:

In [248]:
# One table per F value: z-score of each discovered group and whether it
# exceeds 2.58 in absolute value.
for i, groups in enumerate(best_groups[:3], start=1):
    print("F =", i)
    print("\t\tz-score\t\tSignificant?\tHypothesis")
    print('\t', '-'*60)
    for j, hyp in enumerate(groups[:k]):
        z = V(hyp)
        significant = abs(z) > 2.58
        print("\t", j + 1, ":", "{:10.4f}".format(z), "\t", significant, "\t\t", hyp)
    print()

F = 1
		z-score		Significant?	Hypothesis
	 ------------------------------------------------------------
	 1 :     6.6404 	 True 		 (1, 14, 22)
	 2 :     6.4630 	 True 		 (14, 22)
	 3 :     6.0869 	 True 		 (1, -11, 14, 22)
	 4 :     6.0058 	 True 		 (-11, 14, 22)
	 5 :     6.0058 	 True 		 (1, 4, -11, 14)
	 6 :     5.9516 	 True 		 (1, 22)
	 7 :     5.9309 	 True 		 (1, 14, 17, 22)
	 8 :     5.9309 	 True 		 (14, 17, 22)
	 9 :     5.8518 	 True 		 (1, 9, 14, 22)
	 10 :     5.7945 	 True 		 (1, 4, 14, 22)

F = 2
		z-score		Significant?	Hypothesis
	 ------------------------------------------------------------
	 1 :     6.6404 	 True 		 (1, 14, 22)
	 2 :     6.4630 	 True 		 (14, 22)
	 3 :     5.9516 	 True 		 (1, 22)
	 4 :     5.1794 	 True 		 (1, 14)
	 5 :    -4.2131 	 True 		 (-22,)
	 6 :     5.5776 	 True 		 (22,)
	 7 :     5.8518 	 True 		 (1, 9, 14, 22)
	 8 :    -4.2708 	 True 		 (-19, -22)
	 9 :     5.5052 	 True 		 (1, 4, 14)
	 10 :     6.0869 	 True 		 (1, -11, 14, 22)

F = 3
		z