# CS 583 Project 1

### Fangda Fan, Xiaohan Liu

- Implement: MS-Apriori (excluding rule generation)
- Consider: multiple minimum supports, support difference constraint, and item constraints
- Item constraints: Two types
    - Cannot-be-together: no two items from the same set may appear together in an itemset (pairwise constraint),
        - e.g., {1, 2, 3} and {6, 7, 9, 10}
    - Must-have: every itemset must contain at least one of the listed items,
        - e.g., (1 or 2)
- Deadline: Feb 9, 2017 

In [1]:
import numpy as np
import scipy.sparse as sp
import pandas as pd

## Read Data and Arguments

### Read Arguments

In [2]:
def input_args(file_args):
    """Parse the parameter file into MIS values, SDC and item constraints.

    Recognised lines (anything else is ignored):
      - ``MIS(<item>) = <value>``   minimum item support of one item
      - ``SDC = <value>``           support difference constraint
      - ``cannot_be_together: {a, b}, {c, d}``
      - ``must-have: a or b or c``

    Args:
        file_args: path of the parameter file.

    Returns:
        dict with keys
          "ms_i":     array of item labels (strings), sorted by ascending MIS,
          "ms_val":   array of MIS values in the same order,
          "sdc":      float SDC (1 when the file has no SDC line),
          "x_cannot": list of lists of item labels,
          "x_must":   list of item labels.
    """
    ms_i = []
    ms_val = []
    sdc = 1
    x_cannot = []
    x_must = []
    # The context manager closes the handle even when a line fails to parse.
    with open(file_args, "r") as fh:
        for line in fh:
            # Strip all surrounding whitespace so "\r\n" files parse as well.
            line = line.strip()
            if line.startswith("MIS"):
                # Tolerate any spacing around "=", as the SDC branch does.
                key, _, value = line.partition("=")
                ms_i.append(key.strip()[4:-1])  # "MIS(100)" -> "100"
                ms_val.append(float(value))
            elif line.startswith("SDC"):
                sdc = float(line.split("=")[1])
            elif line.startswith("cannot_be_together"):
                # "{a, b}, {c, d}" -> [["a", "b"], ["c", "d"]]
                groups = line.split(": ")[1][1:-1].split("}, {")
                x_cannot = [group.split(", ") for group in groups]
            elif line.startswith("must"):
                x_must = line.split(": ")[1].split(" or ")
    # Stable sort keeps file order among items sharing the same MIS.
    ms_sort = np.argsort(ms_val, kind="stable")
    op = {
        "ms_i": np.array(ms_i)[ms_sort],
        "ms_val": np.array(ms_val)[ms_sort],
        "sdc": sdc,
        "x_cannot": x_cannot,
        "x_must": x_must,
    }
    return op

In [3]:
# Paths of the parameter file and the transaction file to process.
file_args = "proj1_parameter-file.txt"
file_data = "proj1_input-data.txt"
# Alternative input pair, kept here for switching datasets by hand.
#file_args = "proj1_ex13_args.txt"
#file_data = "proj1_ex13_data.txt"

# Parse the parameter file; the bare name on the last line displays the result.
args = input_args(file_args)
args

{'ms_i': array(['100', '140', '70', '80', '90', '120', '20', '30', '60', '40', '50',
        '10'], 
       dtype='<U3'),
 'ms_val': array([ 0.1 ,  0.15,  0.2 ,  0.2 ,  0.2 ,  0.2 ,  0.3 ,  0.3 ,  0.3 ,
         0.4 ,  0.4 ,  0.43]),
 'sdc': 0.1,
 'x_cannot': [['20', '40'], ['70', '80']],
 'x_must': ['20', '40', '50']}

In [4]:
# Map every item label to its position in the MIS-sorted item array.
id_dict = {label: position for position, label in enumerate(args["ms_i"])}
# Must-have items, translated from labels to positions.
x_must = [id_dict[label] for label in args["x_must"]]
# Each cannot-be-together group as an ascending tuple of positions.
x_cannot = [
    tuple(np.sort([id_dict[label] for label in group]))
    for group in args["x_cannot"]
]
print("Index-item dictionary:", args["ms_i"])
print("MIS index", args["ms_val"])
print("Cannot-be-Together index:",  x_cannot)
print("Must-have:", x_must)

Index-item dictionary: ['100' '140' '70' '80' '90' '120' '20' '30' '60' '40' '50' '10']
MIS index [ 0.1   0.15  0.2   0.2   0.2   0.2   0.3   0.3   0.3   0.4   0.4   0.43]
Cannot-be-Together index: [(6, 9), (2, 3)]
Must-have: [6, 9, 10]


### Read Transaction Data

In [5]:
def input_data(file_data, columns):
    """Read the transaction file into a 0/1 transaction-by-item table.

    Each line of the file is one transaction written as ``{a, b, c}``.

    Args:
        file_data: path of the transaction file.
        columns: item labels (strings) giving the output column order; items
            never seen in the data get an all-zero column, items not listed
            are dropped.

    Returns:
        pandas DataFrame with one row per transaction and one column per
        entry of ``columns``; a cell is 1 when the item is in the transaction.
    """
    # read_csv(..., squeeze=True) was removed in pandas 2.0; squeezing the
    # single-column frame afterwards gives the same Series on every version.
    s = pd.read_csv(file_data, header=None, sep="\t", dtype=str).squeeze("columns")
    # Drop stray whitespace first so the [1:-1] slice removes exactly the braces.
    items = s.str.strip().str[1:-1]
    op = items.str.get_dummies(sep=", ").reindex(columns=columns, fill_value=0)
    return op

In [6]:
# Build the transaction-by-item table with columns in args["ms_i"] order.
da = input_data(file_data, args["ms_i"])
# Plain NumPy array of the table; sup() reads this global.
X = da.values
da

Unnamed: 0,100,140,70,80,90,120,20,30,60,40,50,10
0,0,0,1,1,1,0,1,1,0,0,1,0
1,0,0,1,1,0,0,1,0,0,0,0,1
2,0,0,0,1,0,0,1,0,0,0,0,1
3,0,0,0,1,0,0,1,1,0,0,0,0
4,0,0,0,1,0,0,1,0,0,0,0,0
5,1,1,1,1,1,1,1,1,0,0,1,0


In [7]:
def sup(xL, data=None):
    """Return the support (fraction of transactions) of each itemset.

    Args:
        xL: iterable of itemsets, each a sequence of column indices.
        data: optional 2-D transaction-by-item array; defaults to the
            module-level ``X``.

    Returns:
        1-D NumPy array with one support value per itemset, in iteration
        order of ``xL``; an empty array when ``xL`` is empty.
    """
    if data is None:
        data = X
    itemsets = list(xL)
    # np.mean([], axis=1) raises, so answer the empty case directly.
    if not itemsets:
        return np.empty(0)
    # A transaction contains an itemset when all of its columns are non-zero.
    contained = [data[:, i].all(axis=1) for i in itemsets]
    return np.mean(contained, axis=1)

In [8]:
# One 1-itemset (single-index tuple) per item, in MIS order.
I = [(position,) for position in range(len(args["ms_val"]))]
Isup = sup(I)
# Record every 1-itemset's support for later lookup.
sup_dict = {itemset: value for itemset, value in zip(I, Isup)}
I, sup_dict

([(0,), (1,), (2,), (3,), (4,), (5,), (6,), (7,), (8,), (9,), (10,), (11,)],
 {(0,): 0.16666666666666666,
  (1,): 0.16666666666666666,
  (2,): 0.5,
  (3,): 1.0,
  (4,): 0.33333333333333331,
  (5,): 0.16666666666666666,
  (6,): 1.0,
  (7,): 0.5,
  (8,): 0.0,
  (9,): 0.0,
  (10,): 0.33333333333333331,
  (11,): 0.33333333333333331})

## 2. Candidate Generation
### Level 1

In [9]:
# Init pass of MS-Apriori. An item is frequent when its support is at least
# its MIS (>=, matching frequent() and pair_sup_mis() below), not strictly
# greater.
# Li: position of the first item (in MIS order) that meets its own MIS.
# argmax returns 0 when no item qualifies.
Li = (Isup >= args["ms_val"]).argmax()
# L: item Li plus every later item whose support reaches MIS(Li).
L = [i for i in range(Li, len(args["ms_val"])) if Isup[i] >= args["ms_val"][Li]]
# F[0]: frequent 1-itemsets, i.e. items meeting their own MIS.
F = [[(i,) for i in np.where(Isup >= args["ms_val"])[0]]]
Li, L, F

(0,
 [0, 1, 2, 3, 4, 5, 6, 7, 10, 11],
 [[(0,), (1,), (2,), (3,), (4,), (6,), (7,)]])

### Level $\geq$ 2

In [10]:
def pair_sup_mis(x):
    """Pair up items that may be joined into a candidate.

    A pair ``(x[a], x[b])`` with ``a < b`` is returned when
      - sup(x[b]) >= MIS(x[a]), and
      - |sup(x[a]) - sup(x[b])| <= SDC (inclusive, as MS-Apriori states it).

    Args:
        x: 1-D NumPy array of item indices, in MIS order.

    Returns:
        list of 2-tuples of item indices, in row-major (a, then b) order.
    """
    # Fewer than two items cannot form a pair; this also keeps sup() away
    # from an empty input.
    if len(x) < 2:
        return []
    x_sup = sup(x[:, np.newaxis])
    x_mis = np.array([args["ms_val"][i] for i in x])
    # meets_mis[a, b]: support of the later item reaches the MIS of the earlier one.
    meets_mis = x_sup[np.newaxis, :] >= x_mis[:, np.newaxis]
    within_sdc = np.abs(x_sup[:, np.newaxis] - x_sup[np.newaxis, :]) <= args["sdc"]
    # Strict upper triangle keeps each unordered pair once, with a < b.
    rows, cols = np.nonzero(np.triu(meets_mis & within_sdc, 1))
    return list(zip(x[rows], x[cols]))
def frequent(xL):
    """Keep the candidates whose support reaches the MIS of their first item.

    Side effect: stores in ``sup_dict`` the support of every candidate and of
    every candidate's tail (the candidate without its first item).

    Args:
        xL: list of candidate itemsets (tuples of item indices).

    Returns:
        list of the candidates from ``xL`` that pass the threshold.
    """
    supports = sup(xL)
    sup_dict.update(zip(xL, supports))
    # The first item of each candidate supplies the threshold.
    thresholds = [args["ms_val"][itemset[0]] for itemset in xL]
    passing = np.where(supports >= thresholds)[0]
    kept = [xL[idx] for idx in passing]
    # Tail supports are needed for the "Tailcount" lines of the report.
    tails = {tuple(itemset[1:]) for itemset in xL}
    sup_dict.update(zip(tails, sup(tails)))
    return kept
def append_set(xL, x_base):
    """Prefix every itemset in ``xL`` with ``x_base``.

    Args:
        xL: sequence of equal-length itemsets.
        x_base: scalar or sequence of items placed in front of each itemset.

    Returns:
        list of tuples, one per entry of ``xL``; ``[]`` when ``xL`` is empty.
    """
    if not len(xL):
        return []
    # One copy of the prefix per itemset, then glue prefix and itemset row-wise.
    prefix = np.tile(x_base, (len(xL), 1))
    combined = np.hstack([prefix, xL])
    return [tuple(row) for row in combined]
def prune_candidate(xL):
    """Drop candidates that have an infrequent subset containing their first item.

    For every position except the first, the item at that position is removed
    from each candidate; the candidate survives only if each such reduced
    itemset is covered by some itemset of ``F[-1]``.

    Args:
        xL: list of equal-length candidate itemsets (tuples).

    Returns:
        list of the surviving candidates, in their original order.
    """
    if not xL:
        return []
    previous_level = F[-1]
    checks = []
    # Position 0 is never removed, so only subsets keeping the first item are tested.
    for position in range(1, len(xL[0])):
        reduced = np.delete(xL, position, axis=1)
        row_flags = []
        for subset in reduced:
            row_flags.append(any(set(known).issubset(subset) for known in previous_level))
        checks.append(row_flags)
    keep = np.all(checks, axis=0)
    return [xL[idx] for idx in np.where(keep)[0]]

In [11]:
# Level-2 candidates: pairs from L whose first item is itself frequent.
# Build the frequent-item set once instead of rebuilding an array per pair.
F1_items = {i[0] for i in F[0]}
# With no frequent item there can be no candidate, and pair_sup_mis is not called.
C = [i for i in pair_sup_mis(np.array(L)) if i[0] in F1_items] if F1_items else []
C

[(0, 1), (0, 5), (1, 5), (2, 7), (3, 6), (4, 10), (4, 11)]

In [12]:
# Level-wise search: test the current candidates, then join the survivors
# that share all but their last item into the next level's candidates.
while C:
    Fk = frequent(C)
    if not Fk:
        # Nothing survived: grouping an empty frame below would raise KeyError.
        C = []
        break
    F.append(Fk)
    Ls = pd.DataFrame(Fk)
    last = len(F) - 1  # column holding the last item of each itemset
    # Group by the shared prefix, pair up the last items, re-attach the prefix.
    C = [
        candidate
        for name, group in Ls.groupby(list(range(last)))[last]
        for candidate in append_set(pair_sup_mis(group.values), name)
    ]
    C = prune_candidate(C)
F

[[(0,), (1,), (2,), (3,), (4,), (6,), (7,)],
 [(0, 1), (0, 5), (1, 5), (2, 7), (3, 6), (4, 10)],
 [(0, 1, 5)]]

### Prune with Item Constraints

In [13]:
# Apply the item constraints to every level of F:
#   - must-have: keep an itemset only if it contains at least one must-have
#     item (no must-have items configured means no restriction);
#   - cannot-be-together: drop an itemset that contains two or more items of
#     the same cannot-be-together set (the constraint is pairwise).
# Uses `and not` rather than `& ~`: bitwise invert on a bool yields -1/-2 and
# only worked by accident.
F_prune = [
    [
        j for j in i
        if (not x_must or any(k in j for k in x_must))
        and not any(len(set(k).intersection(j)) > 1 for k in x_cannot)
    ]
    for i in F
]
F_prune

[[(6,)], [(3, 6), (4, 10)], []]

### Item Supports for Association Rule

In [14]:
# Display the supports recorded so far, keyed by item-index tuple.
sup_dict

{(0,): 0.16666666666666666,
 (0, 1): 0.16666666666666666,
 (0, 1, 5): 0.16666666666666666,
 (0, 5): 0.16666666666666666,
 (1,): 0.16666666666666666,
 (1, 5): 0.16666666666666666,
 (2,): 0.5,
 (2, 7): 0.33333333333333331,
 (3,): 1.0,
 (3, 6): 1.0,
 (4,): 0.33333333333333331,
 (4, 10): 0.33333333333333331,
 (4, 11): 0.0,
 (5,): 0.16666666666666666,
 (6,): 1.0,
 (7,): 0.5,
 (8,): 0.0,
 (9,): 0.0,
 (10,): 0.33333333333333331,
 (11,): 0.33333333333333331}

## Output

In [15]:
def output_frequent(F):
    """Print the frequent itemsets level by level.

    For each non-empty level the report lists every itemset with its
    transaction count, and for levels above 1 also the count of the itemset
    without its first item ("Tailcount"). Counts are recovered from the
    fractional supports in ``sup_dict``; item indices are translated back to
    item labels through ``args["ms_i"]``.

    Args:
        F: list of levels, each a list of itemsets (tuples of item indices).
    """
    n = len(X)
    op = []
    for i, ival in enumerate(F):
        if not ival:
            continue
        op.append("Frequent {}-itemsets\n".format(i+1))
        for j in ival:
            items = {int(args["ms_i"][k]) for k in j}
            # Round instead of truncating: support * n can land just below
            # the true integer count because of float round-off.
            line = "\t{} : {}".format(int(round(sup_dict[j] * n)), items)
            if i > 0:
                line += "\nTailcount = {}".format(int(round(sup_dict[j[1:]] * n)))
            op.append(line)
        op.append("\nTotal number of frequent {}-itemsets = {}\n\n".format(i+1, len(ival)))
    print("\n".join(op))
# Print the report for the constraint-filtered frequent itemsets.
output_frequent(F_prune)

Frequent 1-itemsets

	6 : {20}

Total number of frequent 1-itemsets = 1


Frequent 2-itemsets

	6 : {80, 20}
Tailcount = 6
	2 : {90, 50}
Tailcount = 2

Total number of frequent 2-itemsets = 2


