<a href="https://colab.research.google.com/github/DerekLeeCS/FreqML/blob/master/Assignment 7/Derek_Lee_Market_Basket.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Modules & Constants

---



In [None]:
import numpy as np
import pandas as pd

# Default thresholds. They are bound as default argument values by the
# functions defined below, so they only apply when a caller omits the argument.
support = 0.2  # default minSupport for apriori() / aprioriDF()
conf = 0.2  # default minConf for generateRules() / calcConf() / rulesFromConseq()

# Import Data

---



In [None]:
# Download the dataset CSV from the course repository into a DataFrame,
# then preview its first rows.
data = pd.read_csv(
    "https://raw.githubusercontent.com/DerekLeeCS/FreqML/master/"
    "Assignment%207/dataset.csv"
)
data.head()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Fungal infection,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,
1,Fungal infection,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
2,Fungal infection,itching,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
3,Fungal infection,itching,skin_rash,dischromic _patches,,,,,,,,,,,,,,
4,Fungal infection,itching,skin_rash,nodal_skin_eruptions,,,,,,,,,,,,,,


# Association Rule-Generation Functions

---

Code from Listing 11.3 of the textbook (Machine Learning in Action)

In [None]:
def generateRules(L, supportData, minConf=conf):
    """Generate association rules from the frequent itemsets in L.

    Args:
        L: list of lists of frozensets as returned by apriori()/aprioriDF();
            L[i] holds the frequent itemsets with i + 1 items.
        supportData: dict mapping frozenset -> support.
        minConf: minimum confidence a rule must reach to be kept.

    Returns:
        A list of (antecedent, consequent, confidence) tuples. Each accepted
        rule is also printed by calcConf().
    """
    bigRuleList = []
    # L[0] holds single items, which cannot be split into a rule, so start at 1.
    for i, itemsets in enumerate(L[1:], start=1):
        for freqSet in itemsets:
            H1 = [frozenset([item]) for item in freqSet]
            # Always score the single-item consequents. The textbook listing
            # skipped this step for itemsets with three or more items and went
            # straight to two-item consequents, so rules such as {a, b} --> {c}
            # were never evaluated.
            H1 = calcConf(freqSet, H1, supportData, bigRuleList, minConf)
            # Only consequents that met minConf can be part of a larger
            # consequent that also meets it, so extend just those.
            if i > 1 and len(H1) > 1:
                rulesFromConseq(freqSet, H1, supportData, bigRuleList, minConf)
    return bigRuleList

def calcConf(freqSet, H, supportData, brl, minConf=conf):
    """Score the rules (freqSet - conseq) --> conseq for every conseq in H.

    Confidence is support(freqSet) / support(freqSet - conseq). Every rule
    whose confidence is at least minConf is printed and appended to brl as an
    (antecedent, consequent, confidence) tuple.

    Args:
        freqSet: frozenset the rules are derived from.
        H: iterable of candidate consequents (frozensets).
        supportData: dict mapping frozenset -> support.
        brl: list that accepted rules are appended to (mutated in place).
        minConf: minimum confidence for a rule to be accepted.

    Returns:
        The list of consequents from H whose rule was accepted.
    """
    accepted = []
    itemset_support = supportData[freqSet]
    for conseq in H:
        antecedent = freqSet - conseq
        confidence = itemset_support / supportData[antecedent]
        if confidence < minConf:
            continue
        print(antecedent, '-->', conseq, 'conf:', confidence)
        brl.append((antecedent, conseq, confidence))
        accepted.append(conseq)
    return accepted

def rulesFromConseq(freqSet, H, supportData, brl, minConf=conf):
    """Recursively test consequents that are one item larger than those in H.

    The consequents in H are merged into candidates with one more item, the
    candidates are scored with calcConf(), and the recursion continues with
    the accepted ones while more than one remains.

    Note: H must be non-empty (its first element is used to read the current
    consequent size), and aprioriGen() must already be defined when this
    function is called.

    Args:
        freqSet: frozenset the rules are derived from.
        H: non-empty list of equally sized consequents (frozensets).
        supportData: dict mapping frozenset -> support.
        brl: list that accepted rules are appended to (mutated in place).
        minConf: minimum confidence for a rule to be accepted.
    """
    size = len(H[0])
    # A consequent of size + 1 items must leave at least one item over for
    # the antecedent; otherwise there is nothing to build.
    if len(freqSet) <= size + 1:
        return
    candidates = aprioriGen(H, size + 1)
    survivors = calcConf(freqSet, candidates, supportData, brl, minConf)
    if len(survivors) > 1:
        rulesFromConseq(freqSet, survivors, supportData, brl, minConf)

# Market Basket

---

I ended up rewriting the textbook code for Apriori.py so it would work with strings. 
<br><br>
The support for the data is very small (0.2 and up will give no rules). However, the confidence is very high. 
<br><br>
Surprisingly, there were no `disease -> symptom` or `symptom -> disease` relationships. There were only `symptom -> symptom` relationships. 
<br><br>
In hindsight, the reason for this is clear: there are 41 unique diseases and 120 examples for each disease, so each disease appears in only 1/41 (about 2.4%) of the rows. Thus, the only way for a disease to be involved in a rule is to lower the minimum support to 1/41 or below. However, that is a bad idea: most symptoms occur in more than one disease, so a single symptom usually occurs more often than a single disease, and such a low support threshold would also admit many symptom-only itemsets and create uninteresting rules.

In [None]:
def createC1DF(dataSet):
    """Build the size-1 candidate itemsets (C1) from a DataFrame.

    Every distinct non-missing value found in any column becomes one
    single-item frozenset.

    Args:
        dataSet: pandas DataFrame whose rows are transactions.

    Returns:
        A list of single-item frozensets, one per distinct value, in no
        particular order. A DataFrame without columns yields an empty list
        (the previous implementation raised IndexError in that case).
    """
    items = set()
    for col in dataSet.columns:
        # Missing cells are padding, not items, so drop them before
        # collecting the distinct values of the column.
        items.update(pd.unique(dataSet[col].dropna().values.ravel()))
    return [frozenset({item}) for item in items]

def scanDDF(D, Ck, minSupport):
    """Count how often each candidate itemset occurs in the rows of D.

    Args:
        D: pandas DataFrame; each row is one transaction and missing cells
            are ignored.
        Ck: iterable of candidate itemsets (frozensets).
        minSupport: minimum fraction of rows a candidate must be contained in
            to be reported as frequent.

    Returns:
        A tuple (retList, supportData). retList lists the candidates whose
        support is at least minSupport, in reverse order of first occurrence.
        supportData maps every candidate that occurred at least once to its
        support, including those below minSupport.
    """
    # One set of non-missing cell values per row
    transactions = [set(row.dropna().values.ravel()) for _, row in D.iterrows()]

    # Number of transactions containing each candidate
    counts = {}
    for transaction in transactions:
        for candidate in Ck:
            if candidate.issubset(transaction):
                counts[candidate] = counts.get(candidate, 0) + 1

    total = float(len(transactions))
    supportData = {}
    frequent = []
    for candidate, count in counts.items():
        ratio = count / total
        supportData[candidate] = ratio
        if ratio >= minSupport:
            frequent.append(candidate)

    # Most recently discovered candidate first
    frequent.reverse()
    return frequent, supportData

# Creates Ck
def aprioriGenDF(Lk, k):
    """Join frequent (k-1)-itemsets into candidate k-itemsets.

    Two itemsets are merged when their first k-2 items, taken in sorted
    order, are identical.

    The prefix is taken after sorting. Slicing the raw iteration order of a
    frozenset (as the textbook listing does) compares arbitrary items, which
    can drop valid candidates or emit the same candidate more than once; a
    duplicated candidate is then counted twice per transaction by scanDDF().

    Args:
        Lk: list of frozensets, each with k-1 items. Items within a set must
            be mutually orderable.
        k: size of the candidates to create.

    Returns:
        A list of candidate frozensets.
    """
    # Compute each sorted prefix once instead of once per pair.
    prefixes = [sorted(itemset)[:k-2] for itemset in Lk]
    retList = []
    for i, prefix in enumerate(prefixes):
        for j in range(i + 1, len(Lk)):
            if prefix == prefixes[j]:
                retList.append(Lk[i] | Lk[j])
    return retList

# Apriori Algorithm
def aprioriDF(dataSet, minSupport=support):
    """Run Apriori on a DataFrame whose rows are transactions.

    Args:
        dataSet: pandas DataFrame; missing cells are ignored.
        minSupport: minimum support an itemset needs to count as frequent.

    Returns:
        A tuple (L, supportData). L[i] is the list of frequent itemsets with
        i + 1 items; the last entry of L is always an empty list, because the
        search stops at the first size with no frequent itemsets.
        supportData maps every counted candidate to its support.
    """
    singles = createC1DF(dataSet)
    frequent, supportData = scanDDF(dataSet, singles, minSupport)
    L = [frequent]
    # Grow the itemsets by one item per pass until a pass finds nothing.
    while L[-1]:
        size = len(L) + 1
        candidates = aprioriGenDF(L[-1], size)
        frequent, levelSupport = scanDDF(dataSet, candidates, minSupport)
        supportData.update(levelSupport)
        L.append(frequent)
    return L, supportData


# Mine the loaded dataset: itemsets need at least 15% support and rules at
# least 80% confidence. generateRules() prints each accepted rule.
L, suppData = aprioriDF(data, minSupport=0.15)
rules = generateRules(L, suppData, minConf=0.8)
# Uncomment to inspect the raw itemsets and rule tuples:
#print(L)
#print(rules)

frozenset({' abdominal_pain'}) --> frozenset({' vomiting'}) conf: 0.8430232558139534
frozenset({' yellowish_skin'}) --> frozenset({' abdominal_pain'}) conf: 0.8355263157894737
frozenset({' yellowing_of_eyes'}) --> frozenset({' loss_of_appetite'}) conf: 0.9632352941176471
frozenset({' nausea'}) --> frozenset({' vomiting'}) conf: 0.8534031413612565


# Apriori.py 

---

Code from Listing 11.1 & 11.2 of textbook (Machine Learning in Action) 

In [None]:
def loadDataSet():
    """Return a small example dataset: four transactions of integer items.

    A new list of lists is built on every call.
    """
    transactions = (
        (1, 3, 4),
        (2, 3, 5),
        (1, 2, 3, 5),
        (2, 5),
    )
    return [list(transaction) for transaction in transactions]

def createC1(dataSet):
    """Build the size-1 candidate itemsets (C1) from a list of transactions.

    Args:
        dataSet: iterable of transactions, each an iterable of hashable,
            mutually orderable items.

    Returns:
        A list with one single-item frozenset per distinct item, sorted by
        item.
    """
    # A set gives constant-time de-duplication; the previous list membership
    # test rescanned every item collected so far for each new item.
    unique_items = {item for transaction in dataSet for item in transaction}
    return [frozenset([item]) for item in sorted(unique_items)]

def scanD(D, Ck, minSupport):
    """Count how often each candidate itemset occurs in the transactions D.

    Args:
        D: sequence of transactions (sets).
        Ck: iterable of candidate itemsets (frozensets).
        minSupport: minimum fraction of transactions a candidate must be
            contained in to be reported as frequent.

    Returns:
        A tuple (retList, supportData). retList lists the candidates whose
        support is at least minSupport, in reverse order of first occurrence.
        supportData maps every candidate that occurred at least once to its
        support, including those below minSupport.
    """
    # Number of transactions containing each candidate
    counts = {}
    for transaction in D:
        for candidate in Ck:
            if candidate.issubset(transaction):
                counts[candidate] = counts.get(candidate, 0) + 1

    total = float(len(D))
    supportData = {}
    frequent = []
    for candidate, count in counts.items():
        ratio = count / total
        supportData[candidate] = ratio
        if ratio >= minSupport:
            frequent.append(candidate)

    # Most recently discovered candidate first
    frequent.reverse()
    return frequent, supportData

# Creates Ck
def aprioriGen(Lk, k):
    """Join frequent (k-1)-itemsets into candidate k-itemsets.

    Two itemsets are merged when their first k-2 items, taken in sorted
    order, are identical.

    The prefix is taken after sorting. Slicing the raw iteration order of a
    frozenset (as the textbook listing does) compares arbitrary items, which
    can drop valid candidates or emit the same candidate more than once; a
    duplicated candidate is then counted twice per transaction by scanD().

    Args:
        Lk: list of frozensets, each with k-1 items. Items within a set must
            be mutually orderable.
        k: size of the candidates to create.

    Returns:
        A list of candidate frozensets.
    """
    # Compute each sorted prefix once instead of once per pair.
    prefixes = [sorted(itemset)[:k-2] for itemset in Lk]
    retList = []
    for i, prefix in enumerate(prefixes):
        for j in range(i + 1, len(Lk)):
            if prefix == prefixes[j]:
                retList.append(Lk[i] | Lk[j])
    return retList

def apriori(dataSet, minSupport=support):
    """Run Apriori on a list of transactions.

    Args:
        dataSet: list of transactions, each an iterable of items.
        minSupport: minimum support an itemset needs to count as frequent.

    Returns:
        A tuple (L, supportData). L[i] is the list of frequent itemsets with
        i + 1 items; the last entry of L is always an empty list, because the
        search stops at the first size with no frequent itemsets.
        supportData maps every counted candidate to its support.
    """
    singles = createC1(dataSet)
    transactions = [set(transaction) for transaction in dataSet]
    frequent, supportData = scanD(transactions, singles, minSupport)
    L = [frequent]
    # Grow the itemsets by one item per pass until a pass finds nothing.
    while L[-1]:
        size = len(L) + 1
        candidates = aprioriGen(L[-1], size)
        frequent, levelSupport = scanD(transactions, candidates, minSupport)
        supportData.update(levelSupport)
        L.append(frequent)
    return L, supportData

# Textbook Example

---



In [None]:
# Run the example transactions through the list-based implementation with
# 70% minimum support and 70% minimum confidence. This rebinds L, suppData
# and rules; generateRules() prints each accepted rule.
dataSet = loadDataSet()
L, suppData = apriori(dataSet, minSupport=0.7)
rules = generateRules(L, suppData, minConf=0.7)
#print(rules)

frozenset({5}) --> frozenset({2}) conf: 1.0
frozenset({2}) --> frozenset({5}) conf: 1.0
