In [None]:
import json
# Folder holding the competition data files. The path is relative, so it is
# resolved against the working directory the notebook kernel was started in.
compFolder = '../DSS_2021/DataScienceSummit2021_Competition/dss_theater_datacomp_2021/'

In [None]:
# Open the formatted training data. The encoding is given explicitly because
# the default used by open() depends on the platform locale, while JSON
# interchange files are UTF-8.
jsonFile = open(compFolder+'formattedTrainingData.json', encoding='utf-8')

In [None]:
# Parse the whole file into memory. The handle is closed in a finally clause
# so it is released even when the JSON is malformed and json.load raises.
try:
    trainingData = json.load(jsonFile)
finally:
    jsonFile.close()

In [None]:
# Peek at the first ten record ids (iterating a dict yields its keys).
print(list(trainingData)[:10])

In [None]:
# Show one full record to inspect its fields. The id is hard-coded, so this
# cell raises KeyError if that id is not present in the loaded file.
print(trainingData['-1731829955'])

In [None]:
from math import log,floor

In [None]:
#Function to calculate the entropy for a given data set
def getEntropy(data):
    """Return the Shannon entropy (base 2) of the target class distribution.

    Parameters
    ----------
    data : dict
        Mapping of record id -> record dict. Every record must contain the
        'TARGET_PANDEMIC_THTR_1' key holding its class label.

    Returns
    -------
    number
        Entropy in bits. 0 for an empty data set or one with a single class.
    """
    total = len(data)
    #An empty data set carries no uncertainty (and would divide by zero below)
    if total == 0:
        return 0
    #Count the instances of each class that actually occurs. Using a dict
    #means absent classes never contribute a log(0) term and labels are not
    #restricted to the integers 0 and 1.
    counts = {}
    for user in data:
        label = data[user]['TARGET_PANDEMIC_THTR_1']
        counts[label] = counts.get(label, 0) + 1
    #Summation of probability times log base 2 of probability for each class
    entropy = 0
    for count in counts.values():
        prob = count/total
        entropy -= prob*log(prob,2)
    return entropy

In [None]:
#Function to count the classes for each known value of each attribute
#Returns detailed information in the following form
#{attribute1: [[value1,value2,...], --known values in first-seen order
#              {value1: [[class1,'total',class2,...], --labels seen for value1
#                        {class1: count,'total': count,class2: count,...}],
#               value2: ...}],
# attribute2: ...}
def getOutcomeData(data):
    """Tally target classes per (attribute, value) pair.

    `data` maps record id -> record dict. Every key of a record is treated as
    an attribute, including 'TARGET_PANDEMIC_THTR_1' itself. For each value an
    attribute takes, the result stores the class labels seen with that value,
    a count per label, and a running 'total' of records with that value.
    """
    outcomes = {}
    for record in data.values():
        for attribute, value in record.items():
            #Looked up per attribute so a record with no keys is simply skipped
            label = record['TARGET_PANDEMIC_THTR_1']
            knownValues, perValue = outcomes.setdefault(attribute, [[], {}])
            if value not in knownValues:
                knownValues.append(value)
                perValue[value] = [[], {}]
            seenLabels, tallies = perValue[value]
            #Bump the label's counter first, then the 'total' counter, so the
            #label list keeps the order in which entries first appeared
            for bucket in (label, 'total'):
                if bucket in seenLabels:
                    tallies[bucket] += 1
                else:
                    seenLabels.append(bucket)
                    tallies[bucket] = 1
    return outcomes

In [None]:
#Calculates information gain for each attribute
def getInformationGain(outcomes, overallEntropy, total=None):
    """Return {attribute: information gain} for every attribute in `outcomes`.

    Parameters
    ----------
    outcomes : dict
        Structure produced by getOutcomeData:
        {attribute: [[values...], {value: [[labels...,'total'], {label: n, 'total': n}]}]}.
    overallEntropy : number
        Entropy of the data set that `outcomes` was computed from.
    total : int, optional
        Number of samples in that data set. When omitted it is derived per
        attribute by summing the 'total' counters of its values, so the
        weights always match the data the outcomes describe rather than the
        size of some global data set.

    Returns
    -------
    dict
        Gain per attribute, in the iteration order of `outcomes`. The target
        attribute 'TARGET_PANDEMIC_THTR_1' is always assigned a gain of 0.
    """
    gains = {}
    for attribute, (knownValues, perValue) in outcomes.items():
        #The target trivially predicts itself, so it is pinned to zero gain
        if(attribute == 'TARGET_PANDEMIC_THTR_1'):
            gains[attribute] = 0
            continue
        if total is None:
            sampleCount = sum(perValue[category][1]['total'] for category in knownValues)
        else:
            sampleCount = total
        classEntropy = 0
        for category in knownValues:
            labels, tallies = perValue[category]
            categoryTotal = tallies['total']
            categoryEntropy = 0
            for classification in labels:
                if(classification != 'total'):
                    prob = tallies[classification]/categoryTotal
                    categoryEntropy -= prob*log(prob,2)
            #Weight each value's entropy by the share of samples that have it
            if sampleCount:
                classEntropy += (categoryTotal/sampleCount)*categoryEntropy
        gains[attribute] = overallEntropy-classEntropy
    return gains

In [None]:
#Builds the data set for one branch: the rows whose best attribute equals the
#current known value, with that attribute's column stripped
def getCleanData(originalData, outcomeData, bestAttribute, currentKnownValue):
    """Return (subset, [isPure, pureClass]) for one branch of the tree.

    The branch counts as pure when the label list recorded in `outcomeData`
    for this attribute value has exactly two entries (one class plus the
    'total' marker). For a pure branch `subset` stays empty and `pureClass`
    is the target label of the matching rows. Otherwise `subset` maps each
    matching row id to a copy of the row without `bestAttribute`, and
    `pureClass` stays False.
    """
    labelsForValue = outcomeData[bestAttribute][1][currentKnownValue][0]
    isPure = len(labelsForValue) == 2
    pureClass = False
    subset = {}
    for rowId, row in originalData.items():
        if row[bestAttribute] == currentKnownValue:
            if isPure:
                #All matching rows share one class; remember it
                pureClass = row['TARGET_PANDEMIC_THTR_1']
            else:
                subset[rowId] = {
                    key: value for key, value in row.items() if key != bestAttribute
                }
    return (subset, [isPure, pureClass])

In [None]:
#Recursive function to create a decision tree
def getDecisionTreeLayer(dataLayer):
    """Build a decision tree (ID3-style) from `dataLayer`.

    Parameters
    ----------
    dataLayer : dict
        Mapping of record id -> record dict. Every record must contain the
        'TARGET_PANDEMIC_THTR_1' class label; all other keys are attributes.

    Returns
    -------
    A class label when the node is a leaf, otherwise a tuple
    (attribute, branches) where `branches` maps each value of the attribute
    seen in `dataLayer` to a sub-tree or label, plus a None key holding the
    majority class as a fallback for values not seen during training.

    Raises
    ------
    ValueError
        If `dataLayer` is empty.
    """
    targetKey = 'TARGET_PANDEMIC_THTR_1'
    if len(dataLayer) == 0:
        raise ValueError('getDecisionTreeLayer requires at least one sample')
    #Count the samples of each class, in first-seen order
    classes = []
    counts = []
    for data in dataLayer:
        label = dataLayer[data][targetKey]
        if(label not in classes):
            classes.append(label)
            counts.append(1)
        else:
            counts[classes.index(label)] += 1
    #If samples are all of the same class then return that class as a node
    if(len(classes) == 1):
        return classes[0]
    majorityClass = classes[counts.index(max(counts))]
    #The target column is not a usable attribute: splitting on it would leak
    #the label into the tree, so it is excluded from the candidates
    firstRow = dataLayer[next(iter(dataLayer))]
    attributes = [key for key in firstRow if key != targetKey]
    #If attribute list is empty, return the majority class
    if(len(attributes) == 0):
        return majorityClass
    entropy = getEntropy(dataLayer)
    outcomeData = getOutcomeData(dataLayer)
    informationGain = getInformationGain(outcomeData, entropy)
    #select test attribute, the non-target attribute with the most information
    #gain; max() keeps the first one encountered when several tie
    candidateGains = {attribute: gain for attribute, gain in informationGain.items()
                      if attribute != targetKey}
    bestAttribute = max(candidateGains, key=candidateGains.get)
    treeLayer = (bestAttribute, {})
    #for each known value of test attribute create a branch
    for knownValue in outcomeData[bestAttribute][0]:
        if(knownValue != 'total'):
            #let the next data set be all rows where the current best attribute
            #has the current known value and remove the current best attribute
            cleanData = getCleanData(dataLayer,outcomeData,bestAttribute,knownValue)
            #if the resulting data set has only one class return that as a node
            if cleanData[1][0]:
                treeLayer[1].update({knownValue:cleanData[1][1]})
            #else return node generated by recursive call on new data set
            else:
                treeLayer[1].update({knownValue:getDecisionTreeLayer(cleanData[0])})
    #Fallback branch used at prediction time for values never seen in training
    treeLayer[1].update({None:majorityClass})
    return treeLayer

In [None]:
def testDecisionTree(tree, data):
    if(tree == 0 or tree == 1):
        return tree
    else:
        if data[tree[0]] in tree[1].keys():
            return testDecisionTree(tree[1][data[tree[0]]], data)
        else:
            return testDecisionTree(tree[1][None], data)

In [None]:
from random import random,seed

In [None]:
#Hold out `testingSize` randomly chosen records, train the tree on the rest
#and report accuracy on the held-out records.
testingSize = 10000
seed(0)
#Materialise the key list once; rebuilding it on every iteration made the
#original loops quadratic in the number of records
allKeys = list(trainingData.keys())
if testingSize > len(allKeys):
    #Without this check the sampling loop below could never finish
    raise ValueError('testingSize exceeds the number of available records')
testingIndeces = []
testingKeySet = set()  #same ids as testingIndeces, for constant-time lookups
while(len(testingIndeces) < testingSize):
    #One random() draw per iteration, so the same seed gives the same split
    candidateKey = allKeys[floor(len(allKeys)*random())]
    if candidateKey not in testingKeySet:
        testingKeySet.add(candidateKey)
        testingIndeces.append(candidateKey)
#Partition in the original key order so predictions line up with testing_data
training_data = {}
testing_data = {}
for key in allKeys:
    if key in testingKeySet:
        testing_data[key] = trainingData[key]
    else:
        training_data[key] = trainingData[key]
decisionTree = getDecisionTreeLayer(training_data)
treeClassifications = [testDecisionTree(decisionTree, row) for row in testing_data.values()]
numCorrect = 0
for row, predicted in zip(testing_data.values(), treeClassifications):
    if row['TARGET_PANDEMIC_THTR_1'] == predicted:
        numCorrect += 1
print("Accuracy: ", (numCorrect/testingSize)*100, '%')

In [None]:
# Show the branch mapping of the root node: for an internal node the tree is an
# (attribute, branches) tuple, so index 1 is the dict of branches. If training
# returned a bare class label instead, this indexing fails.
print(decisionTree[1])