In [10]:
"""
Mapping moods and other factors to themes using identification trees, by calculating the
disorder and quality of a test.
Tests for this particular dataset include age, gender, mood, weather, timestamp, etc.
"""

import math

# Sample data.
# Column names: the tests used for generating identification trees, with the
# output column ("themes") as the last entry.
titles = ['age', 'gender', 'mood', 'themes']

# One row per training example: the abstract answer to each of the tests
# above, gathered from different resources, in the same order as titles.
data = [
    ['forty', 'male', 'happy', 'activity'],
    ['forty', 'female', 'happy', 'food'],
    ['thirty', 'male', 'sad', 'food'],
]

In [13]:
def only_feat(titles):
    """
    Return only the test names (such as age, gender, mood), leaving out the
    final output column (themes) that has to be given by the identification
    tree.

    The output column is taken to be the last entry of titles. A new list is
    returned; titles itself is not modified. An empty or single-entry titles
    yields an empty list.
    """
    # Drop the last entry by position. Comparing every name against
    # titles[len(titles)-2] removed the second-to-last name ("mood") and left
    # "themes" in the list of tests.
    return list(titles[:-1])


In [14]:
def index(input,titles):
    """
    Look up the column number of a test so that its values can be read from
    each row of the dataset.

    Returns the position of the first entry of titles equal to input. When no
    entry matches, 0 is returned (the same value as a match in the first
    column).
    """
    for position, name in enumerate(titles):
        if input == name:
            return position
    # No test with that name: fall back to column 0.
    return 0

In [15]:
def feat_entries(feat,given_data,titles):
    """
    Collect every value that the dataset holds for one test.

    The column of the test is found with index(feat, titles); the result lists
    that column's value for each row of given_data, in row order.
    """
    column = index(feat, titles)
    return [record[column] for record in given_data]

In [17]:
def unique_val(feat,given_data,titles):
    """
    List the distinct values that a test takes in the given dataset.

    Values are returned in the order in which they first appear in
    given_data; duplicates are dropped.
    """
    seen = []
    for answer in feat_entries(feat, given_data, titles):
        # keep only the first occurrence of every value
        if answer not in seen:
            seen.append(answer)
    return seen

In [18]:
def modify_data(feat,value,given_data,titles):
    """
    Restrict the dataset to the rows in which a given test has a given value.

    Returns a new list holding (references to) the matching rows of
    given_data, in their original order; given_data is left unchanged.
    """
    column = index(feat, titles)
    return [record for record in given_data if record[column] == value]

In [20]:
def quality(given_data,feat,titles):
    """
    Measure how good a test is for splitting the dataset into branches.

    The dataset is split by each unique value of the test. For every branch
    the entropy of its class labels (the last entry of each row) is computed,
    weighted by the fraction of rows that fall into the branch, and the
    weighted entropies are added up.

    Returns the resulting disorder as a float: 0.0 when every branch is
    homogeneous, larger values for more mixed branches.
    """
    column = index(feat, titles)  # same column for every branch
    total_rows = len(given_data)  # number of training examples
    disorder = 0.0

    for branch_value in unique_val(feat, given_data, titles):
        # class labels of the rows that belong to this branch
        labels = [row[-1] for row in given_data if row[column] == branch_value]
        branch_size = len(labels)  # number of rows in the branch

        # distinct class labels, in order of first appearance
        classes = []
        for label in labels:
            if label not in classes:
                classes.append(label)

        entropy = 0.0
        for cls in classes:
            # number of rows in the branch that carry this class label
            hits = float(sum(1 for label in labels if label == cls))
            share = hits / branch_size
            # entropy term: -p * log2(p)
            entropy += (-share) * math.log(share, 2)

        # weight the branch entropy by the share of rows in the branch
        disorder += (branch_size / float(total_rows)) * entropy

    return disorder

In [21]:
def choose_feat(titles,given_data):
    """
    Pick the test with the minimum disorder, i.e. the one that splits the
    dataset into the most homogeneous branches.

    quality() is evaluated for every entry of titles; when several tests share
    the lowest disorder, the one that comes first in titles is returned.
    Raises ValueError (from min) when titles is empty.
    """
    # disorder of every test, in the same order as titles
    scores = [quality(given_data, candidate, titles) for candidate in titles]
    best_position = scores.index(min(scores))
    return titles[best_position]

In [22]:
def create_itree(given_data,titles):
    """
    Recursive function that builds the identification tree by branching the
    dataset until every branch is homogeneous.

    The test chosen by choose_feat becomes the root of the (sub)tree and the
    rows are split by each unique value of that test. A branch whose rows all
    share one theme becomes a leaf holding that theme; any other branch is
    expanded by a recursive call on the rows of that branch.

    If a split does not shrink the dataset (every row has the same value for
    the chosen test, e.g. rows with identical answers but different themes),
    recursing again would never terminate. Such a branch becomes a leaf
    holding the most frequent theme; on a tie the theme seen first wins.

    Returns a nested dict of the form {test: {value: theme_or_subtree}}.
    """
    header=only_feat(titles)#list containing all the tests

    #the test with the least disorder is used for branching
    best=choose_feat(header,given_data)

    tree={best:{}}#maps each value of the chosen test to a theme or a subtree

    #the loop iterates through the unique values of the chosen test
    for val in unique_val(best,given_data,titles):

        #rows of the branch for this value
        data_mod=modify_data(best,val,given_data,titles)

        #themes of all rows in the branch (column looked up by the name "themes")
        total_themes=feat_entries("themes",data_mod,titles)

        if total_themes.count(total_themes[0])==len(total_themes):
            #homogeneous branch: store the theme as a leaf
            tree[best][val]=total_themes[0]
        elif len(data_mod)==len(given_data):
            #the split made no progress, so a recursive call would receive the
            #same rows again and recurse forever; settle for the majority theme
            tree[best][val]=max(total_themes,key=total_themes.count)
        else:
            #mixed branch with strictly fewer rows: build a subtree for it
            tree[best][val]=create_itree(data_mod,titles)

    return tree

# Build the identification tree for the sample dataset and print it.
print(create_itree(data,titles))

# The dict literal below appears to be the pasted output of the print call above
# (notebook cell output); as a bare expression statement it has no effect.
{'age': {'forty': {'gender': {'male': 'activity', 'female': 'food'}}, 'thirty': 'food'}}
