In [52]:
import numpy as np
import pandas as pd
import re
import ast
import math

In [38]:
#path = input('Enter path to master dataset')
path = 'UpdatedMaster19Feb_NLP.csv'
master = pd.read_csv (path) 

In [39]:
master.columns

Index(['Unnamed: 0', 'USN', '10thPercentage', '12thPercentage', 'Branch',
       'CGPA', 'CourseName', 'CourseGrade', 'CompanyName_New', 'CTC',
       'TierLevel', 'CoCurricularActivities', 'EmploymentType',
       'EventsParticipated', 'GeneralSkills', 'Languages', 'MinorAttended',
       'NoofInternships', 'InternshipProjectDomain', 'InternshipCompany',
       'NoofProjects', 'ProjectDetailDomain', 'ProgLanguages', 'Publication',
       'ResearchDomain', 'ScholarshipsKey', 'SoftwareTools',
       'VolunteeringWork', 'WorkshopsOrg', 'WorkshopsDomain', 'AwardsNLPkey',
       'ExternalCertificatesKey', 'ExternalCertificatesDomain'],
      dtype='object')

In [40]:
masterTierCGPA = master[["TierLevel","CGPA"]]

In [41]:
masterTierCGPA.isnull().sum(axis = 0)

TierLevel    49
CGPA          1
dtype: int64

In [42]:
masterTierCGPA.agg(lambda x: x.eq(0).sum())


TierLevel    0
CGPA         0
dtype: int64

In [43]:
masterTierCGPA = masterTierCGPA.dropna()
masterTierCGPA.isnull().sum(axis = 0)

TierLevel    0
CGPA         0
dtype: int64

In [44]:
masterTierCGPA['TierLevel'].dtypes
masterTierCGPA['CGPA'].dtypes

dtype('float64')

In [45]:
def masterDataBinSplitonTier( data, tierVal):
    dataset = data.copy()
    #create a new column for splitting as binary values, if it is the given tier, then will be labelled 1 , else 0
    dataset['TierBinSplitVal'] = (dataset.TierLevel == float(tierVal)).map({True:1 , False: 0})
    return dataset
    

In [50]:
def masterDataBinSplitonCGPA( data, CGPA):
    dataset = data.copy()
    #create a new col for CGPA interval, if it is greater than or equal to given cgpa then we label as 1, else 0
    dataset['CGPABinSplitVal'] = (dataset.CGPA >= float(CGPA)).map({True:1 , False: 0})
    return dataset

In [21]:
def calc_entropy(column):
    """
    Calculate entropy given a pandas series, list, or numpy array.
    """
    # Compute the counts of each unique value in the column
    counts = np.bincount(column)
    # Divide by the total column length to get a probability
    probabilities = counts / len(column)
    
    # Initialize the entropy to 0
    entropy = 0
    # Loop through the probabilities, and add each one to the total entropy
    for prob in probabilities:
        if prob > 0:
            # use log from math and set base to 2
            entropy += prob * math.log(prob, 2)
    
    return -entropy

In [22]:
def calc_information_gain(data, split_name, target_name):
    """
    Calculate information gain given a data set, column to split on, and target
    """
    # Calculate the original entropy
    original_entropy = calc_entropy(data[target_name])
    
    #Find the unique values in the column
    values = data[split_name].unique()
    
    
    # Make two subsets of the data, based on the unique values
    left_split = data[data[split_name] == values[0]]
    right_split = data[data[split_name] == values[1]]
    
    # Loop through the splits and calculate the subset entropies
    to_subtract = 0
    for subset in [left_split, right_split]:
        prob = (subset.shape[0] / data.shape[0]) 
        to_subtract += prob * calc_entropy(subset[target_name])
    
    # Return information gain
    return original_entropy - to_subtract

In [47]:
#split dataset on tier, n this case trying Tier1
masterTierCGPABin = masterDataBinSplitonTier(masterTierCGPA, 1)

In [49]:
masterTierCGPABin

Unnamed: 0,TierLevel,CGPA,TierBinSplitVal
0,1.0,9.68,1
1,2.0,8.83,0
2,1.0,8.83,1
4,1.0,8.95,1
5,1.0,8.78,1
7,1.0,9.89,1
8,1.0,9.39,1
9,1.0,9.04,1
11,2.0,7.71,0
12,1.0,9.90,1


In [56]:
# for CGPA ranging from 7 to 10, we check for each whole number what the info gain is when we split the dataset as <cgpa and >=cgpa
for i in range(7,10):
    #create a temporary dataset with binary value indicating the tier split
      print(i)
    tempData = masterDataBinSplitonCGPA(masterTierCGPABin, i)
    print(calc_information_gain(tempData, 'CGPABinSplitVal', 'TierBinSplitVal'))

7
0.011114368274463104
8
0.1339604405199133
9
0.14220782440954216


In [63]:
#Now repeat the above process for CGPA by trying for each value differing by 0.1,
#parallely see which threshold CGPA gives the max info gain(min entropy) and update the same
#store the information gains for each CGPA threshold in infGainMap
i = 7.0
infGainMap = {}
maxInfGain = [-1,-1]
while(i<9.9):
    print(i)
    i = round(i,1)
    tempData = masterDataBinSplitonCGPA(masterTierCGPABin, i)
    infGain = calc_information_gain(tempData, 'CGPABinSplitVal', 'TierBinSplitVal')
    infGainMap[i] = infGain
    if(maxInfGain[1]< infGain):
        maxInfGain[0] = i
        maxInfGain[1] = infGain
    print(infGain)
    print("\n")
    i= i +0.1
    
print("\n\nMaximum information gain is found for :" + str(maxInfGain[0]) + " with value " + str(maxInfGain[1]))


7.0
0.011114368274463104


7.1
0.018377071727758487


7.199999999999999
0.034388434659378464


7.3
0.03399235708563453


7.3999999999999995
0.0545481095476138


7.5
0.06325447978613852


7.6
0.06311654553393098


7.699999999999999
0.07640029636920798


7.8
0.12943731463083086


7.8999999999999995
0.14525975199298435


8.0
0.1339604405199133


8.1
0.14902904584861165


8.2
0.14276250849360517


8.299999999999999
0.18110311431302528


8.4
0.19438276422912926


8.5
0.1806777931762008


8.6
0.18042452762872885


8.7
0.14038561308279807


8.799999999999999
0.1584908298688894


8.9
0.16162326235449065


9.0
0.14220782440954216


9.1
0.12286328601241847


9.2
0.10044223872293934


9.299999999999999
0.11197991794967344


9.4
0.08876197035576905


9.5
0.060155531197220946


9.6
0.04814476807328705


9.7
0.0310048698541997


9.799999999999999
0.020815889695888545




Maximum information gain is found for :8.4 with value 0.19438276422912926
