In [0]:
import pandas as pd
import numpy as np
import re

In [207]:
from google.colab import drive
drive.mount('/gdrive')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [0]:
colleges = pd.read_csv("/gdrive/My Drive/colleges.csv")


In [0]:
colleges = colleges.drop([100,101])

In [210]:
colleges.describe()

Unnamed: 0,Overall Rank,Undergrad. Enrollment,Student/faculty Ratio,Quality Rank,Cost Rank
count,100.0,100.0,100.0,100.0,100.0
mean,50.21,3768.49,9.97,50.5,49.97
std,28.8277,4113.615487,2.384525,29.011492,29.006601
min,1.0,67.0,3.0,1.0,1.0
25%,25.75,1636.0,9.0,25.75,24.75
50%,50.5,2300.0,10.0,50.5,49.5
75%,74.25,4697.75,11.0,75.25,74.25
max,100.0,29379.0,18.0,100.0,100.0


In [0]:
def check_form(pattern, col):
    #this function checks if all the values of col follow same pattern
    #values that dont follow are replaced by nan
    for i in range(len(colleges.index)):
        temp = str(colleges.loc[i,col])
        if(not re.match(pattern, temp)):
            colleges.at[i,col] = pd.np.nan            
            
def check_form_SAT():
    col = "*SAT or ACT"
    for i in range(len(colleges.index)):
        temp = str(colleges.loc[i,col])
        if(not re.match(r"^[0-9]{1,3}/[0-9]{1,3}%$", temp)):
            if(not re.match(r"^[0-9]{1,3}%$", temp)):
                colleges.at[i,col] = pd.np.nan
            else:
                colleges.at[i,col] = f"{temp[:-1]}/{temp}"

  

In [0]:
check_form(r"^[0-9]{1,3}%$","Admission Rate")
check_form(r"^[0-9]{1,3}%$","4-year Grad. Rate")
check_form(r"^[0-9]{1,3}%$","6-year Grad. Rate")
check_form(r"^[0-9]{1,3}%$","Aid From Grants")
check_form(r"^[0-9]{1,3}%$","Need Met")
check_form(r"^[0-9]{1,3}%$","Non-Need-Based Aid+")
check_form_SAT()


In [0]:
#removing missing values 'nan'
def remove_missing():
    global colleges
    remove = []
    for i in range(len(colleges.index)):
        if colleges.loc[i].isnull().values.any():
            remove.append(i)
    colleges = colleges.drop(remove)
    

In [0]:
remove_missing()

In [0]:
def preprocess_cost(col):
    for i in colleges.index:
        temp = str(colleges.loc[i, col])
        temp = temp[1:]
        temp = "".join(temp.split(','))
        temp = int(temp)
        colleges.at[i,col] = temp
    colleges[col] = pd.to_numeric(colleges[col])

def preprocess_rate(col):
    for i in colleges.index:
        temp = str(colleges.loc[i][col])
        temp = int(temp[:-1])
        colleges.at[i,col] = temp
    colleges[col] = pd.to_numeric(colleges[col])
        


In [0]:
preprocess_cost("Total Costs")
preprocess_cost("Average Debt")
preprocess_cost("Cost After Need-based Aid")
preprocess_cost("Cost After Non-Need-Based Aid")

In [0]:
preprocess_rate("Admission Rate")
preprocess_rate("4-year Grad. Rate")
preprocess_rate("6-year Grad. Rate")
preprocess_rate("Aid From Grants")
preprocess_rate("Need Met")
preprocess_rate("Non-Need-Based Aid+")


In [218]:
colleges.describe()

Unnamed: 0,Overall Rank,Undergrad. Enrollment,Admission Rate,Student/faculty Ratio,4-year Grad. Rate,6-year Grad. Rate,Quality Rank,Total Costs,Cost After Need-based Aid,Need Met,Aid From Grants,Cost After Non-Need-Based Aid,Non-Need-Based Aid+,Average Debt,Cost Rank
count,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0
mean,56.638889,3771.375,47.972222,10.430556,69.319444,80.388889,59.319444,33859.388889,18988.013889,92.458333,74.680556,25186.805556,32.263889,15342.541667,43.041667
std,27.204652,4503.418768,17.698549,2.413731,16.091137,6.855769,25.087917,6396.296309,4005.97993,13.917197,11.292309,6118.905595,25.205629,6408.606669,28.238141
min,1.0,67.0,14.0,3.0,0.0,67.0,4.0,8079.0,5579.0,20.0,27.0,6523.0,1.0,0.0,1.0
25%,35.75,1625.5,34.0,9.0,63.0,75.0,40.75,30865.5,17231.25,90.75,70.0,21707.25,11.0,13908.0,18.75
50%,59.5,2279.0,44.5,11.0,72.0,81.0,60.5,36176.0,19011.0,98.5,76.0,25071.5,25.5,15679.5,39.5
75%,77.25,4257.75,62.75,11.25,79.0,85.25,79.25,38447.25,21104.75,100.0,80.25,28830.25,48.5,18903.25,69.25
max,100.0,29379.0,87.0,18.0,89.0,95.0,100.0,40240.0,28677.0,100.0,100.0,37663.0,100.0,28217.0,100.0


In [0]:
#Q1 normalizing attributes in range(0,1)
def normalize_rate(col):
    global colleges
    for i in colleges.index:
        colleges.loc[i,col] = colleges.loc[i,col]/100

        
def normalize_cost(col,new_min,new_max):
    global colleges
    old_max = colleges[col].max()
    old_min = colleges[col].min()
    old_range = old_max-old_min
    new_range = new_max-new_min
    for i in colleges.index:
        colleges.loc[i,col] = ((colleges.loc[i,col]-old_min)*new_range/old_range
                               + new_min)

def normalize_SAT():
    global colleges
    col = "*SAT or ACT"
    #value are of the form 70/80%
    for i in colleges.index:
        s1,s2 = colleges.loc[i,col].split('/')
        s1,s2 = float(s1)/100,float(s2[:-1])/100
        colleges.loc[i,col] = f"{s1}/{s2}"

In [0]:
normalize_SAT()
normalize_rate("Admission Rate")
normalize_rate("4-year Grad. Rate")
normalize_rate("6-year Grad. Rate")
normalize_rate("Aid From Grants")
normalize_rate("Need Met")
normalize_rate("Non-Need-Based Aid+")

In [0]:
normalize_cost("Student/faculty Ratio",0,1)
normalize_cost("Total Costs",0,1)
normalize_cost("Average Debt",0,1)
normalize_cost("Cost After Need-based Aid",0,1)
normalize_cost("Cost After Non-Need-Based Aid",0,1)

In [222]:
colleges.describe()

Unnamed: 0,Overall Rank,Undergrad. Enrollment,Admission Rate,Student/faculty Ratio,4-year Grad. Rate,6-year Grad. Rate,Quality Rank,Total Costs,Cost After Need-based Aid,Need Met,Aid From Grants,Cost After Non-Need-Based Aid,Non-Need-Based Aid+,Average Debt,Cost Rank
count,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0
mean,56.638889,3771.375,0.479722,0.49537,0.693194,0.803889,59.319444,0.801604,0.580527,0.924583,0.746806,0.599351,0.322639,0.543734,43.041667
std,27.204652,4503.418768,0.176985,0.160915,0.160911,0.068558,25.087917,0.198884,0.173434,0.139172,0.112923,0.196497,0.252056,0.227119,28.238141
min,1.0,67.0,0.14,0.0,0.0,0.67,4.0,0.0,0.0,0.2,0.27,0.0,0.01,0.0,1.0
25%,35.75,1625.5,0.34,0.4,0.63,0.75,40.75,0.708513,0.50447,0.9075,0.7,0.487612,0.11,0.492894,18.75
50%,59.5,2279.0,0.445,0.533333,0.72,0.81,60.5,0.873636,0.581522,0.985,0.76,0.595649,0.255,0.555676,39.5
75%,77.25,4257.75,0.6275,0.55,0.79,0.8525,79.25,0.944257,0.672169,1.0,0.8025,0.716354,0.485,0.669924,69.25
max,100.0,29379.0,0.87,1.0,0.89,0.95,100.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,100.0


In [0]:
#discretization
#attributes = ['Student/faculty Ratio', '6-year Grad. Rate', '4-year Grad. Rate',
#              'Need Met','Admission Rate']
def get_range(val):
    if   0<=val<0.1:
        return "0-0.1"
    elif 0.1<=val<0.2:
        return "0.1-0.2"
    elif 0.2<=val<0.3:
        return "0.2-0.3"
    elif 0.3<=val<0.4:
        return "0.3-0.4"
    elif 0.4<=val<0.5:
        return "0.4-0.5"
    elif 0.5<=val<0.6:
        return "0.5-0.6"
    elif 0.6<=val<0.7:
        return "0.6-0.7"
    elif 0.7<=val<0.8:
        return "0.7-0.8"
    elif 0.8<=val<0.9:
        return "0.8-0.9"
    elif 0.9<=val<=1:
        return "0.9-1"
    
def discretize_attr(attribute):
    #discretize numeric attributes
    new_attribute =[]
    for val in colleges.loc[:,attribute]:
        new_attribute.append(get_range(val))
    new_attribute = pd.Series(new_attribute,index=colleges.index)
    colleges.loc[:,attribute] = new_attribute
    
    


In [0]:
attributes = ['Student/faculty Ratio', '6-year Grad. Rate', '4-year Grad. Rate',
              'Need Met','Admission Rate']

for attribute in attributes:
    discretize_attr(attribute)


In [225]:
for attribute in attributes:
    print(set(colleges[attribute].values))

{'0.7-0.8', '0-0.1', '0.2-0.3', '0.9-1', '0.3-0.4', '0.4-0.5', '0.8-0.9', '0.6-0.7', '0.5-0.6', '0.1-0.2'}
{'0.7-0.8', '0.9-1', '0.6-0.7', '0.8-0.9'}
{'0.7-0.8', '0-0.1', '0.3-0.4', '0.4-0.5', '0.8-0.9', '0.6-0.7', '0.5-0.6'}
{'0.7-0.8', '0.2-0.3', '0.9-1', '0.4-0.5', '0.8-0.9', '0.6-0.7'}
{'0.7-0.8', '0.2-0.3', '0.3-0.4', '0.4-0.5', '0.8-0.9', '0.6-0.7', '0.5-0.6', '0.1-0.2'}


In [0]:
#Q2
from math import log


labels = set((colleges['6-year Grad. Rate'].values))
n = len(colleges.index)

def P(si,n):
    if(si==0): return 0
    else: return si/n*log(si/n,2)
    
def entropy(cur_attribute, labels):
    values_cur_attr = set(colleges[cur_attribute].values)    
    S = [[len(colleges[ (colleges['6-year Grad. Rate']==label) & (colleges[cur_attribute]==val)].index)
         for label in labels] for val in values_cur_attr]    
    info = [-sum([si/sum(s)*P(si,sum(s)) for si in s]) for s in S]
    E = sum([sum(s)/n*i for i,s in zip(info,S)])
    return E
    

            
def information_gain_analysis(attributes,labels):
    s = [len(colleges[colleges['6-year Grad. Rate']==label].index) for label in labels]
    info = -sum([P(si,n) for si in s])
    print(f"information:{info}")
    entropies = [entropy(cur_attribute,labels) for cur_attribute in attributes]
    print(f"{10*'#'}Entropies{10*'#'}")
    print(*[f"{cur_attribute}: {e}" for cur_attribute,e
           in zip(attributes,entropies)], sep='\n')
    information_gain = [info - e for e in entropies]
    print(f"{10*'#'}Information gain{10*'#'}")
    print(*[f"{cur_attribute}:{i}" for cur_attribute,i
            in zip(attributes, information_gain)],sep='\n')




In [0]:
attributes = ['Student/faculty Ratio', '4-year Grad. Rate',
              'Need Met','Admission Rate']

In [228]:
#taking '6-year Grad Rate' as class labels
information_gain_analysis(attributes, labels)


information:1.5641876427642254
##########Entropies##########
Student/faculty Ratio: 0.4346867024813673
4-year Grad. Rate: 0.3745826154579989
Need Met: 0.4471344296710411
Admission Rate: 0.37288575984706507
##########Information gain##########
Student/faculty Ratio:1.129500940282858
4-year Grad. Rate:1.1896050273062264
Need Met:1.1170532130931843
Admission Rate:1.1913018829171604
