In [14]:
import pandas as pd
from numpy import arange
import math 
import matplotlib.pyplot as plt
import numpy as np

TARGET_FILE = 'final.csv'

file = pd.read_csv(TARGET_FILE, encoding = "ISO-8859-1")

test_data = file.loc[file["VCE Median Study Score"] >= 0]

test_data = test_data.loc[test_data["Median Income in Postcode"] >= 0]

In [15]:
import scipy.stats as stats
from scipy.stats import chi2_contingency


# assign thresholds manually based on domain knowledge
# 4 bins: 1st bin [-inf, 40000) "Low", 2nd bin [40000, 50000) "Medium", 3rd bin [50000, 60000) "High", 4th bin [60000, inf) "Sup"
MedianIncome = []
for x in list(test_data["Median Income in Postcode"]):
    if (x < 40000):
        MedianIncome.append(1)
    elif (x >= 40000 and x < 50000):
        MedianIncome.append(2)  
    elif (x >= 50000 and x < 60000):
        MedianIncome.append(3)
    elif (x >= 60000):
        MedianIncome.append(4)
MedianIncome.sort()


AverageIncome = []
for j in list(test_data["Average Income in Postcode"]):
    if (j < 40000):
        AverageIncome.append(1)
    elif (j >= 40000 and j < 50000):
        AverageIncome.append(2)  
    elif (j >= 50000 and j < 60000):
        AverageIncome.append(3)
    elif (j >= 60000):
        AverageIncome.append(4)
AverageIncome.sort()


# equal lenght binning
# 4 bins: 1st bin [20, 25) Low, 2nd bin [25, 30) Medium, 3rd bin [30,35) High, 4th bin [35, 40) Superior
MedianScore = []
for k in list(test_data["VCE Median Study Score"]):
    if (k >= 20 and k < 25):
        MedianScore.append(1)
    elif (k >= 25 and k < 30):
        MedianScore.append(2)
    elif (k >= 30 and k < 35):
        MedianScore.append(3)
    elif (k >= 35 and k < 40):
        MedianScore.append(4)
MedianScore.sort()


# equal lenght binning
# 4 bins: 1st bin [20, 25) Low, 2nd bin [25, 30) Medium, 3rd bin [30,35) High, 4th bin [35, 40) Superior
Score40 = []
for i in list(test_data["Percentage of study scores of 40 and over"]):
    if (i >= 0 and i < 10):
        Score40.append(1)
    elif (i >= 10 and i < 20):
        Score40.append(2)
    elif (i >= 20 and i < 30):
        Score40.append(3)
    elif (i >= 30 and i < 40):
        Score40.append(4)
Score40.sort()


median_average_medScore = []
median_average_40Score = []
for index in range(len(MedianIncome)):
    median_average_medScore.append([MedianIncome[index], AverageIncome[index], MedianScore[index]])
    median_average_40Score.append([MedianIncome[index], AverageIncome[index], Score40[index]])
        

In [16]:
data = pd.DataFrame(np.array(median_average_40Score), columns=['Median Income','Average Income','Percentage of Study Scores 40 and Over'])
features=data[['Median Income','Average Income']]
class_label = data['Percentage of Study Scores 40 and Over']
print("Feature Selection on IV: Median Income & Average Income, DV: Percentage of Study Scores 40 and Over.\n4 Rows & 4 Cols for the Contingency tables\n")
print("With scientifically significant value of 0.05, Degree of Freedom: 9\n")
for feature in ['Median Income','Average Income'] :
    cont_table = pd.crosstab(class_label,features[feature])
    chi2_val, p, dof, expected = stats.chi2_contingency(cont_table.values, correction=False)
    print('Chi2 value for feature', feature,': ',chi2_val)
    if(p < 0.005) : 
        print('Null hypothesis rejected for feature', feature, '-> p value:', p, '\n')
    else :
        print('Null hypothesis accepted for feature', feature, '-> p value:', p, '\n')

print("\n")
print("With scientifically significant value of 0.0001, Degree of Freedom: 9\n")
for feature in ['Median Income','Average Income'] :
    cont_table = pd.crosstab(class_label,features[feature])
    chi2_val, p, dof, expected = stats.chi2_contingency(cont_table.values, correction=False)
    print('Chi2 value for feature', feature,': ',chi2_val)
    if(p < 0.0001) : 
        print('Null hypothesis rejected for feature', feature, '-> p value:', p, '\n')
    else :
        print('Null hypothesis accepted for feature', feature, '-> p value:', p, '\n')

Feature Selection on IV: Median Income & Average Income, DV: Percentage of Study Scores 40 and Over.
4 Rows & 4 Cols for the Contingency tables

With scientifically significant value of 0.05, Degree of Freedom: 9

Chi2 value for feature Median Income :  157.93066866988966
Null hypothesis rejected for feature Median Income -> p value: 1.6241736778085346e-31 

Chi2 value for feature Average Income :  20.966560509554142
Null hypothesis rejected for feature Average Income -> p value: 0.0003215401405155043 



With scientifically significant value of 0.0001, Degree of Freedom: 9

Chi2 value for feature Median Income :  157.93066866988966
Null hypothesis rejected for feature Median Income -> p value: 1.6241736778085346e-31 

Chi2 value for feature Average Income :  20.966560509554142
Null hypothesis accepted for feature Average Income -> p value: 0.0003215401405155043 



In [17]:
data = pd.DataFrame(np.array(median_average_medScore), columns=['Median Income','Average Income','VCE Median Score'])
features=data[['Median Income','Average Income']]
class_label = data['VCE Median Score']
print("Feature Selection on IV: Median Income & Average Income, DV: VCE Median Score.\n4 Rows & 4 Cols for the Contingency tables\n")
print("With scientifically significant value of 0.05, , Degree of Freedom: 9\n")
for feature in ['Median Income','Average Income'] :
    cont_table = pd.crosstab(class_label,features[feature])
    chi2_val, p, dof, expected = stats.chi2_contingency(cont_table.values, correction=False)
    print('Chi2 value for feature', feature,': ',chi2_val)
    if(p < 0.005) : 
        print('Null hypothesis rejected for feature', feature, '-> p value:', p, '\n')
    else :
        print('Null hypothesis accepted for feature', feature, '-> p value:', p, '\n')
        
print("\n")
print("With scientifically significant value of 0.0001, Degree of Freedom: 9\n")
for feature in ['Median Income','Average Income'] :
    cont_table = pd.crosstab(class_label,features[feature])
    chi2_val, p, dof, expected = stats.chi2_contingency(cont_table.values, correction=False)
    print('Chi2 value for feature', feature,': ',chi2_val)
    if(p < 0.0001) : 
        print('Null hypothesis rejected for feature', feature, '-> p value:', p, '\n')
    else :
        print('Null hypothesis accepted for feature', feature, '-> p value:', p, '\n')

Feature Selection on IV: Median Income & Average Income, DV: VCE Median Score.
4 Rows & 4 Cols for the Contingency tables

With scientifically significant value of 0.05, , Degree of Freedom: 9

Chi2 value for feature Median Income :  142.6623683045903
Null hypothesis rejected for feature Median Income -> p value: 2.9081959807203777e-26 

Chi2 value for feature Average Income :  158.35643564356437
Null hypothesis rejected for feature Average Income -> p value: 1.3197366168946291e-31 



With scientifically significant value of 0.0001, Degree of Freedom: 9

Chi2 value for feature Median Income :  142.6623683045903
Null hypothesis rejected for feature Median Income -> p value: 2.9081959807203777e-26 

Chi2 value for feature Average Income :  158.35643564356437
Null hypothesis rejected for feature Average Income -> p value: 1.3197366168946291e-31 

