#  CHI<sup>2</sup>-TEST: Testing the Significance

In [1]:
# general sys modules / libraries
import sys
import warnings  
warnings.filterwarnings('ignore') 

# data analysis and visualisation modules / libraries
import numpy as np
from numpy import loadtxt

import pandas as pd

import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
from matplotlib.ticker import PercentFormatter


import seaborn as sns

import scipy.stats as scs
from scipy import stats

# machine learning modules / libraries
import xgboost as xgb
from xgboost import plot_importance
from xgboost import XGBClassifier
from xgboost import XGBRegressor

import sklearn
from sklearn.dummy import DummyClassifier
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [2]:
# Load the training data for the chi-square tests from CSV
data = 'datasets/train_chi.csv'
train_chi = pd.read_csv(filepath_or_buffer=data)

### Finding 1: City development Index: more than 0.6
***

In [3]:
# Contingency table: `target` as rows, `cdi_split` as columns
crosstab = pd.crosstab(
    index=train_chi['target'],
    columns=train_chi['cdi_split'],
)
display(crosstab)
# Legend for the column codes of the table shown above
print("City development index (cdi) split: 0 = index more than 0.6 | 1 = index up to 0.6")

cdi_split,0,1
target,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,12264,1329
1.0,2584,1837


City development index (cdi) split: 0 = index more than 0.6 | 1 = index up to 0.6


In [4]:
# Chi-square test of independence on the current `crosstab`; the cell output
# lists the statistic, p-value, degrees of freedom and expected frequencies.
# The default `correction=True` is kept (continuity correction for dof == 1).
stats.chi2_contingency(observed=crosstab)

(2322.8294951089388,
 0.0,
 1,
 array([[11204.0004441,  2388.9995559],
        [ 3643.9995559,   777.0004441]]))

### Finding 2: Company size of current employer: no info
***

In [5]:
# Contingency table: `target` as rows, `company_size_split` as columns
crosstab = pd.crosstab(
    index=train_chi['target'],
    columns=train_chi['company_size_split'],
)
display(crosstab)
# Legend for the column codes. The two classes must be complementary: class 1
# is "no info", so class 0 is every row where a company size is given (the old
# text "not 10-49 employees" would also have covered the "no info" rows).
print("Company size: 0 = company size info available | 1 = no info")

company_size_split,0,1
target,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,10489,3104
1.0,2215,2206


Company size: 0 = not 10-49 employees | 1 = no info


In [6]:
# Chi-square test of independence on the current `crosstab`; the cell output
# lists the statistic, p-value, degrees of freedom and expected frequencies.
# The default `correction=True` is kept (continuity correction for dof == 1).
stats.chi2_contingency(observed=crosstab)

(1174.0306714640449,
 2.6866438940695114e-257,
 1,
 array([[9586.18141446, 4006.81858554],
        [3117.81858554, 1303.18141446]]))

### Finding 3: Company type of current employer: Pvt Ltd
***

In [7]:
# Contingency table: `target` as rows, `company_type_split` as columns
crosstab = pd.crosstab(
    index=train_chi['target'],
    columns=train_chi['company_type_split'],
)
display(crosstab)
# Legend for the column codes of the table shown above
print("Company type: 0 = not Pvt Ltd | 1 = Pvt Ltd")

company_type_split,0,1
target,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,5792,7801
1.0,2747,1674


Company type: 0 = not Pvt Ltd | 1 = Pvt Ltd


In [8]:
# Chi-square test of independence on the current `crosstab`; the cell output
# lists the statistic, p-value, degrees of freedom and expected frequencies.
# The default `correction=True` is kept (continuity correction for dof == 1).
stats.chi2_contingency(observed=crosstab)

(509.3063922690518,
 8.976851603407026e-113,
 1,
 array([[6443.35666704, 7149.64333296],
        [2095.64333296, 2325.35666704]]))

### Finding 4: Education level: Graduate
***

In [9]:
# Contingency table: `target` as rows, `education_level_split_g` as columns
crosstab = pd.crosstab(
    index=train_chi['target'],
    columns=train_chi['education_level_split_g'],
)
display(crosstab)
# Legend for the column codes of the table shown above
print("Education level: 0 = not Graduate | 1 = Graduate")

education_level_split_g,0,1
target,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,5478,8115
1.0,1348,3073


Education level: 0 = not Graduate | 1 = Graduate


In [10]:
# Chi-square test of independence on the current `crosstab`; the cell output
# lists the statistic, p-value, degrees of freedom and expected frequencies.
# The default `correction=True` is kept (continuity correction for dof == 1).
stats.chi2_contingency(observed=crosstab)

(135.98044987084796,
 2.0150471087871705e-31,
 1,
 array([[5150.76151882, 8442.23848118],
        [1675.23848118, 2745.76151882]]))

### Finding 5: Relevant Experience in Data Science: experience
***

In [11]:
# Contingency table: `target` as rows, `relevant_experience_split` as columns
crosstab = pd.crosstab(
    index=train_chi['target'],
    columns=train_chi['relevant_experience_split'],
)
display(crosstab)
# Legend for the column codes of the table shown above
# NOTE(review): confirm the 0/1 coding against the step that built this column
print("Relevant experience: 0 = no experience in data science | 1 = experience in data science")

relevant_experience_split,0,1
target,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,10418,3175
1.0,2772,1649


Relevant experience: 0 = no experience in data science | 1 = experience in data science


In [12]:
# Chi-square test of independence on the current `crosstab`; the cell output
# lists the statistic, p-value, degrees of freedom and expected frequencies.
# The default `correction=True` is kept (continuity correction for dof == 1).
stats.chi2_contingency(observed=crosstab)

(329.97993304686355,
 9.708630381245994e-74,
 1,
 array([[9952.9071833, 3640.0928167],
        [3237.0928167, 1183.9071833]]))

### Finding 6: Last job: 0 year between previous and current job
***

In [13]:
# Contingency table: `target` as rows, `last_new_job_split` as columns
crosstab = pd.crosstab(
    index=train_chi['target'],
    columns=train_chi['last_new_job_split'],
)
display(crosstab)
# Legend for the column codes. Class 1 is "0 year", so its complement is
# "not 0 year" (the previous text said "not 1 year", which did not match).
print("Last Job: 0 = not 0 year between previous and current job | 1 = 0 year between previous and current job")

last_new_job_split,0,1
target,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,12089,1504
1.0,3738,683


Last Job: 0 = not 1 year between previous and current job | 1 = 0 year between previous and current job


In [14]:
# Chi-square test of independence on the current `crosstab`; the cell output
# lists the statistic, p-value, degrees of freedom and expected frequencies.
# The default `correction=True` is kept (continuity correction for dof == 1).
stats.chi2_contingency(observed=crosstab)

(59.71170762876474,
 1.0982128930576504e-14,
 1,
 array([[11942.73404019,  1650.26595981],
        [ 3884.26595981,   536.73404019]]))

### Finding 7: Work Experience: up to 5 years
***

In [15]:
# Contingency table: `target` as rows, `experience_split_5` as columns
crosstab = pd.crosstab(
    index=train_chi['target'],
    columns=train_chi['experience_split_5'],
)
display(crosstab)
# Legend for the column codes. Class 1 is "up to 5 years", so class 0 is
# "more than 5 years" (the previous text was copied from the 20-year split).
print("Experience split: 0 = work experience more than 5 years | 1 = work experience up to 5 years")

experience_split_5,0,1
target,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,10735,2858
1.0,2835,1586


Experience split: 0 = work experience not more than 20 years | 1 = work experience up to 5 years


In [16]:
# Chi-square test of independence on the current `crosstab`; the cell output
# lists the statistic, p-value, degrees of freedom and expected frequencies.
# The default `correction=True` is kept (continuity correction for dof == 1).
stats.chi2_contingency(observed=crosstab)

(394.9956284387903,
 6.766130893161576e-88,
 1,
 array([[10239.64749639,  3353.35250361],
        [ 3330.35250361,  1090.64749639]]))

### Finding 8: Work Experience: more than 20 years
***

In [17]:
# Contingency table: `target` as rows, `experience_split_20` as columns
crosstab = pd.crosstab(
    index=train_chi['target'],
    columns=train_chi['experience_split_20'],
)
display(crosstab)
# Legend for the column codes of the table shown above
print("Experience split: 0 = work experience not more than 20 years | 1 = work experience more than 20 years")

experience_split_20,0,1
target,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,10897,2696
1.0,3935,486


Experience split: 0 = work experience not more than 20 years | 1 = work experience more than 20 years


In [18]:
# Chi-square test of independence on the current `crosstab`; the cell output
# lists the statistic, p-value, degrees of freedom and expected frequencies.
# The default `correction=True` is kept (continuity correction for dof == 1).
stats.chi2_contingency(observed=crosstab)

(178.6695377797575,
 9.460629348890436e-41,
 1,
 array([[11191.92716776,  2401.07283224],
        [ 3640.07283224,   780.92716776]]))

### Finding 9: Enrolled University: no course
***

In [19]:
# Contingency table: `target` as rows, `enrolled_university_split` as columns
crosstab = pd.crosstab(
    index=train_chi['target'],
    columns=train_chi['enrolled_university_split'],
)
display(crosstab)
# Legend for the column codes. Class 1 is "no course", so class 0 must be its
# complement; the previous "no full time course" overlapped with class 1.
# NOTE(review): confirm the wording against the step that built this column.
print("Enrolled University: 0 = enrolled in a course at university | 1 = no course at university")

enrolled_university_split,0,1
target,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,3051,10542
1.0,1615,2806


Enrolled University: 0 = no full time course at university | 1 = no course at university


In [20]:
# Chi-square test of independence on the current `crosstab`; the cell output
# lists the statistic, p-value, degrees of freedom and expected frequencies.
# The default `correction=True` is kept (continuity correction for dof == 1).
stats.chi2_contingency(observed=crosstab)

(344.08275774296783,
 8.236334285791009e-77,
 1,
 array([[ 3520.86921283, 10072.13078717],
        [ 1145.13078717,  3275.86921283]]))

### Finding 10: Education level: High School
***

In [21]:
# Contingency table: `target` as rows, `education_level_split_h` as columns
crosstab = pd.crosstab(
    index=train_chi['target'],
    columns=train_chi['education_level_split_h'],
)
display(crosstab)
# Legend for the column codes. Class 1 is "High School", so class 0 is
# "not High School" (the previous text was copied from the Graduate split).
print("Education level: 0 = not High School | 1 = High School")

education_level_split_h,0,1
target,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,12057,1536
1.0,4049,372


Education level: 0 = not Graduate | 1 = High School


In [22]:
# Chi-square test of independence on the current `crosstab`; the cell output
# lists the statistic, p-value, degrees of freedom and expected frequencies.
# The default `correction=True` is kept (continuity correction for dof == 1).
stats.chi2_contingency(observed=crosstab)

(29.02773481966645,
 7.134944157700591e-08,
 1,
 array([[12153.26179638,  1439.73820362],
        [ 3952.73820362,   468.26179638]]))

Variable name | Chi<sup>2</sup> value | p-value | significant?
:--- | ---: | ---: | ---:
1. City development Index: more than 0.6 | 2322.83 | < .001 | significant
2. Company size: no info | 1174.03 |  < .001 | significant
3. Company type of current employer: Pvt Ltd | 509.31 | < .001 | significant
4. Education level: Graduate | 135.98 |  < .001 | significant
5. Relevant Experience in Data Science: has experience | 329.98 | < .001 | significant
6. Last job: 0 year between previous and current job | 59.71 | < .001 | significant
7. Work Experience: up to 5 years | 395.00 | < .001 | significant
8. Work Experience: more than 20 years | 178.67 | < .001 | significant
9. Enrolled University: no course | 344.08 | < .001 | significant
10. Education level: High School |  29.03|   < .001 | significant