In [1]:
# Predicting whether a fellow will ultimately get placed at a company or not

In [2]:
# Importing libraries and dataset
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import category_encoders as ce

# Read the Excel workbook (path is relative to the notebook's working directory)
dataset_path = "Data_Pathrise.xlsx"
data = pd.read_excel(dataset_path)

In [3]:
# Preview the first five rows to get a feel for the columns and their values
data.head(n=5)

Unnamed: 0,id,pathrise_status,primary_track,cohort_tag,program_duration_days,placed,employment_status,highest_level_of_education,length_of_job_search,biggest_challenge_in_search,professional_experience,work_authorization_status,number_of_interviews,number_of_applications,gender,race
0,1,Active,SWE,OCT19A,,0,Unemployed,Bachelor's Degree,3-5 months,Hearing back on my applications,3-4 years,Canada Citizen,2.0,900,Male,Non-Hispanic White or Euro-American
1,2,Active,PSO,JAN20A,,0,Unemployed,"Some College, No Degree",3-5 months,Getting past final round interviews,1-2 years,Citizen,6.0,0,Male,Non-Hispanic White or Euro-American
2,3,Closed Lost,Design,AUG19B,0.0,0,Employed Part-Time,Master's Degree,Less than one month,Figuring out which jobs to apply for,Less than one year,Citizen,0.0,0,Male,East Asian or Asian American
3,4,Closed Lost,PSO,AUG19B,0.0,0,Contractor,Bachelor's Degree,Less than one month,Getting past final round interviews,Less than one year,Citizen,5.0,25,Male,Decline to Self Identify
4,5,Placed,SWE,AUG19A,89.0,1,Unemployed,Bachelor's Degree,1-2 months,Hearing back on my applications,1-2 years,F1 Visa/OPT,10.0,100,Male,East Asian or Asian American


In [4]:
# The data comprises information about 4 categories of applicants
# 1 - Applicants who got placed through the program
# 2 - Applicants who failed to get placed through the program
# 3 - Applicants who did not accept admission offer or did not continue after the free 14 day trial period
# 4 - Applicants who are currently enrolled in the program

# The predictive model only needs the first 2 categories described above, so keep
# just the rows whose pathrise_status is one of the statuses listed below and
# discard every other row

outcome_statuses = ['MIA', 'Placed', 'Withdrawn', 'Withdrawn (Failed)']
data = data[data['pathrise_status'].isin(outcome_statuses)]

In [5]:
# Data Preprocessing

# Four columns (id, pathrise_status, cohort_tag and program_duration_days) carry no
# valuable information for the model, so only the remaining columns are kept.
# Columns are picked by position: 2, then 6 through 15, and finally 5 so that
# the column at position 5 ends up last.

kept_positions = [2, *range(6, 16), 5]
data = data.iloc[:, kept_positions]

# The source header 'employment_status ' has a trailing space; normalise it
data = data.rename(columns={"employment_status ": "employment_status"})

In [6]:
# Count the missing values in every feature
# 9 of the features contain at least one missing value
data.isna().sum()

primary_track                    0
employment_status              150
highest_level_of_education      30
length_of_job_search            55
biggest_challenge_in_search     16
professional_experience        109
work_authorization_status      186
number_of_interviews           134
number_of_applications           0
gender                         288
race                            12
placed                           0
dtype: int64

In [7]:
# Imputing missing values for gender feature
# Let's check the frequency of available values in the gender column
# (note: only the last expression of a cell is displayed, so run this line on
# its own to actually see the counts)
data['gender'].value_counts()

# We can see that ~ 75% of the values are Male so it's good enough to impute the missing values in this column with Male
# Also we can remove the rows which contain the values 'Non-Binary' and 'Decline to Self Identify' since there are just 2 and 6 instances of them respectively.
# By eliminating them we can reduce the number of categories to 2 which will make the model simpler

rare_genders = ['Non-Binary', 'Decline to Self Identify']

# Build a fresh frame instead of dropping in place: `data` was derived from
# earlier selections, and mutating such a frame in place triggers pandas'
# copy warnings. Rows with a missing gender are kept (isin is False for NaN)
# so they can be imputed below.
data = data[~data['gender'].isin(rare_genders)].copy()

# Assign the filled column back. `data['gender'].fillna(..., inplace=True)`
# operates on a temporary Series; it is deprecated and has no effect on `data`
# once pandas Copy-on-Write is active.
data['gender'] = data['gender'].fillna(data['gender'].mode()[0])

In [8]:
# Treating missing values for highest_level_of_education, biggest_challenge_in_search, length_of_job_search and race
# Only a small share of rows lack a value in these columns, so those rows are
# simply removed instead of being imputed

low_missing_columns = [
    'highest_level_of_education',
    'biggest_challenge_in_search',
    'length_of_job_search',
    'race',
]
data = data.dropna(subset=low_missing_columns)


In [9]:
# Imputing missing values for employment_status using mode
# Combining the labels to fall under either 'Employed' or 'Unemployed'
employment_groups = {
    "Student": "Unemployed",
    "Employed Full-Time": "Employed",
    "Employed Part-Time": "Employed",
    "Contractor": "Employed",
}

# Restrict the relabelling to the employment_status column; a frame-wide
# replace would also rewrite any identical string found in another column.
employment = data['employment_status'].replace(employment_groups)

# Fill the gaps with the most frequent label and write the column back through
# assign(). The previous `data[col].fillna(..., inplace=True)` acted on a
# temporary Series, which is deprecated and a no-op under Copy-on-Write.
data = data.assign(employment_status=employment.fillna(employment.mode()[0]))

In [10]:
# Imputing missing values for work_authorization_status using mode
# Combining similar visas together
visa_groups = {
    "F1 Visa/OPT": "F1",
    "F1 Visa/CPT": "F1",
    "STEM OPT": "F1",
    "Other": "H1B",
}

# Restrict the relabelling to the work_authorization_status column; a
# frame-wide replace would turn every "Other" in any column into "H1B".
work_auth = data['work_authorization_status'].replace(visa_groups)

# Fill the gaps with the most frequent label and write the column back through
# assign(). The previous `data[col].fillna(..., inplace=True)` acted on a
# temporary Series, which is deprecated and a no-op under Copy-on-Write.
data = data.assign(work_authorization_status=work_auth.fillna(work_auth.mode()[0]))

In [11]:
# Imputing missing values for professional_experience using mode
# The filled column is assigned back explicitly: calling fillna(inplace=True)
# on `data[col]` mutates a temporary Series, which is deprecated and leaves
# `data` unchanged under pandas Copy-on-Write.
data['professional_experience'] = data['professional_experience'].fillna(data['professional_experience'].mode()[0])

In [12]:
# Imputing missing values for number_of_interviews using median
# The filled column is assigned back explicitly: calling fillna(inplace=True)
# on `data[col]` mutates a temporary Series, which is deprecated and leaves
# `data` unchanged under pandas Copy-on-Write.
data['number_of_interviews'] = data['number_of_interviews'].fillna(data['number_of_interviews'].median())

In [13]:
# Categorical data encoding
# Most ML models only accept numeric features, so the categorical variables are
# converted to numbers the model can work with.
# highest_level_of_education, professional_experience and length_of_job_search
# are ordinal, so each gets an explicit label -> rank mapping.

education_rank = {
    'Some High School': 0,
    'High School Graduate': 1,
    'GED or equivalent': 2,
    'Some College, No Degree': 3,
    "Bachelor's Degree": 4,
    "Master's Degree": 5,
    'Doctorate or Professional Degree': 6,
}
experience_rank = {
    'Less than one year': 0,
    '1-2 years': 1,
    '3-4 years': 2,
    '5+ years': 3,
}
job_search_rank = {
    'Less than one month': 0,
    '1-2 months': 1,
    '3-5 months': 2,
    '6 months to a year': 3,
    'Over a year': 4,
}

ordinal_mappings = [
    {'col': 'highest_level_of_education', 'mapping': education_rank},
    {'col': 'professional_experience', 'mapping': experience_rank},
    {'col': 'length_of_job_search', 'mapping': job_search_rank},
]
ordinal_columns = [entry['col'] for entry in ordinal_mappings]

encoder = ce.OrdinalEncoder(cols=ordinal_columns, return_df=True, mapping=ordinal_mappings)
data_transformed = encoder.fit_transform(data)

In [14]:
# employment_status and gender are turned into 0/1 flags
# str.contains is case-sensitive by default, so "Unemployed" does not match
# "Employed" and maps to 0 (presumably the same holds for the non-"Male"
# gender label — confirm against the data)
is_employed = data_transformed["employment_status"].str.contains("Employed")
is_male = data_transformed["gender"].str.contains("Male")
data_transformed["employment_status"] = np.where(is_employed, 1, 0)
data_transformed["gender"] = np.where(is_male, 1, 0)

In [15]:
# Every categorical feature that is still left is dummy-encoded; the first
# level of each one is dropped (drop_first=True)
data_transformed = pd.get_dummies(data=data_transformed, drop_first=True)

In [16]:
# Feature Selection
# Chi-Squared Statistic
# From the below test it's very difficult to say which are the most important features other than number_of_applications

from sklearn.feature_selection import chi2
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest

# Separate the predictors from the target. The column is named via `columns=`
# because DataFrame.drop no longer accepts `axis` as a positional argument
# (drop('placed', 1) raises TypeError on pandas >= 2.0).
X = data_transformed.drop(columns='placed')
Y = data_transformed['placed']

# 70/30 split with a fixed seed so the downstream model cells see the same rows on every run
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=101)

# k='all' keeps every feature; the selector is used only to obtain the chi2 scores
fs = SelectKBest(score_func=chi2, k='all')
fs.fit(X_train, y_train)
X_train_fs = fs.transform(X_train)
X_test_fs = fs.transform(X_test)

# One line per feature, numbered by the feature's position in X
for i, score in enumerate(fs.scores_):
    print('Feature %d: %f' % (i, score))

Feature 0: 1.687555
Feature 1: 0.182737
Feature 2: 0.009367
Feature 3: 0.099443
Feature 4: 0.776853
Feature 5: 210.006115
Feature 6: 0.205945
Feature 7: 0.255779
Feature 8: 1.707042
Feature 9: 3.656150
Feature 10: 1.333540
Feature 11: 3.414085
Feature 12: 0.097008
Feature 13: 0.468873
Feature 14: 0.098929
Feature 15: 0.281249
Feature 16: 0.099821
Feature 17: 0.001242
Feature 18: 0.342288
Feature 19: 0.001297
Feature 20: 2.357524
Feature 21: 2.345113
Feature 22: 1.716787
Feature 23: 0.881383
Feature 24: 0.428022
Feature 25: 1.707042
Feature 26: 0.318473
Feature 27: 0.333584
Feature 28: 0.402899
Feature 29: 1.089389
Feature 30: 1.171617
Feature 31: 0.009522
Feature 32: 0.713799
Feature 33: 0.360023


In [17]:
# Logistic regression trained on the full feature set
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
# train the classifier (fit returns the estimator itself)
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
# predict labels for the held-out split
yhat = model.predict(X_test)
# share of test rows predicted correctly, reported as a percentage
accuracy = accuracy_score(y_true=y_test, y_pred=yhat)
print('Accuracy: %.2f' % (accuracy*100))

Accuracy: 65.62


In [18]:
# SVM trained on the full feature set
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
# standardise the features, then feed them to an SVC with gamma='auto'
svm_steps = (StandardScaler(), SVC(gamma='auto'))
clf = make_pipeline(*svm_steps)
clf.fit(X_train, y_train)
# predict labels for the held-out split
yhat = clf.predict(X_test)
# share of test rows predicted correctly, reported as a percentage
accuracy = accuracy_score(y_true=y_test, y_pred=yhat)
print('Accuracy: %.2f' % (accuracy*100))

Accuracy: 65.62


In [19]:
# Creating a RandomForest model using all features
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
# Learn the scaling statistics from the training split only and apply the very
# same transformation to the test split. Scaling each split independently (as
# preprocessing.scale(X_test) did) standardises the test rows with their own
# mean/std, so train and test features no longer share a common scale.
scaler = preprocessing.StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# fit the model; a fixed seed makes the reported accuracy reproducible
# (same seed as the GradientBoosting cell)
clf = RandomForestClassifier(random_state=0)
clf.fit(X_train_scaled, y_train)
# evaluate the model
yhat = clf.predict(X_test_scaled)
# evaluate predictions
accuracy = accuracy_score(y_test, yhat)
print('Accuracy: %.2f' % (accuracy*100))

Accuracy: 62.95


In [20]:
# Gradient boosting trained on the full feature set
from sklearn.ensemble import GradientBoostingClassifier
# train the classifier with a fixed seed (fit returns the estimator itself)
clf = GradientBoostingClassifier(random_state=0).fit(X_train, y_train)
# predict labels for the held-out split
yhat = clf.predict(X_test)
# share of test rows predicted correctly, reported as a percentage
accuracy = accuracy_score(y_true=y_test, y_pred=yhat)
print('Accuracy: %.2f' % (accuracy*100))

Accuracy: 62.71


In [21]:
# Comparing results of different models
# Logistic regression and SVM share the best accuracy so far (65.62%)
# Next step: try a different kernel for the SVM

from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
# standardise the features, then feed them to an SVC with a linear kernel
linear_svm_steps = (StandardScaler(), SVC(kernel='linear'))
clf = make_pipeline(*linear_svm_steps)
clf.fit(X_train, y_train)
# predict labels for the held-out split
yhat = clf.predict(X_test)
# share of test rows predicted correctly, reported as a percentage
accuracy = accuracy_score(y_true=y_test, y_pred=yhat)
print('Accuracy: %.2f' % (accuracy*100))

# Switching to the linear kernel gives a slightly higher accuracy

Accuracy: 67.07


In [22]:
# Final Model
# The linear SVM classifier turns out to be the best model, with an accuracy of 67.07% on the held-out 30% test split