In [1]:
# libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# load the two raw CSV extracts (the primary file and the additional file)
dataset1, dataset2 = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../Data/survivor_employment_data.csv",
        "../Data/survivor_employment_data_additional.csv",
    )
)

In [15]:
# show the rows of dataset1 in which every column is missing
dataset1.loc[dataset1.isna().all(axis="columns")]

Unnamed: 0,JOB_MATCH,S_NAME,S_AGE,S_GENDER,S_EDUCATION,S_UNDERSTANDING_COMM,S_GETTING_ALONG_PEOPLE,S_LIFE_ACT_HOUSE,S_LIFE_ACT_WORK,S_PARTICIPATE_SOCIETY,S_PREV_EMP,S_TOTAL_EXPERIENCE,JOB_DESCRIPTION,J_LANGUAGE,J_COMMUNICATION,J_PROB_SOLVE,J_ATTEN_VIGIL,J_ADAPT_INTERACT


In [14]:
# show the rows of dataset2 in which every column is missing
dataset2.loc[dataset2.isna().all(axis="columns")]

Unnamed: 0,JOB_MATCH,S_NAME,S_AGE,S_GENDER,S_EDUCATION,S_UNDERSTANDING_COMM,S_GETTING_ALONG_PEOPLE,S_LIFE_ACT_HOUSE,S_LIFE_ACT_WORK,S_PARTICIPATE_SOCIETY,S_PREV_EMP,S_TOTAL_EXPERIENCE,JOB_DESCRIPTION,J_LANGUAGE,J_COMMUNICATION,J_PROB_SOLVE,J_ATTEN_VIGIL,J_ADAPT_INTERACT


In [13]:
# drop the rows of dataset2 in which every column is missing
# (rows with at least one non-null value are kept, index labels are preserved)
dataset2 = dataset2.dropna(how="all")

In [17]:
# stack dataset1 on top of dataset2 to form the master dataset;
# ignore_index=True renumbers the rows 0..n-1 instead of keeping each file's own index
source_frames = [dataset1, dataset2]
master_ds = pd.concat(source_frames, axis="index", ignore_index=True)

In [18]:
master_ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3176 entries, 0 to 3175
Data columns (total 18 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   JOB_MATCH               3176 non-null   object 
 1   S_NAME                  3176 non-null   object 
 2   S_AGE                   3176 non-null   float64
 3   S_GENDER                3176 non-null   object 
 4   S_EDUCATION             3176 non-null   object 
 5   S_UNDERSTANDING_COMM    3176 non-null   float64
 6   S_GETTING_ALONG_PEOPLE  3176 non-null   float64
 7   S_LIFE_ACT_HOUSE        3176 non-null   float64
 8   S_LIFE_ACT_WORK         3176 non-null   float64
 9   S_PARTICIPATE_SOCIETY   3176 non-null   float64
 10  S_PREV_EMP              3176 non-null   float64
 11  S_TOTAL_EXPERIENCE      3176 non-null   float64
 12  JOB_DESCRIPTION         3176 non-null   object 
 13  J_LANGUAGE              3176 non-null   object 
 14  J_COMMUNICATION         3176 non-null   

In [19]:
# work on an independent (deep) copy so that master_ds itself is left untouched
dataset = master_ds.copy(deep=True)

In [21]:
# collect the object-dtype (non-numeric) columns into their own frame
dataset_obj = dataset.select_dtypes(include=object)

In [22]:
# this is just to consolidate and visualize 
# actual operations will still be performed on 'dataset'
dataset_obj

Unnamed: 0,JOB_MATCH,S_NAME,S_GENDER,S_EDUCATION,JOB_DESCRIPTION,J_LANGUAGE,J_COMMUNICATION,J_PROB_SOLVE,J_ATTEN_VIGIL,J_ADAPT_INTERACT
0,N,Padama Harbhajan Mogul,Female,Primary School,Marketing,Y,Y,N,N,N
1,N,Anusha Toor,Female,Primary School,Marketing,Y,Y,N,N,N
2,N,Vineeta Sushant Butala,Female,Primary School,Marketing,Y,Y,N,N,N
3,N,Hema Radhakrishnan,Female,Primary School,Marketing,Y,Y,N,N,N
4,N,Nupoor Deep Biyani,Female,Primary School,Marketing,Y,Y,N,N,N
...,...,...,...,...,...,...,...,...,...,...
3171,Y,AB Balambika ST,Female,Post Graduation,Executive,N,N,Y,N,N
3172,Y,AB Bagheswar ST,Female,Undergraduation,Executive,N,N,Y,N,N
3173,Y,AB Baghya La ST,Female,Undergraduation,Executive,N,N,Y,N,N
3174,Y,AB Triloka v ST,Female,Undergraduation,Executive,N,N,Y,N,N


In [39]:
dataset_obj.columns

Index(['JOB_MATCH', 'S_NAME', 'S_GENDER', 'S_EDUCATION', 'JOB_DESCRIPTION',
       'J_LANGUAGE', 'J_COMMUNICATION', 'J_PROB_SOLVE', 'J_ATTEN_VIGIL',
       'J_ADAPT_INTERACT'],
      dtype='object')

## Convert categorical (object) columns into numeric 0/1 indicator columns

1. JOB_MATCH

In [25]:
dataset['JOB_MATCH'].value_counts()

Y    1642
N    1534
Name: JOB_MATCH, dtype: int64

In [27]:
# Encode the response explicitly: 1 where JOB_MATCH == 'Y', 0 otherwise.
# Comparing against 'Y' does not depend on which category get_dummies(drop_first=True)
# happens to drop (alphabetical order), and the uint8 cast keeps a 0/1 integer column
# regardless of the pandas version's default dummy dtype.
# The result stays a one-column DataFrame named 'JOB_MATCH', as later cells expect.
job_match = dataset['JOB_MATCH'].eq('Y').astype('uint8').to_frame('JOB_MATCH')

In [29]:
job_match['JOB_MATCH'].value_counts()

1    1642
0    1534
Name: JOB_MATCH, dtype: int64

2. S_GENDER

In [31]:
dataset['S_GENDER'].value_counts()

Male      1615
Female    1558
u            3
Name: S_GENDER, dtype: int64

In [33]:
# inspect the rows whose gender is recorded as 'u' (nothing is modified in this cell)
dataset[dataset['S_GENDER'].eq('u')]

Unnamed: 0,JOB_MATCH,S_NAME,S_AGE,S_GENDER,S_EDUCATION,S_UNDERSTANDING_COMM,S_GETTING_ALONG_PEOPLE,S_LIFE_ACT_HOUSE,S_LIFE_ACT_WORK,S_PARTICIPATE_SOCIETY,S_PREV_EMP,S_TOTAL_EXPERIENCE,JOB_DESCRIPTION,J_LANGUAGE,J_COMMUNICATION,J_PROB_SOLVE,J_ATTEN_VIGIL,J_ADAPT_INTERACT
2114,Y,Devika Aarif Dora C,34.0,u,High School,2.0,2.0,1.0,2.0,1.0,3.0,5.0,Marketing,Y,Y,N,N,N
2608,Y,AB Devika Aarif Dora C,34.0,u,High School,2.0,2.0,1.0,2.0,1.0,3.0,5.0,Marketing,Y,Y,N,N,N
3102,Y,AB Devika Aa ST,34.0,u,High School,2.0,2.0,1.0,2.0,1.0,3.0,5.0,Marketing,Y,Y,N,N,N


In [34]:
# overwrite the 'u' gender entries with 'Female'
is_unknown_gender = dataset['S_GENDER'].eq('u')
dataset.loc[is_unknown_gender, 'S_GENDER'] = 'Female'

In [35]:
# confirm the replacement: no rows with gender 'u' should remain (expect an empty frame)
dataset[dataset['S_GENDER'].eq('u')]

Unnamed: 0,JOB_MATCH,S_NAME,S_AGE,S_GENDER,S_EDUCATION,S_UNDERSTANDING_COMM,S_GETTING_ALONG_PEOPLE,S_LIFE_ACT_HOUSE,S_LIFE_ACT_WORK,S_PARTICIPATE_SOCIETY,S_PREV_EMP,S_TOTAL_EXPERIENCE,JOB_DESCRIPTION,J_LANGUAGE,J_COMMUNICATION,J_PROB_SOLVE,J_ATTEN_VIGIL,J_ADAPT_INTERACT


In [36]:
dataset['S_GENDER'].value_counts()

Male      1615
Female    1561
Name: S_GENDER, dtype: int64

In [37]:
# Encode gender explicitly: GENDER = 1 for 'Male', 0 otherwise (i.e. 'Female').
# This is the same coding get_dummies(drop_first=True) produced on the two-category column,
# but it no longer depends on alphabetical category order, and the positional rename can
# no longer fail if an extra category (such as the earlier 'u') is still present.
# uint8 keeps a 0/1 integer column regardless of the pandas version's default dummy dtype.
gender = dataset['S_GENDER'].eq('Male').astype('uint8').to_frame('GENDER')

In [38]:
gender['GENDER'].value_counts()

1    1615
0    1561
Name: GENDER, dtype: int64

3. S_EDUCATION

In [41]:
dataset['S_EDUCATION'].value_counts()

High School        1331
Undergraduation     927
Primary School      778
Post Graduation     140
Name: S_EDUCATION, dtype: int64

In [42]:
# S_EDUCATION has more than two categories, so keep one indicator column per category
# (drop_first is left at its default of False)
# NOTE(review): the full set of indicators sums to 1 on every row — confirm that keeping
# all of them is intended for the linear model trained later.
education = pd.get_dummies(dataset['S_EDUCATION'])

In [43]:
education

Unnamed: 0,High School,Post Graduation,Primary School,Undergraduation
0,0,0,1,0
1,0,0,1,0
2,0,0,1,0
3,0,0,1,0
4,0,0,1,0
...,...,...,...,...
3171,0,1,0,0
3172,0,0,0,1
3173,0,0,0,1
3174,0,0,0,1


In [44]:
# Give the indicator columns short, prefixed names.
# An explicit category -> label mapping is used instead of assigning a list by position,
# so a label can never end up on the wrong column if the category order changes.
# Re-running the cell is harmless: already-renamed columns simply do not match any key.
EDUCATION_LABELS = {
    'High School': 'EDU_HSE',
    'Post Graduation': 'EDU_PG',
    'Primary School': 'EDU_PRIM',
    'Undergraduation': 'EDU_UG',
}
education = education.rename(columns=EDUCATION_LABELS)

# fail loudly if a category is missing or an unexpected one has appeared
assert set(education.columns) == set(EDUCATION_LABELS.values()), (
    f"unexpected education columns: {list(education.columns)}"
)

In [45]:
education

Unnamed: 0,EDU_HSE,EDU_PG,EDU_PRIM,EDU_UG
0,0,0,1,0
1,0,0,1,0
2,0,0,1,0
3,0,0,1,0
4,0,0,1,0
...,...,...,...,...
3171,0,1,0,0
3172,0,0,0,1
3173,0,0,0,1
3174,0,0,0,1


4. JOB_DESCRIPTION

In [47]:
dataset['JOB_DESCRIPTION'].value_counts()

BPO                               720
Marketing                         599
Super Market Shelf Maintenance    506
Teacher                           500
Executive                         479
Security Guard                    372
Name: JOB_DESCRIPTION, dtype: int64

In [48]:
# JOB_DESCRIPTION has more than two categories, so keep one indicator column per category
# (drop_first is left at its default of False)
job_desc = pd.get_dummies(dataset['JOB_DESCRIPTION'])

In [49]:
job_desc

Unnamed: 0,BPO,Executive,Marketing,Security Guard,Super Market Shelf Maintenance,Teacher
0,0,0,1,0,0,0
1,0,0,1,0,0,0
2,0,0,1,0,0,0
3,0,0,1,0,0,0
4,0,0,1,0,0,0
...,...,...,...,...,...,...
3171,0,1,0,0,0,0
3172,0,1,0,0,0,0
3173,0,1,0,0,0,0
3174,0,1,0,0,0,0


In [50]:
# Give the job indicator columns short, prefixed names.
# An explicit category -> label mapping is used instead of assigning a list by position,
# so a label can never end up on the wrong column if the category order changes.
# Re-running the cell is harmless: already-renamed columns simply do not match any key.
JOB_LABELS = {
    'BPO': 'JOB_BPO',
    'Executive': 'JOB_EXEC',
    'Marketing': 'JOB_MRKT',
    'Security Guard': 'JOB_SECG',
    'Super Market Shelf Maintenance': 'JOB_SUPM',
    'Teacher': 'JOB_TCHR',
}
job_desc = job_desc.rename(columns=JOB_LABELS)

# fail loudly if a category is missing or an unexpected one has appeared
assert set(job_desc.columns) == set(JOB_LABELS.values()), (
    f"unexpected job description columns: {list(job_desc.columns)}"
)

In [51]:
job_desc

Unnamed: 0,JOB_BPO,JOB_EXEC,JOB_MRKT,JOB_SECG,JOB_SUPM,JOB_TCHR
0,0,0,1,0,0,0
1,0,0,1,0,0,0
2,0,0,1,0,0,0
3,0,0,1,0,0,0
4,0,0,1,0,0,0
...,...,...,...,...,...,...
3171,0,1,0,0,0,0
3172,0,1,0,0,0,0
3173,0,1,0,0,0,0
3174,0,1,0,0,0,0


5. J_LANGUAGE

In [53]:
dataset['J_LANGUAGE'].value_counts()

N    2577
Y     599
Name: J_LANGUAGE, dtype: int64

In [54]:
# Single 0/1 indicator: 1 where J_LANGUAGE == 'Y', 0 otherwise.
# The explicit comparison does not rely on which category get_dummies(drop_first=True)
# drops, and uint8 keeps a 0/1 integer column regardless of the pandas default dummy dtype.
# The column is still called 'Y' at this point, exactly as before.
lang = dataset['J_LANGUAGE'].eq('Y').astype('uint8').to_frame('Y')

In [55]:
lang

Unnamed: 0,Y
0,1
1,1
2,1
3,1
4,1
...,...
3171,0
3172,0
3173,0
3174,0


In [56]:
# label the single indicator column 'LANGUAGE'
lang = lang.set_axis(['LANGUAGE'], axis='columns')

In [57]:
lang

Unnamed: 0,LANGUAGE
0,1
1,1
2,1
3,1
4,1
...,...
3171,0
3172,0
3173,0
3174,0


6. J_COMMUNICATION

In [60]:
dataset['J_COMMUNICATION'].value_counts()

N    2577
Y     599
Name: J_COMMUNICATION, dtype: int64

In [61]:
# Single 0/1 indicator: 1 where J_COMMUNICATION == 'Y', 0 otherwise.
# The explicit comparison does not rely on which category get_dummies(drop_first=True)
# drops, and uint8 keeps a 0/1 integer column regardless of the pandas default dummy dtype.
# The column is still called 'Y' at this point, exactly as before.
comm = dataset['J_COMMUNICATION'].eq('Y').astype('uint8').to_frame('Y')

In [62]:
comm

Unnamed: 0,Y
0,1
1,1
2,1
3,1
4,1
...,...
3171,0
3172,0
3173,0
3174,0


In [63]:
# label the single indicator column 'COMMUNICATION'
comm = comm.set_axis(['COMMUNICATION'], axis='columns')

In [64]:
comm

Unnamed: 0,COMMUNICATION
0,1
1,1
2,1
3,1
4,1
...,...
3171,0
3172,0
3173,0
3174,0


7. J_PROB_SOLVE

In [66]:
dataset['J_PROB_SOLVE'].value_counts()

N    2197
Y     979
Name: J_PROB_SOLVE, dtype: int64

In [67]:
# Single 0/1 indicator: 1 where J_PROB_SOLVE == 'Y', 0 otherwise.
# The explicit comparison does not rely on which category get_dummies(drop_first=True)
# drops, and uint8 keeps a 0/1 integer column regardless of the pandas default dummy dtype.
# The column is still called 'Y' at this point, exactly as before.
prob = dataset['J_PROB_SOLVE'].eq('Y').astype('uint8').to_frame('Y')

In [68]:
prob

Unnamed: 0,Y
0,0
1,0
2,0
3,0
4,0
...,...
3171,1
3172,1
3173,1
3174,1


In [69]:
# label the single indicator column 'PROB_SOLVE'
prob = prob.set_axis(['PROB_SOLVE'], axis='columns')

In [70]:
prob

Unnamed: 0,PROB_SOLVE
0,0
1,0
2,0
3,0
4,0
...,...
3171,1
3172,1
3173,1
3174,1


8. J_ATTEN_VIGIL

In [72]:
dataset['J_ATTEN_VIGIL'].value_counts()

N    2084
Y    1092
Name: J_ATTEN_VIGIL, dtype: int64

In [73]:
# Single 0/1 indicator: 1 where J_ATTEN_VIGIL == 'Y', 0 otherwise.
# The explicit comparison does not rely on which category get_dummies(drop_first=True)
# drops, and uint8 keeps a 0/1 integer column regardless of the pandas default dummy dtype.
# The column is still called 'Y' at this point, exactly as before.
atten = dataset['J_ATTEN_VIGIL'].eq('Y').astype('uint8').to_frame('Y')

In [74]:
atten

Unnamed: 0,Y
0,0
1,0
2,0
3,0
4,0
...,...
3171,0
3172,0
3173,0
3174,0


In [75]:
# label the single indicator column 'ATTEN_VIGIL'
atten = atten.set_axis(['ATTEN_VIGIL'], axis='columns')

In [76]:
atten

Unnamed: 0,ATTEN_VIGIL
0,0
1,0
2,0
3,0
4,0
...,...
3171,0
3172,0
3173,0
3174,0


9. J_ADAPT_INTERACT

In [78]:
dataset['J_ADAPT_INTERACT'].value_counts()

N    2670
Y     506
Name: J_ADAPT_INTERACT, dtype: int64

In [79]:
# Single 0/1 indicator: 1 where J_ADAPT_INTERACT == 'Y', 0 otherwise.
# The explicit comparison does not rely on which category get_dummies(drop_first=True)
# drops, and uint8 keeps a 0/1 integer column regardless of the pandas default dummy dtype.
# The column is still called 'Y' at this point, exactly as before.
adapt = dataset['J_ADAPT_INTERACT'].eq('Y').astype('uint8').to_frame('Y')

In [80]:
# label the single indicator column 'ADAPT_INTERACT'
adapt = adapt.set_axis(['ADAPT_INTERACT'], axis='columns')

In [81]:
adapt

Unnamed: 0,ADAPT_INTERACT
0,0
1,0
2,0
3,0
4,0
...,...
3171,0
3172,0
3173,0
3174,0


In [82]:
# keep an independent (deep) snapshot of dataset before columns are dropped from it
dataset_copy = dataset.copy(deep=True)

In [83]:
# Drop the original object (string) columns: S_NAME is simply discarded, the others were
# encoded into separate indicator frames in the cells above.
# The result is rebound instead of using inplace=True, and errors="ignore" skips columns
# that are already gone, so re-running this cell no longer raises KeyError.
OBJECT_COLUMNS = [
    'JOB_MATCH',
    'S_NAME',
    'S_GENDER',
    'S_EDUCATION',
    'JOB_DESCRIPTION',
    'J_LANGUAGE',
    'J_COMMUNICATION',
    'J_PROB_SOLVE',
    'J_ATTEN_VIGIL',
    'J_ADAPT_INTERACT',
]
dataset = dataset.drop(columns=OBJECT_COLUMNS, errors="ignore")

In [84]:
dataset

Unnamed: 0,S_AGE,S_UNDERSTANDING_COMM,S_GETTING_ALONG_PEOPLE,S_LIFE_ACT_HOUSE,S_LIFE_ACT_WORK,S_PARTICIPATE_SOCIETY,S_PREV_EMP,S_TOTAL_EXPERIENCE
0,47.0,4.0,1.0,1.0,3.0,3.0,2.0,6.0
1,40.0,4.0,3.0,1.0,3.0,3.0,2.0,7.0
2,27.0,4.0,3.0,1.0,3.0,3.0,1.0,2.0
3,26.0,4.0,3.0,1.0,3.0,3.0,1.0,2.0
4,48.0,3.0,3.0,1.0,3.0,2.0,1.0,6.0
...,...,...,...,...,...,...,...,...
3171,24.0,2.0,1.0,2.0,1.0,1.0,4.0,2.0
3172,25.0,1.0,2.0,1.0,1.0,1.0,4.0,3.0
3173,24.0,1.0,2.0,1.0,1.0,1.0,4.0,3.0
3174,37.0,1.0,2.0,1.0,1.0,2.0,4.0,8.0


In [85]:
# append the newly derived indicator columns to the remaining numeric columns, side by side
derived_frames = [job_match, gender, education, job_desc, lang, comm, prob, atten, adapt]
dataset_new = pd.concat([dataset, *derived_frames], axis="columns")

In [86]:
dataset_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3176 entries, 0 to 3175
Data columns (total 25 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   S_AGE                   3176 non-null   float64
 1   S_UNDERSTANDING_COMM    3176 non-null   float64
 2   S_GETTING_ALONG_PEOPLE  3176 non-null   float64
 3   S_LIFE_ACT_HOUSE        3176 non-null   float64
 4   S_LIFE_ACT_WORK         3176 non-null   float64
 5   S_PARTICIPATE_SOCIETY   3176 non-null   float64
 6   S_PREV_EMP              3176 non-null   float64
 7   S_TOTAL_EXPERIENCE      3176 non-null   float64
 8   JOB_MATCH               3176 non-null   uint8  
 9   GENDER                  3176 non-null   uint8  
 10  EDU_HSE                 3176 non-null   uint8  
 11  EDU_PG                  3176 non-null   uint8  
 12  EDU_PRIM                3176 non-null   uint8  
 13  EDU_UG                  3176 non-null   uint8  
 14  JOB_BPO                 3176 non-null   

In [89]:
dataset_new.head()

Unnamed: 0,S_AGE,S_UNDERSTANDING_COMM,S_GETTING_ALONG_PEOPLE,S_LIFE_ACT_HOUSE,S_LIFE_ACT_WORK,S_PARTICIPATE_SOCIETY,S_PREV_EMP,S_TOTAL_EXPERIENCE,JOB_MATCH,GENDER,...,JOB_EXEC,JOB_MRKT,JOB_SECG,JOB_SUPM,JOB_TCHR,LANGUAGE,COMMUNICATION,PROB_SOLVE,ATTEN_VIGIL,ADAPT_INTERACT
0,47.0,4.0,1.0,1.0,3.0,3.0,2.0,6.0,0,0,...,0,1,0,0,0,1,1,0,0,0
1,40.0,4.0,3.0,1.0,3.0,3.0,2.0,7.0,0,0,...,0,1,0,0,0,1,1,0,0,0
2,27.0,4.0,3.0,1.0,3.0,3.0,1.0,2.0,0,0,...,0,1,0,0,0,1,1,0,0,0
3,26.0,4.0,3.0,1.0,3.0,3.0,1.0,2.0,0,0,...,0,1,0,0,0,1,1,0,0,0
4,48.0,3.0,3.0,1.0,3.0,2.0,1.0,6.0,0,0,...,0,1,0,0,0,1,1,0,0,0


In [90]:
# the combined frame looks good, so rebind `dataset` to it
# (plain assignment: `dataset` and `dataset_new` now refer to the same DataFrame object)
dataset = dataset_new

# MODELING

In [91]:
# separate the response column (JOB_MATCH) from the feature columns
data_y = dataset['JOB_MATCH']
data_X = dataset.drop(columns='JOB_MATCH')

In [92]:
data_X.head()

Unnamed: 0,S_AGE,S_UNDERSTANDING_COMM,S_GETTING_ALONG_PEOPLE,S_LIFE_ACT_HOUSE,S_LIFE_ACT_WORK,S_PARTICIPATE_SOCIETY,S_PREV_EMP,S_TOTAL_EXPERIENCE,GENDER,EDU_HSE,...,JOB_EXEC,JOB_MRKT,JOB_SECG,JOB_SUPM,JOB_TCHR,LANGUAGE,COMMUNICATION,PROB_SOLVE,ATTEN_VIGIL,ADAPT_INTERACT
0,47.0,4.0,1.0,1.0,3.0,3.0,2.0,6.0,0,0,...,0,1,0,0,0,1,1,0,0,0
1,40.0,4.0,3.0,1.0,3.0,3.0,2.0,7.0,0,0,...,0,1,0,0,0,1,1,0,0,0
2,27.0,4.0,3.0,1.0,3.0,3.0,1.0,2.0,0,0,...,0,1,0,0,0,1,1,0,0,0
3,26.0,4.0,3.0,1.0,3.0,3.0,1.0,2.0,0,0,...,0,1,0,0,0,1,1,0,0,0
4,48.0,3.0,3.0,1.0,3.0,2.0,1.0,6.0,0,0,...,0,1,0,0,0,1,1,0,0,0


In [93]:
data_y.head()

0    0
1    0
2    0
3    0
4    0
Name: JOB_MATCH, dtype: uint8

## Splitting the dataset into training and testing sets

In [94]:
# hold out 20% of the rows for testing; random_state=42 makes the shuffle repeatable
# NOTE(review): rows 2114 / 2608 / 3102 displayed earlier differ only in S_NAME. If such
# near-duplicates are common, a purely random split can place copies of the same record in
# both the training and the test set — TODO confirm and consider de-duplicating first.
from sklearn.model_selection import train_test_split
split_parts = train_test_split(data_X, data_y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

## Training and Predicting

1. LogisticRegression

In [98]:
from sklearn.linear_model import LogisticRegression

In [99]:
# fit a logistic-regression classifier with default settings on the training split
model_lr = LogisticRegression().fit(X_train, y_train)
model_lr  # fit() returns the estimator itself, so this shows the same cell output as before

LogisticRegression()

In [101]:
# predict the JOB_MATCH label for every row of the held-out test features
predictions = model_lr.predict(X=X_test)

In [102]:
# evaluate the model
from sklearn.metrics import confusion_matrix

In [103]:
# confusion matrix of the true test labels against the model's predictions
matrix = confusion_matrix(y_true=y_test, y_pred=predictions)

In [104]:
matrix

array([[289,  36],
       [ 25, 286]])

In [105]:
# calculate accuracy score
from sklearn.metrics import accuracy_score

In [106]:
# fraction of test rows whose predicted label equals the true label
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)

In [107]:
accuracy

0.9040880503144654

In [108]:
# calcuating additional evaluation parameters
from sklearn.metrics import classification_report

In [110]:
# per-class precision / recall / f1 summary; print() is kept because the report is a
# pre-formatted multi-line string
print(classification_report(y_true=y_test, y_pred=predictions))

precision    recall  f1-score   support

           0       0.92      0.89      0.90       325
           1       0.89      0.92      0.90       311

    accuracy                           0.90       636
   macro avg       0.90      0.90      0.90       636
weighted avg       0.90      0.90      0.90       636

