In [624]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, MinMaxScaler
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectKBest,chi2
from sklearn import set_config

In [625]:
set_config(display='diagram')

In [626]:
df=pd.read_csv("./datasets/data_science_job.csv")

In [627]:
df.sample(10)

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,training_hours,target
5984,11417,city_21,0.624,,No relevent experience,Full time course,Masters,STEM,0.0,100-500,Pvt Ltd,20.0,0.0
4688,28417,city_103,0.92,Female,Has relevent experience,Full time course,Graduate,STEM,7.0,50-99,NGO,157.0,1.0
16633,238,city_16,0.91,,Has relevent experience,no_enrollment,Graduate,STEM,6.0,100-500,Pvt Ltd,31.0,0.0
11906,3033,city_67,0.855,,Has relevent experience,Full time course,Masters,STEM,12.0,,,91.0,0.0
5491,3157,city_103,0.92,Male,Has relevent experience,no_enrollment,Masters,Humanities,5.0,50-99,Pvt Ltd,198.0,0.0
17894,8741,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,20.0,50-99,Pvt Ltd,19.0,0.0
12691,29684,city_99,0.915,,Has relevent experience,no_enrollment,Graduate,STEM,2.0,5000-9999,Pvt Ltd,30.0,0.0
11543,31324,city_128,0.527,,Has relevent experience,no_enrollment,Graduate,STEM,7.0,100-500,Pvt Ltd,63.0,0.0
8976,28908,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,16.0,50-99,Pvt Ltd,250.0,0.0
6143,5618,city_11,0.55,Male,Has relevent experience,Full time course,Graduate,STEM,4.0,,,51.0,0.0


In [628]:
df = df.drop('city', axis=1)


In [629]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19158 entries, 0 to 19157
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   enrollee_id             19158 non-null  int64  
 1   city_development_index  18679 non-null  float64
 2   gender                  14650 non-null  object 
 3   relevent_experience     19158 non-null  object 
 4   enrolled_university     18772 non-null  object 
 5   education_level         18698 non-null  object 
 6   major_discipline        16345 non-null  object 
 7   experience              19093 non-null  float64
 8   company_size            13220 non-null  object 
 9   company_type            13018 non-null  object 
 10  training_hours          18392 non-null  float64
 11  target                  19158 non-null  float64
dtypes: float64(4), int64(1), object(7)
memory usage: 1.8+ MB


In [630]:
df["gender"]

0        Male
1        Male
2         NaN
3         NaN
4        Male
         ... 
19153    Male
19154    Male
19155    Male
19156    Male
19157     NaN
Name: gender, Length: 19158, dtype: object

In [631]:
col=[]
for c in df.columns:
    if df[c].isnull().mean()<0.05 and df[c].isnull().mean()>0:
        col.append(c)

In [632]:
col

['city_development_index',
 'enrolled_university',
 'education_level',
 'experience',
 'training_hours']

In [633]:
new_df=df.dropna(subset=col)


In [634]:
df.shape,new_df.shape

((19158, 12), (17182, 12))

In [635]:
new_df.head()

Unnamed: 0,enrollee_id,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,training_hours,target
0,8949,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,20.0,,,36.0,1.0
1,29725,0.776,Male,No relevent experience,no_enrollment,Graduate,STEM,15.0,50-99,Pvt Ltd,47.0,0.0
2,11561,0.624,,No relevent experience,Full time course,Graduate,STEM,5.0,,,83.0,0.0
4,666,0.767,Male,Has relevent experience,no_enrollment,Masters,STEM,20.0,50-99,Funded Startup,8.0,0.0
5,21651,0.764,,Has relevent experience,Part time course,Graduate,STEM,11.0,,,24.0,1.0


# Data analysis in case of CCAR

In [636]:
new_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17182 entries, 0 to 19157
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   enrollee_id             17182 non-null  int64  
 1   city_development_index  17182 non-null  float64
 2   gender                  13309 non-null  object 
 3   relevent_experience     17182 non-null  object 
 4   enrolled_university     17182 non-null  object 
 5   education_level         17182 non-null  object 
 6   major_discipline        15029 non-null  object 
 7   experience              17182 non-null  float64
 8   company_size            12064 non-null  object 
 9   company_type            11890 non-null  object 
 10  training_hours          17182 non-null  float64
 11  target                  17182 non-null  float64
dtypes: float64(4), int64(1), object(7)
memory usage: 1.7+ MB


In [637]:
new_df.isnull().sum()
# city ,relevent_experience,enrolled_university,education_level

enrollee_id                  0
city_development_index       0
gender                    3873
relevent_experience          0
enrolled_university          0
education_level              0
major_discipline          2153
experience                   0
company_size              5118
company_type              5292
training_hours               0
target                       0
dtype: int64

In [638]:
df["relevent_experience"].value_counts(),df["relevent_experience"].nunique()
# ["No relevent experience","Has relevent experience"],4

(Has relevent experience    13792
 No relevent experience      5366
 Name: relevent_experience, dtype: int64,
 2)

In [639]:
df["enrolled_university"].value_counts(),df["enrolled_university"].nunique()
# ["no_enrollment,Part time course","Full time course"],5

(no_enrollment       13817
 Full time course     3757
 Part time course     1198
 Name: enrolled_university, dtype: int64,
 3)

In [640]:
df["education_level"].value_counts(),df["education_level"].nunique()
# ["Primary School","High School","Graduate","Masters","Phd"], 6

(Graduate          11598
 Masters            4361
 High School        2017
 Phd                 414
 Primary School      308
 Name: education_level, dtype: int64,
 5)

"gender"  --> one_hot_coding

"major_discipline"  -->ordinal encoding

"company_size"   -->  ordinal encoding

"company_type"  ---> ordinal_encoding

"relevent_experience" -->ordinal_encoding

"enrolled_university"  -->ordinal encoding
"education_level"      --> ordinal encoding



In [641]:
new_df["gender"].value_counts(), df["gender"].nunique()

(Male      12006
 Female     1142
 Other       161
 Name: gender, dtype: int64,
 3)

In [642]:
new_df["major_discipline"].value_counts(),df["major_discipline"].nunique()
# "STEM","Humanities","Business Degree","Arts","Other","No Major"

(STEM               13314
 Humanities           617
 Other                347
 Business Degree      308
 Arts                 234
 No Major             209
 Name: major_discipline, dtype: int64,
 6)

In [643]:
new_df["company_size"].value_counts(), df["company_size"].nunique()
# "<10","10/49","50-99","100-500","500-999","1000-4999","5000-9999","10000+"

(50-99        2802
 100-500      2330
 10000+       1863
 10/49        1342
 1000-4999    1215
 <10          1185
 500-999       807
 5000-9999     520
 Name: company_size, dtype: int64,
 8)

In [644]:
new_df["company_type"].value_counts(), df["company_type"].nunique()
# "Pvt Ltd","Public Sector","Funded Startup","Early Stage Startup","NGO","Other"

(Pvt Ltd                8970
 Funded Startup          922
 Public Sector           867
 Early Stage Startup     552
 NGO                     470
 Other                   109
 Name: company_type, dtype: int64,
 6)

# Train with Complete Case At Random(CCAR)

In [645]:
X_train,X_test,y_train,y_test=train_test_split(new_df.iloc[:,:11],new_df.iloc[:,11:12])

In [646]:
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((12886, 11), (4296, 11), (12886, 1), (4296, 1))

In [647]:
X_train.columns

Index(['enrollee_id', 'city_development_index', 'gender',
       'relevent_experience', 'enrolled_university', 'education_level',
       'major_discipline', 'experience', 'company_size', 'company_type',
       'training_hours'],
      dtype='object')

In [648]:
X_train.isnull().sum()

enrollee_id                  0
city_development_index       0
gender                    2935
relevent_experience          0
enrolled_university          0
education_level              0
major_discipline          1603
experience                   0
company_size              3918
company_type              4040
training_hours               0
dtype: int64

In [649]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12886 entries, 10387 to 2771
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   enrollee_id             12886 non-null  int64  
 1   city_development_index  12886 non-null  float64
 2   gender                  9951 non-null   object 
 3   relevent_experience     12886 non-null  object 
 4   enrolled_university     12886 non-null  object 
 5   education_level         12886 non-null  object 
 6   major_discipline        11283 non-null  object 
 7   experience              12886 non-null  float64
 8   company_size            8968 non-null   object 
 9   company_type            8846 non-null   object 
 10  training_hours          12886 non-null  float64
dtypes: float64(3), int64(1), object(7)
memory usage: 1.2+ MB


In [650]:
X_train["enrollee_id"].nunique()

12886

In [651]:
X_train.describe()

Unnamed: 0,enrollee_id,city_development_index,experience,training_hours
count,12886.0,12886.0,12886.0,12886.0
mean,16864.485876,0.831322,10.060764,65.174841
std,9627.871573,0.122056,6.491004,59.747254
min,1.0,0.448,0.0,1.0
25%,8536.25,0.743,5.0,23.0
50%,16973.5,0.91,9.0,47.0
75%,25188.5,0.92,16.0,89.0
max,33380.0,0.949,20.0,336.0


In [652]:
y_train.isnull().sum()

target    0
dtype: int64

In [653]:
trf1=ColumnTransformer(transformers=[
    ('impute',SimpleImputer(strategy="most_frequent"),[2,6,8,9])
]
,remainder="passthrough")

In [654]:
trf2=ColumnTransformer(transformers=[
    ('gender',OneHotEncoder(sparse=False,handle_unknown="ignore"),[2]),
#     ('major_discipline_company_size_company_type', OrdinalEncoder(categories=[["STEM","Humanities","Business Degree","Arts","Other","No Major"],
#                                                                               ["<10","10/49","50-99","100-500","500-999","1000-4999","5000-9999","10000+"],
#                                                                               ["Pvt Ltd","Public Sector","Funded Startup","Early Stage Startup","NGO","Other"],
#                                                                               ["No relevent experience","Has relevent experience"],
#                                                                               ["no_enrollment","Part time course","Full time course"],
#                                                                               ["Primary School","High School","Graduate","Masters","Phd"]],handle_unknown="use_encoded_value",unknown_value=-1)
#                                                                                 ,[6,8,9,3,4,5])
    ('major_discipline', OrdinalEncoder(categories=[["STEM","Humanities","Business Degree","Arts","Other","No Major"]],handle_unknown='use_encoded_value', unknown_value=-1),[6]),
    ('company_size', OrdinalEncoder(categories=[["<10","10/49","50-99","100-500","500-999","1000-4999","5000-9999","10000+"]],handle_unknown='use_encoded_value', unknown_value=-1),[8]),
    ('company_type', OrdinalEncoder(categories=[['Pvt Ltd','Public Sector','Funded Startup','Early Stage Startup','NGO","Other']],handle_unknown='use_encoded_value', unknown_value=-1),[9]),
    ('relevent_experience',OrdinalEncoder(categories=[["No relevent experience","Has relevent experience"]],handle_unknown='use_encoded_value', unknown_value=-1),[3]),
    ('enrolled_university', OrdinalEncoder(categories=[["no_enrollment","Part time course","Full time course"]],handle_unknown='use_encoded_value', unknown_value=-1),[4]),
    ('education_level', OrdinalEncoder(categories=[["Primary School","High School","Graduate","Masters","Phd"]],handle_unknown='use_encoded_value', unknown_value=-1),[5])
],

remainder="passthrough")

In [655]:
# trf2=ColumnTransformer(transformers=[
#     ('gender',OneHotEncoder(sparse=False,handle_unknown="ignore"),[2]),
#     ('major_discipline_company_size_company_type', OrdinalEncoder(handle_unknown="use_encoded_value",unknown_value=-1)
#                                                                                 ,[6,8,9,3,4,5])
# #     ('major_discipline', OrdinalEncoder(categories=[["STEM","Humanities","Business Degree","Arts","Other","No Major"]],handle_unknown='use_encoded_value', unknown_value=-1),[6]),
# #     ('company_size', OrdinalEncoder(categories=[["<10","10/49","50-99","100-500","500-999","1000-4999","5000-9999","10000+"]],handle_unknown='use_encoded_value', unknown_value=-1),[8]),
# #     ('company_type', OrdinalEncoder(categories=[['Pvt Ltd','Public Sector','Funded Startup','Early Stage Startup','NGO","Other']],handle_unknown='use_encoded_value', unknown_value=-1),[9]),
# #     ('relevent_experience',OrdinalEncoder(categories=[["No relevent experience","Has relevent experience"]],handle_unknown='use_encoded_value', unknown_value=-1),[3]),
# #     ('enrolled_university', OrdinalEncoder(categories=[["no_enrollment","Part time course","Full time course"]],handle_unknown='use_encoded_value', unknown_value=-1),[4]),
# #     ('education_level', OrdinalEncoder(categories=[["Primary School","High School","Graduate","Masters","Phd"]],handle_unknown='use_encoded_value', unknown_value=-1),[5])
# ],

# remainder="passthrough")

In [656]:
trf3=ColumnTransformer(transformers=[
    ('scale',MinMaxScaler(),slice(0,13))
])

In [657]:
trf4=SelectKBest(score_func=chi2,k=5)

In [658]:
trf5=DecisionTreeClassifier()

In [659]:
pipe=Pipeline([('trf1',trf1),('trf2',trf2),('trf3',trf3),('trf4',trf4),('trf5',trf5)])

In [660]:
pipe.fit(X_train,y_train)

In [661]:
y_pred=pipe.predict(X_test)

In [662]:
accuracy_score(y_pred,y_test)

0.7553538175046555

# Cross_validation_score

In [663]:
cross_val_score(pipe,X_train,y_train,cv=4,scoring="accuracy").mean()

0.7505820450870307

# GridSearch using pipeline

In [664]:
params={
    'trf5__max_depth':[1,2,3,4,5,None]
}

In [665]:
from sklearn.model_selection import GridSearchCV
grid=GridSearchCV(pipe,params,cv=5,scoring='accuracy')
grid.fit(X_train,y_train)

In [666]:
grid.best_score_

0.7505820270200705

In [667]:
grid.best_params_

{'trf5__max_depth': 1}