In [1]:
import pandas as pd

In [2]:
data = pd.read_csv('aug_train.csv')

In [3]:
data.head()

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,8949,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,,,1,36,1.0
1,29725,city_40,0.776,Male,No relevent experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47,0.0
2,11561,city_21,0.624,,No relevent experience,Full time course,Graduate,STEM,5,,,never,83,0.0
3,33241,city_115,0.789,,No relevent experience,,Graduate,Business Degree,<1,,Pvt Ltd,never,52,1.0
4,666,city_162,0.767,Male,Has relevent experience,no_enrollment,Masters,STEM,>20,50-99,Funded Startup,4,8,0.0


In [4]:

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19158 entries, 0 to 19157
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   enrollee_id             19158 non-null  int64  
 1   city                    19158 non-null  object 
 2   city_development_index  19158 non-null  float64
 3   gender                  14650 non-null  object 
 4   relevent_experience     19158 non-null  object 
 5   enrolled_university     18772 non-null  object 
 6   education_level         18698 non-null  object 
 7   major_discipline        16345 non-null  object 
 8   experience              19093 non-null  object 
 9   company_size            13220 non-null  object 
 10  company_type            13018 non-null  object 
 11  last_new_job            18735 non-null  object 
 12  training_hours          19158 non-null  int64  
 13  target                  19158 non-null  float64
dtypes: float64(2), int64(2), object(10)
me

In [5]:
data.isna().any()

enrollee_id               False
city                      False
city_development_index    False
gender                     True
relevent_experience       False
enrolled_university        True
education_level            True
major_discipline           True
experience                 True
company_size               True
company_type               True
last_new_job               True
training_hours            False
target                    False
dtype: bool

In [6]:
data.shape

(19158, 14)

## Dealing with missing data..

In [7]:
# Imputing gender with Mode of data..
genderMode = data['gender'].mode()
data.gender.fillna(genderMode[0], inplace = True)
data.gender.isna().any()

False

In [8]:
# Imputing enrolled University with mode as it is also a catagorical variable...
universityStatus = data.enrolled_university.mode()
data['enrolled_university'].fillna(universityStatus[0], inplace = True)
data.enrolled_university.isna().any()


False

In [9]:
educationLevel = data.education_level.mode()
data.education_level.fillna(educationLevel[0], inplace = True)
data.education_level.isna().any()

False

In [10]:
major = data.major_discipline.mode()
data.major_discipline.fillna(major[0], inplace = True)

In [11]:
# Imputing experience with Median as it is a discrete variable...
data['experience'].replace({'>20':20, '<1':0}, inplace = True)
expMedian = data.experience.median()
expMedian
data['experience'].fillna(expMedian, inplace = True)

In [12]:
sizeMode = data.company_size.mode()
data['company_size'].fillna(sizeMode[0], inplace=True)

In [13]:
typeMode = data.company_type.mode()
data['company_type'].fillna(typeMode[0], inplace = True)

In [14]:
data.last_new_job.unique()
data.replace({'>4':5, 'never':0}, inplace = True)
lastNewMedian = data.last_new_job.median()
data.last_new_job.fillna(lastNewMedian, inplace = True)

In [15]:
data.isna().any()

enrollee_id               False
city                      False
city_development_index    False
gender                    False
relevent_experience       False
enrolled_university       False
education_level           False
major_discipline          False
experience                False
company_size              False
company_type              False
last_new_job              False
training_hours            False
target                    False
dtype: bool

In [16]:
data.dtypes

enrollee_id                 int64
city                       object
city_development_index    float64
gender                     object
relevent_experience        object
enrolled_university        object
education_level            object
major_discipline           object
experience                 object
company_size               object
company_type               object
last_new_job               object
training_hours              int64
target                    float64
dtype: object

In [17]:
data.city = data.city.map(lambda city: city.strip('city_'))
data.city = data.city.astype(int)

In [18]:
from sklearn.preprocessing import LabelEncoder

In [19]:
# Label Encoding Gender Column F = 0, M = 1, Other = 2
encoder = LabelEncoder()
data['gender'] = encoder.fit_transform(data['gender'])
data.gender.unique()

array([1, 0, 2])

In [20]:
# Has Relevent Experience = 0, not having Relevant Experience = 1
data['relevent_experience'] = encoder.fit_transform(data['relevent_experience'])


In [21]:
# 'no_enrollment' = 2, 'Full time course' =0, 'Part time course = 1'
data['enrolled_university'] =encoder.fit_transform(data['enrolled_university'])

In [22]:
# 'Graduate' = 0, 'Masters' = 2, 'High School' = 1, 'Phd' = 3, 'Primary School' = 4
data['education_level'] = encoder.fit_transform(data['education_level'])
data['education_level'].unique()

array([0, 2, 1, 3, 4])

In [23]:
# 'STEM' = 5, 'Business Degree' = 1, 'Arts' = 0, 'Humanities' = 2, 'No Major' = 3, 'Other' = 4
data['major_discipline'] = encoder.fit_transform(data['major_discipline'])
data['major_discipline'].unique()

array([5, 1, 0, 2, 3, 4])

In [24]:
data['experience']=data['experience'].astype(int)

In [25]:
# '50-99'=4 '<10'=7 '10000+'=3 '5000-9999'=6 '1000-4999'=2 '10/49'=0 '100-500'=1, '500-999'=5
data['company_size'] = encoder.fit_transform(data['company_size'])
data['company_size'].unique()

array([4, 7, 3, 6, 2, 0, 1, 5])

In [26]:
data['company_type'] = encoder.fit_transform(data['company_type'])

In [27]:
data['last_new_job'] = data['last_new_job'].astype(int)

In [28]:
data['target'] = data['target'].astype(int)

In [29]:
data.dtypes

enrollee_id                 int64
city                        int64
city_development_index    float64
gender                      int64
relevent_experience         int64
enrolled_university         int64
education_level             int64
major_discipline            int64
experience                  int64
company_size                int64
company_type                int64
last_new_job                int64
training_hours              int64
target                      int64
dtype: object

# Creating the machine Learning model..

In [30]:
features = data.drop('target', axis = 1).values
target = data['target'].values

In [31]:
from sklearn.preprocessing import OneHotEncoder
oneHot = OneHotEncoder(sparse=False)
oneHotData = oneHot.fit_transform(features)

In [32]:
from sklearn.model_selection import train_test_split
xTrain,  xTest, yTrain, yTest = train_test_split(oneHotData,target)

In [33]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(fit_intercept=True, max_iter=500)
model.fit(xTrain,yTrain)

LogisticRegression(max_iter=500)

In [34]:
predictions = model.predict(xTest)

In [35]:
from sklearn.metrics import confusion_matrix
confusion_matrix(predictions,yTest)

array([[3324,  786],
       [ 297,  383]])

In [36]:
from sklearn.metrics import accuracy_score
accuracy_score(predictions, yTest)

In [None]:
from sklearn.naive_bayes import MultinomialNB
nbModel = MultinomialNB()
nbModel.fit(xTrain,yTrain)
predictions = nbModel.predict(xTest)
accuracy_score(predictions, yTest)

0.7594989561586639

In [None]:
from sklearn.tree import DecisionTreeClassifier
treeModel = DecisionTreeClassifier()
treeModel.fit(xTrain,yTrain)
predictions = treeModel.predict(xTest)
accuracy_score(predictions,yTest)

0.7398747390396659

In [None]:
print(model.score(xTrain,yTrain))
print(model.score(xTest,yTest))

0.8537722717149221
0.7638830897703549


In [None]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=6)
knn.fit(xTrain,yTrain)
predictions = knn.predict(xTest)
print(knn.score(xTest,yTest))
print(knn.score(xTest,yTest))

0.7536534446764092
0.7536534446764092


In [None]:
treeModel.fit(xTrain, yTrain)
predictions = treeModel.predict(xTest)
print(accuracy_score(predictions,yTest))
print(treeModel.score(xTrain,yTrain))
print(treeModel.score(xTest,yTest))

0.7438413361169103
1.0
0.7438413361169103


In [37]:
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(n_jobs=1,)
forest.fit(xTrain,yTrain)
predictions = forest.predict(xTest)
print(accuracy_score(predictions, yTest))
print(forest.score(xTrain,yTrain))
print(forest.score(xTest,yTest))

In [38]:

n_estimators = [500,800,1000,1500,2000]
max_depth = [10,20,30,40,50,60,70,80]
max_depth.append(None)
min_samples_split = [2,5,10,15,20]
min_samples_leaf = [1,2,5,10,15,20]

In [39]:
grid_params = {'n_estimators':n_estimators,
'max_depth':max_depth,
'min_samples_leaf':min_samples_leaf,
'min_samples_split':min_samples_split,}


In [None]:
from sklearn.model_selection import RandomizedSearchCV
search = RandomizedSearchCV(estimator=forest, param_distributions=grid_params, n_iter=500,cv=5, verbose=2, n_jobs=1)
search.fit(xTrain,yTrain)

[CV]  n_estimators=2000, min_samples_split=20, min_samples_leaf=10, max_depth=60, total= 4.8min
[CV] n_estimators=2000, min_samples_split=20, min_samples_leaf=10, max_depth=60 
[CV]  n_estimators=2000, min_samples_split=20, min_samples_leaf=10, max_depth=60, total= 4.6min
[CV] n_estimators=2000, min_samples_split=15, min_samples_leaf=2, max_depth=10 
[CV]  n_estimators=2000, min_samples_split=15, min_samples_leaf=2, max_depth=10, total= 6.2min
[CV] n_estimators=2000, min_samples_split=15, min_samples_leaf=2, max_depth=10 
[CV]  n_estimators=2000, min_samples_split=15, min_samples_leaf=2, max_depth=10, total= 6.3min
[CV] n_estimators=2000, min_samples_split=15, min_samples_leaf=2, max_depth=10 
[CV]  n_estimators=2000, min_samples_split=15, min_samples_leaf=2, max_depth=10, total= 6.3min
[CV] n_estimators=2000, min_samples_split=15, min_samples_leaf=2, max_depth=10 
[CV]  n_estimators=2000, min_samples_split=15, min_samples_leaf=2, max_depth=10, total= 5.4min
[CV] n_estimators=2000, min

In [None]:
print(search.best_estimator_)
print(search.best_params_)
print(search.best_score_)
print(search.cv_results_())
print(search.decision_function)
print(search.decision_function)