In [81]:
# import the modules

import pandas as pd
import numpy as np
import os 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import r2_score, roc_auc_score, accuracy_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler


### Load the dataset

- Load the train data and, using what you have learned so far, explore the statistical properties of the dataset.

In [82]:
# Code starts here
# Read the training set from the working directory and print its (rows, columns) shape.
train = pd.read_csv('train.csv')
print(train.shape)
#train.head()

(1281, 36)


In [83]:
# Count distinct values per column; a column with a single distinct value is constant.
train.nunique()

Id                          1281
Age                           43
BusinessTravel                 3
DailyRate                    783
Department                     3
DistanceFromHome              29
Education                      5
EducationField                 6
EmployeeCount                  1
EmployeeNumber              1186
EnvironmentSatisfaction        4
Gender                         2
HourlyRate                    71
JobInvolvement                 4
JobLevel                       5
JobRole                        9
JobSatisfaction                4
MaritalStatus                  3
MonthlyIncome               1115
MonthlyRate                 1154
NumCompaniesWorked            10
Over18                         1
OverTime                       2
PercentSalaryHike             15
PerformanceRating              2
RelationshipSatisfaction       4
StandardHours                  1
StockOptionLevel               4
TotalWorkingYears             39
TrainingTimesLastYear          7
WorkLifeBa

In [84]:
# Drop the row identifier and the three columns that nunique() reported as constant.
# `columns=` is used because pandas 2.0 no longer accepts `axis` as a positional argument.
train = train.drop(columns=['Id', 'StandardHours', 'Over18', 'EmployeeCount'])
train.head()

Unnamed: 0,Age,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeNumber,EnvironmentSatisfaction,Gender,...,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Attrition
0,38,Travel_Rarely,330,Research & Development,17,1,Life Sciences,1088,3,Female,...,1,1,20,4,2,19,9,1,9,No
1,36,Non-Travel,1351,Research & Development,9,4,Life Sciences,1949,1,Male,...,2,0,5,3,3,5,4,0,2,No
2,29,Travel_Rarely,1328,Research & Development,2,3,Life Sciences,94,3,Male,...,4,1,6,3,3,5,4,0,4,No
3,47,Travel_Rarely,1180,Research & Development,25,3,Medical,1993,1,Male,...,2,0,25,3,3,17,14,12,11,No
4,54,Travel_Rarely,584,Research & Development,22,5,Medical,1665,2,Female,...,3,1,36,6,3,10,8,4,7,No


In [85]:
# To Separate Categorical and numerical columns
def num_and_cat_columns(df):
    """Split the columns of ``df`` into numeric and categorical name lists.

    A column counts as numeric when its dtype is float64, float32, int32 or
    int64, and as categorical when its dtype is object. Columns of any other
    dtype appear in neither list.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are classified.

    Returns
    -------
    tuple of (list, list)
        ``(numeric_var, cat_var)`` with column labels in their original order.
    """
    numeric_dtypes = ['float64', 'float32', 'int32', 'int64']
    categorical_dtypes = ['object']

    # Build the column -> dtype mapping once instead of once per column.
    dtype_by_column = dict(df.dtypes)

    numeric_var = [col for col, dtype in dtype_by_column.items()
                   if dtype in numeric_dtypes]  # Numeric variables

    cat_var = [col for col, dtype in dtype_by_column.items()
               if dtype in categorical_dtypes]  # Categorical variables

    return numeric_var, cat_var

In [86]:
# Call the num_and_cat_columns() with train as the parameter and store the results.
num_cols,cat_cols= num_and_cat_columns(train)
# Print both lists so the columns of each kind can be read off the output.
print("Numerical Columns-",num_cols)
print("Categorical Columns-",cat_cols)

Numerical Columns- ['Age', 'DailyRate', 'DistanceFromHome', 'Education', 'EmployeeNumber', 'EnvironmentSatisfaction', 'HourlyRate', 'JobInvolvement', 'JobLevel', 'JobSatisfaction', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction', 'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager']
Categorical Columns- ['BusinessTravel', 'Department', 'EducationField', 'Gender', 'JobRole', 'MaritalStatus', 'OverTime', 'Attrition']


In [87]:
# One-hot encode each categorical feature in turn, prepending its indicator
# columns to the frame. The list order fixes the final column order.
# NOTE(review): the indicator columns are named by the bare category value (no
# prefix), so a label shared by two features would yield duplicate column names.
for feature in ['BusinessTravel', 'Department', 'EducationField', 'Gender',
                'JobRole', 'MaritalStatus', 'OverTime']:
    indicators = pd.get_dummies(train[feature])
    train = pd.concat([indicators, train], axis=1)

In [88]:
# The source categorical columns are now represented by their indicator columns, so remove them.
# `columns=` is used because pandas 2.0 no longer accepts `axis` as a positional argument.
train = train.drop(columns=['BusinessTravel', 'Department', 'EducationField', 'Gender',
                            'JobRole', 'MaritalStatus', 'OverTime'])
train.shape

(1281, 53)

### Visualize the data

- Check for the categorical & continuous features. 
- Choose suitable plots for comparing the categorical target with the continuous features, and try to draw some inferences from them.
- Clean the data, apply some data preprocessing and engineering techniques.

In [89]:
# Code starts here






# Code ends here

### Model building

- Now for the actual task: using a Decision Tree or an ensemble technique, predict `Attrition`. Use the different techniques you have learned to improve the performance of the model.
- Try improving upon the [ROC-AUC Score](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_auc_score.html)

In [90]:
# Code Starts here
# Separate the feature matrix from the target column.
# `columns=` is used because pandas 2.0 no longer accepts `axis` as a positional argument.
X = train.drop(columns='Attrition')
y = train['Attrition']

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state = 0, test_size = 0.3)

# Fit the scaler on the training split only, then apply the same scaling to the hold-out split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

clf = RandomForestClassifier(min_samples_split = 12,random_state = 0, criterion = 'entropy')

clf.fit(X_train, y_train)
predicted = clf.predict(X_test)
# Mean accuracy on the training split, then on the hold-out split.
print(clf.score(X_train, y_train))
print(clf.score(X_test, y_test))

# Code ends here

0.9575892857142857
0.8597402597402597


In [91]:
# Count how many hold-out rows were predicted for each class.
pd.DataFrame(predicted)[0].value_counts()

No     336
Yes     49
Name: 0, dtype: int64

In [92]:
# Accuracy of the stored hold-out predictions against the true labels.
accuracy_score(y_test, predicted)

0.8597402597402597

### Prediction on the test data and creating the sample submission file.

- Load the test data and store the `Id` column in a separate variable.
- Perform the same operations on the test data that you have performed on the train data.
- Create the submission file as a `csv` file consisting of the `Id` column from the test data and your prediction as the second column.

In [93]:
test = pd.read_csv('test.csv')
# Keep the ids aside for the submission file before the column is dropped.
Id = test['Id'].copy()
# Drop the same columns that were removed from the training data.
# `columns=` is used because pandas 2.0 no longer accepts `axis` as a positional argument.
test = test.drop(columns=['Id', 'StandardHours', 'Over18', 'EmployeeCount'])

In [94]:
# One-hot encode the test features in the same order as the training data so the
# resulting column layout lines up with what the scaler and model were fitted on.
# NOTE(review): this assumes every category seen in train also occurs in test;
# a missing category would produce fewer columns — confirm.
for feature in ['BusinessTravel', 'Department', 'EducationField', 'Gender',
                'JobRole', 'MaritalStatus', 'OverTime']:
    indicators = pd.get_dummies(test[feature])
    test = pd.concat([indicators, test], axis=1)

In [95]:
# The source categorical columns are now represented by their indicator columns, so remove them.
# `columns=` is used because pandas 2.0 no longer accepts `axis` as a positional argument.
test = test.drop(columns=['BusinessTravel', 'Department', 'EducationField', 'Gender',
                          'JobRole', 'MaritalStatus', 'OverTime'])
test.shape

(321, 52)

In [96]:
# Reuse the scaler and classifier fitted on the training data; nothing is refitted on test.
# NOTE(review): `test` is rebound to the scaler's output here, so re-running this cell
# would apply the scaling a second time.
test = scaler.transform(test)
pred = clf.predict(test)

In [97]:
# Pair each saved test Id with its predicted Attrition label.
submission = pd.DataFrame({'Id': Id, 'Attrition': pred})
submission

Unnamed: 0,Id,Attrition
0,963,No
1,1575,Yes
2,504,No
3,288,No
4,375,No
...,...,...
316,229,Yes
317,257,No
318,11,No
319,232,No


In [98]:
# Write the submission without the DataFrame index so the file has only the Id and Attrition columns.
submission.to_csv('first_submission.csv', index = False)