In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
import os

In [2]:
from sklearn.linear_model import LogisticRegression 
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn import model_selection, tree, preprocessing, metrics, linear_model
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier

In [3]:
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [4]:
from sklearn.model_selection import GridSearchCV

### Importing datasets

In [5]:
# Load the training features, training labels and test features from CSV files
# located relative to the current working directory.
x_train = pd.read_csv('x_train.csv')
y_train = pd.read_csv('y_train.csv')
x_test = pd.read_csv('x_test.csv')

In [23]:
# Preview the first rows of the training features.
x_train.head()

Unnamed: 0,Age,DistanceFromHome,Education,EnvironmentSatisfaction,JobInvolvement,JobSatisfaction,MonthlyIncome,NumCompaniesWorked,PercentSalaryHike,PerformanceRating,...,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single,OverTime_No,OverTime_Yes
0,30,2,3,3,3,4,2564,0,14,3,...,0,0,0,0,0,0,0,1,1,0
1,36,12,4,3,3,3,4663,9,12,3,...,1,0,0,0,0,0,1,0,0,1
2,55,2,1,3,3,4,5160,4,16,3,...,0,0,0,1,0,0,0,1,1,0
3,39,24,1,1,3,4,4108,7,13,3,...,0,0,1,0,0,0,0,1,1,0
4,37,3,3,3,3,3,9434,1,15,3,...,1,0,0,0,0,0,1,0,1,0


In [6]:
# Flatten the label frame read from CSV into a 1-D NumPy array; this array is
# what every later `fit` / metric call receives as `y_train`.
y_train = np.ravel(y_train)

In [10]:
# Sanity check: the labels should now be a NumPy array.
type(y_train)

numpy.ndarray

In [13]:
## Decision Tree

In [14]:
# Candidate tree depths to compare.
parameters = {'max_depth': [2, 4, 5, 7, 9, 10]}

# 3-fold grid search over the depths; train scores are recorded as well.
grid_search = GridSearchCV(
    estimator=DecisionTreeClassifier(),
    param_grid=parameters,
    cv=3,
    return_train_score=True,
)
grid_search.fit(x_train, y_train)

# Displayed as the cell output: the best-ranked parameter setting.
grid_search.best_params_

{'max_depth': 10}

In [15]:
# Report every candidate that was actually searched. Iterating over the result
# arrays (instead of a hard-coded range(6)) keeps this cell correct if the
# parameter grid above gains or loses entries.
cv_results = grid_search.cv_results_
for params, mean_score, rank in zip(cv_results['params'],
                                    cv_results['mean_test_score'],
                                    cv_results['rank_test_score']):
    print('Parameters: ', params)

    print('Mean Test Score: ', mean_score)

    print('Rank: ', rank)

Parameters:  {'max_depth': 2}
Mean Test Score:  0.6652334152334153
Rank:  6
Parameters:  {'max_depth': 4}
Mean Test Score:  0.7512285012285013
Rank:  5
Parameters:  {'max_depth': 5}
Mean Test Score:  0.7843980343980343
Rank:  4
Parameters:  {'max_depth': 7}
Mean Test Score:  0.8611793611793612
Rank:  3
Parameters:  {'max_depth': 9}
Mean Test Score:  0.89004914004914
Rank:  2
Parameters:  {'max_depth': 10}
Mean Test Score:  0.8961916461916462
Rank:  1


In [16]:
# Decision tree limited to depth 5, fitted on the full training set.
# NOTE(review): the grid search above ranked max_depth=10 first, yet 5 is used
# here -- confirm this is an intentional choice.
decision_tree_model = DecisionTreeClassifier(max_depth=5).fit(x_train, y_train)

In [17]:
# Class predictions for the test set from the decision tree.
# NOTE(review): `y_pred` is reassigned by later model cells and is never read
# in the visible part of the notebook.
y_pred = decision_tree_model.predict(x_test)

In [20]:
# Training-set accuracy of the decision tree. The predictions are computed in
# this cell so that it runs on a fresh kernel: in notebook order `predict_train`
# would otherwise not exist yet when this cell executes (NameError).
predict_train = decision_tree_model.predict(x_train)
acc = accuracy_score(y_train, predict_train)
print("accuracy_score : ", acc)

accuracy_score :  0.8476658476658476


In [19]:
# Decision-tree predictions on the training set itself; `predict_train` is the
# variable read by the accuracy / precision / recall cells.
predict_train = decision_tree_model.predict(x_train)

In [50]:
# Precision and recall of the current `predict_train` against the training labels.
prec, recall = (
    precision_score(y_train, predict_train),
    recall_score(y_train, predict_train),
)

print("precision_score : ", prec)
print("recall_score : ", recall)

precision_score :  0.8185053380782918
recall_score :  0.8789808917197452


In [21]:
# Random forest: 400 trees, each capped at 16 leaf nodes, using all cores.
rnd_clf = RandomForestClassifier(
    n_estimators=400,
    max_leaf_nodes=16,
    n_jobs=-1,
)

# Last expression of the cell, so the fitted estimator's repr is displayed.
rnd_clf.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=16,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=400,
                       n_jobs=-1, oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [22]:
# Class predictions for the test set from the random forest (overwrites the
# previous `y_pred`).
y_pred = rnd_clf.predict(x_test)

In [23]:
# Random-forest predictions on the training set; replaces the previous
# contents of `predict_train`.
predict_train = rnd_clf.predict(x_train)

In [24]:
# Accuracy, precision and recall of the current `predict_train` against the
# training labels (these are training-set scores, not held-out scores).
acc = accuracy_score(y_train, predict_train)
prec = precision_score(y_train, predict_train)
recall = recall_score(y_train, predict_train)

print("accuracy_score : ", acc)
print("precision_score : ", prec)
print("recall_score : ", recall)

accuracy_score :  0.8421375921375921
precision_score :  0.8464566929133859
recall_score :  0.821656050955414


In [26]:
# K-fold cross-validation

In [28]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [43]:
def fit_ml_algo(algo, x_train, y_train, cv):
    """Fit ``algo`` on the training data and score it two ways.

    Parameters
    ----------
    algo
        Estimator object; it is fitted via ``algo.fit`` and also handed to
        ``model_selection.cross_val_predict``.
    x_train, y_train
        Training features and labels, used for both the single fit and the
        cross-validated predictions.
    cv
        Forwarded unchanged as the ``cv`` argument of ``cross_val_predict``.

    Returns
    -------
    tuple
        ``(cv_predictions, train_accuracy, cv_accuracy)``. Both accuracies are
        percentages rounded to two decimals. ``train_accuracy`` is measured on
        the same data the model was fitted on; ``cv_accuracy`` is computed from
        the cross-validated predictions.
    """
    # Single pass: fit on everything, then score against that same data.
    fitted = algo.fit(x_train, y_train)
    train_accuracy = round(fitted.score(x_train, y_train) * 100, 2)

    # Cross-validated predictions for the training rows, run on all cores.
    cv_predictions = model_selection.cross_val_predict(
        algo, x_train, y_train, cv=cv, n_jobs=-1
    )
    cv_accuracy = round(metrics.accuracy_score(y_train, cv_predictions) * 100, 2)

    return cv_predictions, train_accuracy, cv_accuracy

In [44]:

# Logistic Regression: training accuracy and 10-fold cross-validated accuracy.
train_pred_log, acc_log, acc_cv_log = fit_ml_algo(
    LogisticRegression(), x_train, y_train, cv=10
)

print("Accuracy: %s" % acc_log)
print("Accuracy CV 10-Fold: %s" % acc_cv_log)




Accuracy: 79.3
Accuracy CV 10-Fold: 78.44


In [47]:
# Fit a default logistic regression on the full training set.
# NOTE(review): `model` is overwritten by later cells and is not read anywhere
# in the visible notebook.
model = LogisticRegression().fit(x_train, y_train)



In [59]:
# k-Nearest Neighbours: training accuracy and 10-fold cross-validated accuracy.
train_pred_knn, acc_knn, acc_cv_knn = fit_ml_algo(
    KNeighborsClassifier(), x_train, y_train, cv=10
)

print("Accuracy: %s" % acc_knn)
print("Accuracy CV 10-Fold: %s" % acc_cv_knn)


Accuracy: 86.61
Accuracy CV 10-Fold: 80.28


In [64]:
# Fit a default k-nearest-neighbours classifier on the full training set
# (replaces the previous `model`).
model = KNeighborsClassifier().fit(x_train, y_train)

In [74]:

# Gaussian Naive Bayes: training accuracy and 10-fold cross-validated accuracy.
train_pred_gaussian, acc_gaussian, acc_cv_gaussian = fit_ml_algo(
    GaussianNB(), x_train, y_train, cv=10
)

print("Accuracy: %s" % acc_gaussian)
print("Accuracy CV 10-Fold: %s" % acc_cv_gaussian)


Accuracy: 69.1
Accuracy CV 10-Fold: 68.8


In [75]:
# Fit a Gaussian Naive Bayes classifier on the full training set (replaces the
# previous `model`).
model = GaussianNB().fit(x_train, y_train)

In [83]:

# Stochastic Gradient Descent: training accuracy and 10-fold cross-validated accuracy.
train_pred_sgd, acc_sgd, acc_cv_sgd = fit_ml_algo(
    SGDClassifier(), x_train, y_train, cv=10
)

print("Accuracy: %s" % acc_sgd)
print("Accuracy CV 10-Fold: %s" % acc_cv_sgd)


Accuracy: 56.08
Accuracy CV 10-Fold: 54.73


In [85]:
# Decision Tree Classifier (default settings): training accuracy and 10-fold
# cross-validated accuracy.
train_pred_dt, acc_dt, acc_cv_dt = fit_ml_algo(
    DecisionTreeClassifier(), x_train, y_train, cv=10
)

print("Accuracy: %s" % acc_dt)
print("Accuracy CV 10-Fold: %s" % acc_cv_dt)


Accuracy: 100.0
Accuracy CV 10-Fold: 92.81


In [86]:
# Gradient Boosting Trees: training accuracy and 10-fold cross-validated accuracy.
train_pred_gbt, acc_gbt, acc_cv_gbt = fit_ml_algo(
    GradientBoostingClassifier(), x_train, y_train, cv=10
)

print("Accuracy: %s" % acc_gbt)
print("Accuracy CV 10-Fold: %s" % acc_cv_gbt)


Accuracy: 96.13
Accuracy CV 10-Fold: 91.65


In [87]:
# Fit a default gradient-boosting classifier on the full training set.
# NOTE(review): `model` is not read again in the visible notebook.
model = GradientBoostingClassifier().fit(x_train, y_train)

In [95]:
## Bagging ensemble (bootstrap-aggregated decision trees)

In [97]:
# Bagging ensemble: 400 randomly-split decision trees (at most 12 leaf nodes
# each), every tree drawn as a full-size bootstrap sample, trained on all cores.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(splitter='random', max_leaf_nodes=12),
    n_estimators=400,
    max_samples=1.0,
    bootstrap=True,
    n_jobs=-1,
)

# Last expression of the cell, so the fitted estimator's repr is displayed.
bag_clf.fit(x_train, y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight=None,
                                                        criterion='gini',
                                                        max_depth=None,
                                                        max_features=None,
                                                        max_leaf_nodes=12,
                                                        min_impurity_decrease=0.0,
                                                        min_impurity_split=None,
                                                        min_samples_leaf=1,
                                                        min_samples_split=2,
                                                        min_weight_fraction_leaf=0.0,
                                                        presort=False,
                                                        random_state=None,
                                                        splitter='random'),
    

In [98]:
# Class predictions for the test set from the bagging ensemble (overwrites the
# previous `y_pred`).
y_pred = bag_clf.predict(x_test)

## Ada boosting

In [11]:
# AdaBoost over depth-1 decision trees: 100 boosting rounds, learning rate 0.5.
ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1),
    n_estimators=100,
    algorithm='SAMME.R',
    learning_rate=0.5,
)

# Last expression of the cell, so the fitted estimator's repr is displayed.
ada_clf.fit(x_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R',
                   base_estimator=DecisionTreeClassifier(class_weight=None,
                                                         criterion='gini',
                                                         max_depth=1,
                                                         max_features=None,
                                                         max_leaf_nodes=None,
                                                         min_impurity_decrease=0.0,
                                                         min_impurity_split=None,
                                                         min_samples_leaf=1,
                                                         min_samples_split=2,
                                                         min_weight_fraction_leaf=0.0,
                                                         presort=False,
                                                         random_state=None,
                             

In [13]:
# Class predictions for the test set from AdaBoost (overwrites the previous
# `y_pred`).
y_pred = ada_clf.predict(x_test)

In [15]:
# AdaBoost predictions on the training set; replaces the previous contents of
# `predict_train`.
predict_train = ada_clf.predict(x_train)

In [16]:
# Accuracy, precision and recall of the current `predict_train` against the
# training labels (training-set scores).
acc = accuracy_score(y_train, predict_train)
prec = precision_score(y_train, predict_train)
recall = recall_score(y_train, predict_train)

print("accuracy_score : ", acc)
print("precision_score : ", prec)
print("recall_score : ", recall)

accuracy_score :  0.8507371007371007
precision_score :  0.8395989974937343
recall_score :  0.8535031847133758


In [8]:
# Multi-layer perceptron with three hidden layers of 8 units each (ReLU, Adam),
# allowed up to 500 training iterations.
mlp = MLPClassifier(
    hidden_layer_sizes=(8, 8, 8),
    activation='relu',
    solver='adam',
    max_iter=500,
)
mlp.fit(x_train, y_train)

# Class predictions on the training and test sets.
predict_train = mlp.predict(x_train)
predict_test = mlp.predict(x_test)

In [9]:
# Accuracy, precision and recall of the current `predict_train` against the
# training labels (training-set scores).
acc = accuracy_score(y_train, predict_train)
prec = precision_score(y_train, predict_train)
recall = recall_score(y_train, predict_train)

print("accuracy_score : ", acc)
print("precision_score : ", prec)
print("recall_score : ", recall)

accuracy_score :  0.7002457002457002
precision_score :  0.8429561200923787
recall_score :  0.46496815286624205


In [99]:
# Bagging-ensemble predictions on the training set; replaces the previous
# contents of `predict_train`.
predict_train = bag_clf.predict(x_train)

In [100]:
# Training-set accuracy of whatever model last filled `predict_train`.
acc = accuracy_score(y_train,predict_train)
print("accuracy_score : ", acc)

accuracy_score :  0.797911547911548


In [10]:
# Column 1 of the MLP's predicted class probabilities for the test set
# (presumably the positive / attrition class -- confirm against mlp.classes_).
probs = mlp.predict_proba(x_test)[:,1]

In [136]:
# Load probabilities from 'xgboo1.csv'.
# NOTE(review): how this file was produced is not shown in the visible
# notebook, and `proba1` is reassigned from `probs` in a later cell.
proba1 = pd.read_csv('xgboo1.csv')

In [137]:
# Preview up to the first 20 rows of the loaded probabilities.
proba1.head(20)

Unnamed: 0.1,Unnamed: 0,0
0,0,0.002018
1,1,0.093577
2,2,0.135867
3,3,0.059967
4,4,0.007721
5,5,0.028273
6,6,0.152351
7,7,0.057191
8,8,0.028168
9,9,0.09312


In [111]:
# Load the identifier frame from 'ID.csv'; it is concatenated with the
# probabilities to build the submission.
id1 = pd.read_csv('ID.csv')

In [52]:
# Column 1 of the decision tree's predicted class probabilities for the test
# set (overwrites the previous `probs`).
probs = decision_tree_model.predict_proba(x_test)[:,1]

In [62]:
# Column 1 of the random forest's predicted class probabilities for the test
# set (overwrites the previous `probs`; in notebook order this is the value
# that reaches the DataFrame conversion below).
probs = rnd_clf.predict_proba(x_test)[:,1]

In [124]:
# Wrap the current `probs` array in a single-column DataFrame; pandas labels
# that column with the integer 0. This replaces any `proba1` loaded from CSV.
proba1 = pd.DataFrame(probs)

In [138]:
# Place the identifier frame and the probability frame side by side
# (column-wise concatenation, aligned on the row index).
submit1 = pd.concat([id1, proba1], axis=1)

In [139]:
# Name the probability column "Attrition". The column is labelled '0' (string)
# when `proba1` was read back from CSV, but integer 0 when it was built with
# pd.DataFrame(probs); mapping only '0' silently renames nothing in the second
# case, so both labels are handled.
submit1.rename(columns={'0': 'Attrition', 0: 'Attrition'}, inplace=True)

In [143]:
# Write the submission file. Leftover index columns (labels containing
# "Unnamed") are excluded at write time, so the output is correct even when
# this cell runs before the clean-up cell that drops them, as it does when the
# notebook is executed top to bottom. `submit1` itself is not modified here.
unnamed_mask = submit1.columns.astype(str).str.contains('Unnamed', case=False)
submit1.loc[:, ~unnamed_mask].to_csv('submit11.csv', index=False)

In [142]:
# Preview the first rows of the submission frame.
submit1.head()

Unnamed: 0,Id,Attrition
0,1,0.002018
1,2,0.093577
2,3,0.135867
3,4,0.059967
4,5,0.007721


In [141]:
# Remove leftover index columns (labels containing "Unnamed", any case) from
# `submit1` in place. Labels are cast to str before matching so that a
# non-string label (e.g. integer 0 from pd.DataFrame(probs)) cannot introduce
# NaN into the boolean mask.
unnamed_columns = submit1.columns[
    submit1.columns.astype(str).str.contains('Unnamed', case=False)
]
submit1.drop(unnamed_columns, axis=1, inplace=True)