# Promotion Prediction
### Data Science Nigeria Kaggle Competition

In [None]:
import numpy as np
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, confusion_matrix, accuracy_score, f1_score, precision_score, recall_score

In [None]:
# Load the training and test sets, using the EmployeeNo column as the row index.
data = pd.read_csv('train.csv').set_index('EmployeeNo')
test_data = pd.read_csv('test.csv').set_index('EmployeeNo')

In [None]:
# Preview the first rows of the training data.
data.head()

In [None]:
# Compare the (rows, columns) shapes of the training and test frames.
print(data.shape, test_data.shape)

In [None]:
# Summary statistics of the training data.
data.describe()

In [None]:
# Two column groups for the pie-chart grids below: the first nine columns and
# the remaining ones, each without the columns named in its drop() call.
data1 = data.iloc[:, :9].drop(['Year_of_birth', 'Year_of_recruitment'], axis=1)
data2 = data.iloc[:, 9:].drop(['Training_score_average', 'State_Of_Origin'], axis=1)

In [None]:
# Pie chart of the relative value frequencies of every column in data1.
# The grid is three plots wide. The row count is derived from the number of
# columns (never fewer than three rows, which keeps the previous layout) so
# the cell does not fail when data1 holds more than nine columns.
n_grid_cols = 3
n_grid_rows = max(3, -(-data1.shape[1] // n_grid_cols))  # ceiling division
fig = plt.figure(figsize=(15, 12))
plt.suptitle('Pie Chart Distributions', fontsize=20)
for position, column in enumerate(data1.columns, start=1):
    ax = plt.subplot(n_grid_rows, n_grid_cols, position)
    ax.get_yaxis().set_visible(False)
    ax.set_title(column)

    # Compute the normalised frequencies once and reuse them for both the
    # wedge sizes and the wedge labels.
    shares = data1[column].value_counts(normalize=True)
    ax.pie(shares.values, labels=shares.index, autopct='%1.1f%%')
    ax.axis('equal')
# Leave a margin at the top of the figure for the suptitle.
fig.tight_layout(rect=[0, 0.03, 1, 0.95])

In [None]:
# Pie chart of the relative value frequencies of every column in data2.
# The grid is three plots wide. The row count is derived from the number of
# columns (never fewer than three rows, which keeps the previous layout) so
# the cell does not fail when data2 holds more than nine columns.
n_grid_cols = 3
n_grid_rows = max(3, -(-data2.shape[1] // n_grid_cols))  # ceiling division
fig = plt.figure(figsize=(15, 12))
plt.suptitle('Pie Chart Distributions', fontsize=20)
for position, column in enumerate(data2.columns, start=1):
    ax = plt.subplot(n_grid_rows, n_grid_cols, position)
    ax.get_yaxis().set_visible(False)
    ax.set_title(column)

    # Compute the normalised frequencies once and reuse them for both the
    # wedge sizes and the wedge labels.
    shares = data2[column].value_counts(normalize=True)
    ax.pie(shares.values, labels=shares.index, autopct='%1.1f%%')
    ax.axis('equal')
# Leave a margin at the top of the figure for the suptitle.
fig.tight_layout(rect=[0, 0.03, 1, 0.95])

In [None]:
# Counts of each Promoted_or_Not value within each Previous_Award group.
data.groupby('Previous_Award').Promoted_or_Not.value_counts()

In [None]:
# Counts of each Promoted_or_Not value within each Targets_met group.
data.groupby('Targets_met').Promoted_or_Not.value_counts()

In [None]:
# Counts of each Promoted_or_Not value within each Past_Disciplinary_Action group.
data.groupby('Past_Disciplinary_Action').Promoted_or_Not.value_counts()

In [None]:
# Counts of each Promoted_or_Not value within each Foreign_schooled group.
data.groupby('Foreign_schooled').Promoted_or_Not.value_counts()

In [None]:
# Counts of each Promoted_or_Not value within each
# Previous_IntraDepartmental_Movement group.
data.groupby('Previous_IntraDepartmental_Movement').Promoted_or_Not.value_counts()

In [None]:
# Number of columns in the training frame (the target column is still included here).
data.shape[1]

In [None]:
# Set the label column aside, remove it from the training features, then
# stack training and test features (training rows first) into one frame so
# the cells below transform both together.
target = data['Promoted_or_Not']
data = data.drop('Promoted_or_Not', axis=1)
merged = pd.concat([data, test_data], axis=0)

In [None]:
# Encode the Yes/No flag as an integer. Note the polarity: 'No' -> 1, 'Yes' -> 0.
# Values outside the mapping become NaN.
merged['Past_Disciplinary_Action'] = merged['Past_Disciplinary_Action'].map(
    {'Yes': 0, 'No': 1})

In [None]:
# Encode the Yes/No flag as an integer. Note the polarity: 'No' -> 1, 'Yes' -> 0.
# Values outside the mapping become NaN.
merged['Previous_IntraDepartmental_Movement'] = (
    merged['Previous_IntraDepartmental_Movement'].map({'Yes': 0, 'No': 1}))

In [None]:
# Encode the Yes/No flag as an integer: 'Yes' -> 1, 'No' -> 0.
# Values outside the mapping become NaN.
merged['Foreign_schooled'] = merged['Foreign_schooled'].map({'Yes': 1, 'No': 0})

In [None]:
# Recode the number of previous employers as an integer that decreases with
# the count: '0' -> 6, '1' -> 5, ..., '5' -> 1, 'More than 5' -> 0.
# The mapping keys are strings, so this assumes the column holds string values;
# anything outside the mapping becomes NaN.
merged['No_of_previous_employers'] = merged['No_of_previous_employers'].map(
    {**{str(count): 6 - count for count in range(6)}, 'More than 5': 0})

In [None]:
# Replace missing qualifications with 'First Degree or HND'.
# The result is assigned back instead of calling fillna(inplace=True) on the
# column selection: the in-place form mutates an intermediate object, which is
# deprecated in recent pandas and has no effect on `merged` under Copy-on-Write.
merged['Qualification'] = merged['Qualification'].fillna('First Degree or HND')

In [None]:
# Derive years in service and age relative to the reference year, then drop
# the raw year columns they were computed from.
cur_Yr = 2019
# Vectorised subtraction instead of a per-element apply(lambda ...).
merged['Years_in_Service'] = cur_Yr - merged['Year_of_recruitment']
merged['Age'] = cur_Yr - merged['Year_of_birth']
merged = merged.drop(columns=['Year_of_recruitment', 'Year_of_birth'])

In [None]:
# Print the column dtypes and non-null counts of the merged frame.
merged.info()

In [None]:
## One-hot encoding
# Keep the numeric / already-encoded columns unchanged and append one block of
# dummy indicators per categorical column (first level of each dropped).
merged = pd.concat(
    [merged[['Trainings_Attended', 'Last_performance_score', 'Targets_met',
             'Previous_Award', 'Training_score_average',
             'Previous_IntraDepartmental_Movement',
             'No_of_previous_employers', 'Years_in_Service', 'Age',
             'Past_Disciplinary_Action', 'Foreign_schooled']]]
    + [pd.get_dummies(merged[name], drop_first=True)
       for name in ('Division', 'Qualification', 'Gender',
                    'Channel_of_Recruitment', 'State_Of_Origin',
                    'Marital_Status')],
    axis=1)

In [None]:
# List the column names of the merged frame.
merged.columns

In [None]:
# Bar chart of each feature's correlation with the target, computed on the
# labelled training rows only. Those rows are the first len(target) rows of
# `merged` (training data was concatenated ahead of the test data), so the row
# count is derived from `target` instead of being hard-coded.
merged.iloc[:len(target)].corrwith(target).plot.bar(
    figsize=(20, 10),
    title='Correlation with Response variable',
    fontsize=15, rot=45, grid=True)

In [None]:
# ### Scaling
# from sklearn.preprocessing import StandardScaler
# sc_X = StandardScaler()

# X_train2 = pd.DataFrame(sc_X.fit_transform(merged))
# X_train2.columns = merged.columns.values
# X_train2.index = merged.index.values
# merged = X_train2

## Model Building

In [None]:
# Split `merged` back into the labelled training rows (X) and the unlabelled
# competition rows (X_valid). The training rows come first in `merged`, one per
# entry of `target`, so the boundary is len(target) rather than a hard-coded
# row count.
n_labelled = len(target)
X = merged.iloc[:n_labelled]
X_valid = merged.iloc[n_labelled:]
y = target

# Hold out 20% of the labelled rows for evaluation; fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=27)

In [None]:
# Preview the first rows of X_valid, the rows of `merged` after the training rows.
X_valid.head()

In [None]:
### Hyperparameter tuning
### Param test 1: coarse grid over tree complexity

# Only min_child_weight and max_depth are searched here; every other
# hyperparameter stays at the value set on the estimator below.
param_test1 = {
    'min_child_weight': range(1, 6, 2),
    'max_depth': range(3, 10, 2),
}
# The `iid` argument is no longer passed: it was deprecated in scikit-learn
# 0.22 and removed in 0.24, where supplying it raises TypeError. Without it
# the CV score is the plain mean over folds, which is what iid=False selected.
gsearch1 = GridSearchCV(
    estimator=XGBClassifier(learning_rate=0.1, n_estimators=150, max_depth=5,
                            min_child_weight=1, gamma=0, subsample=0.8,
                            colsample_bytree=0.8, objective='binary:logistic',
                            nthread=4, scale_pos_weight=3, seed=27),
    param_grid=param_test1, scoring='f1', n_jobs=4, cv=5)
gsearch1.fit(X_train, y_train)
print("gsearch.best_params_",gsearch1.best_params_)
print("gsearch.best_score_",gsearch1.best_score_)
print("gsearch.best_estimator_",gsearch1.best_estimator_)

In [None]:
### Param test 2: narrow grid over tree complexity

# min_child_weight is pinned to 1 and only max_depth 5 vs 6 is compared; every
# other hyperparameter stays at the value set on the estimator below.
param_test2 = {
    'min_child_weight': [1],
    'max_depth': [5, 6],
}
# The `iid` argument is no longer passed: it was deprecated in scikit-learn
# 0.22 and removed in 0.24, where supplying it raises TypeError. Without it
# the CV score is the plain mean over folds, which is what iid=False selected.
gsearch2 = GridSearchCV(
    estimator=XGBClassifier(learning_rate=0.1, n_estimators=150, max_depth=5,
                            min_child_weight=1, gamma=0, subsample=0.8,
                            colsample_bytree=0.8, objective='binary:logistic',
                            nthread=4, scale_pos_weight=3, seed=27),
    param_grid=param_test2, scoring='f1', n_jobs=4, cv=5)
gsearch2.fit(X_train, y_train)
print("gsearch.best_params_",gsearch2.best_params_)
print("gsearch.best_score_",gsearch2.best_score_)
print("gsearch.best_estimator_",gsearch2.best_estimator_)

In [None]:
### Param test 2 (second pass): number of trees and positive-class weight
# NOTE: this cell rebinds the names param_test2 / gsearch2.

# Only n_estimators and scale_pos_weight are searched; max_depth=4,
# min_child_weight=1 and the remaining hyperparameters stay at the values set
# on the estimator below.
param_test2 = {
    'n_estimators': [150, 200, 250, 500],
    'scale_pos_weight': [1, 2, 3, 4],
}
# The `iid` argument is no longer passed: it was deprecated in scikit-learn
# 0.22 and removed in 0.24, where supplying it raises TypeError. Without it
# the CV score is the plain mean over folds, which is what iid=False selected.
gsearch2 = GridSearchCV(
    estimator=XGBClassifier(learning_rate=0.1, n_estimators=150, max_depth=4,
                            min_child_weight=1, gamma=0, subsample=0.8,
                            colsample_bytree=0.8, objective='binary:logistic',
                            nthread=4, scale_pos_weight=3, seed=27),
    param_grid=param_test2, scoring='f1', n_jobs=4, cv=5)
gsearch2.fit(X_train, y_train)
print("gsearch.best_params_",gsearch2.best_params_)
print("gsearch.best_score_",gsearch2.best_score_)
print("gsearch.best_estimator_",gsearch2.best_estimator_)

In [None]:
### Param test 3: sampling ratios and gamma

# Only colsample_bytree, subsample and gamma are searched; the remaining
# hyperparameters stay at the values set on the estimator below.
param_test3 = {
    'colsample_bytree': [0.7, 0.8],
    'subsample': [0.7, 0.8],
    'gamma': [0, 0.2, 0.4],
}
# The `iid` argument is no longer passed: it was deprecated in scikit-learn
# 0.22 and removed in 0.24, where supplying it raises TypeError. Without it
# the CV score is the plain mean over folds, which is what iid=False selected.
gsearch3 = GridSearchCV(
    estimator=XGBClassifier(learning_rate=0.1, n_estimators=150, max_depth=4,
                            min_child_weight=1, gamma=0, subsample=0.8,
                            colsample_bytree=0.8, objective='binary:logistic',
                            nthread=4, scale_pos_weight=3, seed=27),
    param_grid=param_test3, scoring='f1', n_jobs=4, cv=5)
gsearch3.fit(X_train, y_train)
print("gsearch.best_params_",gsearch3.best_params_)
print("gsearch.best_score_",gsearch3.best_score_)
print("gsearch.best_estimator_",gsearch3.best_estimator_)

In [None]:
### Param test 4: sampling ratios with gamma fixed at 0.4

# Only colsample_bytree and subsample are searched; gamma is pinned to 0.4 on
# the estimator and the remaining hyperparameters keep the values set there.
param_test4 = {
    'colsample_bytree': [0.7, 0.8],
    'subsample': [0.7, 0.8],
}
# The `iid` argument is no longer passed: it was deprecated in scikit-learn
# 0.22 and removed in 0.24, where supplying it raises TypeError. Without it
# the CV score is the plain mean over folds, which is what iid=False selected.
gsearch4 = GridSearchCV(
    estimator=XGBClassifier(learning_rate=0.1, n_estimators=150, max_depth=4,
                            min_child_weight=1, gamma=0.4, subsample=0.8,
                            colsample_bytree=0.8, objective='binary:logistic',
                            nthread=4, scale_pos_weight=3, seed=27),
    param_grid=param_test4, scoring='f1', n_jobs=4, cv=5)
gsearch4.fit(X_train, y_train)
print("gsearch.best_params_",gsearch4.best_params_)
print("gsearch.best_score_",gsearch4.best_score_)
print("gsearch.best_estimator_",gsearch4.best_estimator_)

In [None]:
### Param test 5: learning rate

# Only learning_rate is searched; the remaining hyperparameters stay at the
# values set on the estimator below.
param_test5 = {
    'learning_rate': [0.1, 0.01, 0.001],
}
# The `iid` argument is no longer passed: it was deprecated in scikit-learn
# 0.22 and removed in 0.24, where supplying it raises TypeError. Without it
# the CV score is the plain mean over folds, which is what iid=False selected.
gsearch5 = GridSearchCV(
    estimator=XGBClassifier(learning_rate=0.1, n_estimators=150, max_depth=4,
                            min_child_weight=1, gamma=0.4, subsample=0.8,
                            colsample_bytree=0.8, objective='binary:logistic',
                            nthread=4, scale_pos_weight=3, seed=27),
    param_grid=param_test5, scoring='f1', n_jobs=4, cv=5)
gsearch5.fit(X_train, y_train)
print("gsearch.best_params_",gsearch5.best_params_)
print("gsearch.best_score_",gsearch5.best_score_)
print("gsearch.best_estimator_",gsearch5.best_estimator_)

In [None]:
### Param test 6: L1 / L2 regularisation strength

# Only reg_alpha and reg_lambda are searched; the remaining hyperparameters
# stay at the values set on the estimator below.
param_test6 = {
    'reg_alpha': [0, 0.1, 1],
    'reg_lambda': [0, 0.1, 1],
}
# The `iid` argument is no longer passed: it was deprecated in scikit-learn
# 0.22 and removed in 0.24, where supplying it raises TypeError. Without it
# the CV score is the plain mean over folds, which is what iid=False selected.
gsearch6 = GridSearchCV(
    estimator=XGBClassifier(learning_rate=0.1, n_estimators=150, max_depth=4,
                            min_child_weight=1, gamma=0.4, subsample=0.8,
                            colsample_bytree=0.8, objective='binary:logistic',
                            nthread=4, scale_pos_weight=3, seed=27),
    param_grid=param_test6, scoring='f1', n_jobs=4, cv=5)
gsearch6.fit(X_train, y_train)
print("gsearch.best_params_",gsearch6.best_params_)
print("gsearch.best_score_",gsearch6.best_score_)
print("gsearch.best_estimator_",gsearch6.best_estimator_)

In [None]:
# Fit the chosen XGBoost configuration on the training split only, then report
# training accuracy followed by metrics on the held-out split.
model = XGBClassifier(
    objective='binary:logistic',
    learning_rate=0.1,
    n_estimators=250,
    max_depth=4,
    min_child_weight=1,
    gamma=0.4,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=3,
    nthread=4,
    seed=27,
)
model.fit(X_train, y_train)

print(model.score(X_train, y_train))

# Hard-label metrics on the held-out rows.
y_pred = model.predict(X_test)
for label, metric in (
    ("accuracy_score:", accuracy_score),
    ("precision_score:", precision_score),  # tp / (tp + fp)
    ("recall_score:", recall_score),        # tp / (tp + fn)
    ("f1_score:", f1_score),
):
    print(label, metric(y_test, y_pred))

cm = confusion_matrix(y_test, y_pred)
print("confusion_matrix :\n", pd.DataFrame(cm))

# ROC AUC is computed from the predicted probability of the positive class.
holdout_proba = model.predict_proba(X_test)[:, 1]
train_proba = model.predict_proba(X_train)[:, 1]
print("roc_auc test set:", roc_auc_score(y_test, holdout_proba))
print("roc_auc training set:", roc_auc_score(y_train, train_proba))

In [None]:
# Final model: refit the same configuration on ALL labelled rows (X, y). This
# is the model used to score X_valid for the submission.
model = XGBClassifier(learning_rate=0.1,
                      n_estimators=250, max_depth=4, min_child_weight=1, gamma=0.4,
                      subsample=0.8, colsample_bytree=0.8, objective='binary:logistic',
                      nthread=4, scale_pos_weight=3, seed=27)
model.fit(X, y)

# WARNING: X_train and X_test are both subsets of X, so the model has already
# seen every row scored below. These figures are in-sample sanity checks and
# will look better than the genuine hold-out metrics obtained from a model
# fit on X_train only. Say so in the output so the numbers are not misread.
print("NOTE: model was fit on all of X; the scores below are in-sample, "
      "not a hold-out estimate")

print(model.score(X_train, y_train))

y_pred = model.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print("accuracy_score:", accuracy_score(y_test, y_pred))
print("precision_score:", precision_score(y_test, y_pred)) # tp / (tp + fp)
print("recall_score:", recall_score(y_test, y_pred)) # tp / (tp + fn)
print("f1_score:", f1_score(y_test, y_pred))
print("confusion_matrix :\n", pd.DataFrame(cm))
# "test set" here means the X_test rows, which were part of the fit above.
print("roc_auc test set:", roc_auc_score(y_test, model.predict_proba(X_test)[:,1]))
print("roc_auc training set:", roc_auc_score(y_train, model.predict_proba(X_train)[:,1]))

In [None]:
# Predict a label for every row of X_valid and write the submission file.
y_valid = model.predict(X_valid)

# Attach the predictions to a copy so X_valid itself keeps only feature columns.
X_val = X_valid.copy()
X_val['Promoted_or_Not'] = y_valid
print(X_val['Promoted_or_Not'].value_counts())

# Keep just the prediction column, turn the row index into a regular column,
# and export to CSV without the fresh integer index.
submission = X_val[['Promoted_or_Not']].reset_index()
submission.to_csv("Prediction8.csv", index=False)