In [1]:
import pandas as pd 
import numpy as np
pd.set_option('display.max_columns', 500)

from sklearn.metrics import f1_score, roc_auc_score
from sklearn.tree import DecisionTreeClassifier

In [2]:
x_train = pd.read_csv("data/train_im1.csv")
x_valid = pd.read_csv("data/test_im1.csv")
print(x_train.shape, x_valid.shape)

(54808, 29) (23490, 28)


In [3]:
x_train.columns

Index(['employee_id', 'department', 'region', 'education', 'gender',
       'recruitment_channel', 'no_of_trainings', 'age', 'previous_year_rating',
       'length_of_service', 'KPIs_met >80%', 'awards_won?',
       'avg_training_score', 'is_promoted', 'new_employee', 'employee_rating',
       'previous_year_rating_avg_training_score_perc',
       'department_avg_training_score_perc', 'department_region',
       'department_region_avg_training_score_perc', 'department_education',
       'department_education_avg_training_score_perc',
       'awards_won_avg_training_score_perc', 'kpi_avg_training_score_perc',
       'gender_avg_training_score_perc',
       'employee_rating_avg_training_score_perc', 'KPIs_met >80%_awards_won?',
       'kpi_award_avg_training_score_perc', 'region_avg_training_score_perc'],
      dtype='object')

In [4]:
x_train.head()

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted,new_employee,employee_rating,previous_year_rating_avg_training_score_perc,department_avg_training_score_perc,department_region,department_region_avg_training_score_perc,department_education,department_education_avg_training_score_perc,awards_won_avg_training_score_perc,kpi_avg_training_score_perc,gender_avg_training_score_perc,employee_rating_avg_training_score_perc,KPIs_met >80%_awards_won?,kpi_award_avg_training_score_perc,region_avg_training_score_perc
0,65438,Sales & Marketing,region_7,Master's & above,f,sourcing,1,35,5.0,8,1,0,49,0,0,5_0,13.124947,41.288599,Sales & Marketing_region_7,41.707148,Sales & Marketing_Master's & above,40.914546,14.618589,11.077705,10.541319,13.124947,1_0,11.093056,14.577741
1,65141,Operations,region_22,Bachelor's,m,other,1,30,5.0,4,0,0,60,0,0,5_0,48.977941,54.749736,Operations_region_22,53.428202,Operations_Bachelor's,54.825858,49.486346,51.613312,51.610557,48.977941,0_0,51.865948,43.746111
2,7513,Sales & Marketing,region_19,Bachelor's,m,sourcing,1,34,3.0,7,0,0,50,0,0,3_0,17.434741,55.195962,Sales & Marketing_region_19,57.063197,Sales & Marketing_Bachelor's,55.405892,19.570212,21.87544,21.660692,17.434741,0_0,22.009351,17.906178
3,2542,Sales & Marketing,region_23,Bachelor's,m,other,2,39,1.0,10,0,0,50,0,0,1_0,29.792704,55.195962,Sales & Marketing_region_23,52.933333,Sales & Marketing_Bachelor's,55.405892,19.570212,21.87544,21.660692,29.792704,0_0,22.009351,19.489362
4,48945,Technology,region_26,Bachelor's,m,other,1,45,3.0,2,0,0,73,0,0,3_0,71.170373,0.798543,Technology_region_26,0.779221,Technology_Bachelor's,0.807841,73.13497,73.868852,71.526912,71.170373,0_0,74.126183,66.017699


In [5]:
fe_cols = ["no_of_trainings", "age", "previous_year_rating", "length_of_service",
          'KPIs_met >80%', 'awards_won?', "avg_training_score", "new_employee",
          "previous_year_rating_avg_training_score_perc", "department_avg_training_score_perc",
          "department_region_avg_training_score_perc", "department_education_avg_training_score_perc",
          "awards_won_avg_training_score_perc", "kpi_avg_training_score_perc",
          "region_avg_training_score_perc", "gender_avg_training_score_perc",
          "kpi_award_avg_training_score_perc", "employee_rating_avg_training_score_perc"]
fe_cols

['no_of_trainings',
 'age',
 'previous_year_rating',
 'length_of_service',
 'KPIs_met >80%',
 'awards_won?',
 'avg_training_score',
 'new_employee',
 'previous_year_rating_avg_training_score_perc',
 'department_avg_training_score_perc',
 'department_region_avg_training_score_perc',
 'department_education_avg_training_score_perc',
 'awards_won_avg_training_score_perc',
 'kpi_avg_training_score_perc',
 'region_avg_training_score_perc',
 'gender_avg_training_score_perc',
 'kpi_award_avg_training_score_perc',
 'employee_rating_avg_training_score_perc']

In [16]:
model = DecisionTreeClassifier(max_depth=6)
model.fit(x_train[fe_cols], x_train["is_promoted"])

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=6,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [17]:
train_pred = model.predict_proba(x_train[fe_cols])[:, 1]
print("train_auc:", roc_auc_score(x_train["is_promoted"], train_pred))

train_auc: 0.9003824937134036


In [18]:
for m in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
    print(m, ":", f1_score(x_train["is_promoted"], [1 if i> m else 0 for i in train_pred]))

0.1 : 0.3578033744321869
0.2 : 0.5186478732693027
0.3 : 0.5183262064752596
0.4 : 0.5154096329564813
0.5 : 0.5086559647466163
0.6 : 0.4999202424629128
0.7 : 0.4999202424629128
0.8 : 0.49671947511601855
0.9 : 0.49671947511601855


In [19]:
valid_pred = model.predict_proba(x_valid[fe_cols])[:, 1]
valid_pred = [1 if i>0.3 else 0 for i in valid_pred]

In [20]:
sub = pd.DataFrame([x_valid["employee_id"].values, valid_pred]).T
sub.columns = ["employee_id", "is_promoted"]

In [21]:
sub.to_csv("sub1.csv", index=False)

In [22]:
from sklearn.ensemble import RandomForestClassifier

In [23]:
model = RandomForestClassifier(max_depth=6, max_features=6, n_estimators=100)
model.fit(x_train[fe_cols], x_train["is_promoted"])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=6, max_features=6, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [24]:
train_pred = model.predict_proba(x_train[fe_cols])[:, 1]
print("train_auc:", roc_auc_score(x_train["is_promoted"], train_pred))

train_auc: 0.9038390215195224


In [25]:
for m in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
    print(m, ":", f1_score(x_train["is_promoted"], [1 if i> m else 0 for i in train_pred]))

0.1 : 0.346876197776926
0.2 : 0.5302193913658881
0.3 : 0.5239548367409216
0.4 : 0.516822429906542
0.5 : 0.5100628930817611
0.6 : 0.5013477088948787
0.7 : 0.4976870314244697
0.8 : 0.4880645161290322
0.9 : 0.44032715740277084


In [26]:
valid_pred = model.predict_proba(x_valid[fe_cols])[:, 1]
valid_pred = [1 if i>0.3 else 0 for i in valid_pred]

In [27]:
sub1 = pd.DataFrame([x_valid["employee_id"].values, valid_pred]).T
sub1.columns = ["employee_id", "is_promoted"]

In [28]:
sub1.to_csv("sub2.csv", index=False)