In [335]:
import numpy as np
import pandas as pd
import math

#Model
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import cross_validate, GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score
from sklearn.feature_selection import RFE

#Feature engineering, pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler
from feature_engine.encoding import OneHotEncoder, CountFrequencyEncoder, OrdinalEncoder, RareLabelEncoder, MeanEncoder
from feature_engine import imputation as mdi
from feature_engine.discretisation import EqualFrequencyDiscretiser
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

#xgboost
import xgboost as xgb

In [291]:
# Load data
test = pd.read_csv('data_test_clean.csv')
#test=pd.read_csv('z_test.csv')

In [292]:
train = pd.read_csv('data_train_clean.csv')
#train = pd.read_csv('z_train.csv')

In [293]:
test.shape, train.shape

((3832, 11), (9995, 12))

In [294]:
# Manage data on train set
y_train = train['target']
X_train = train.drop(['enrollee_id', 'target'], axis=1)

# Manage data on test set
X_test = test.drop('enrollee_id', axis = 1)

In [295]:
y_train.shape, X_train.shape, X_test.shape

((9995,), (9995, 10), (3832, 10))

In [296]:
#RareLabelEncoder with feature "city"

rare_encoder = RareLabelEncoder(tol = 0, n_categories = 5, variables = 'city', max_n_categories = 5, replace_with = 'Other')

In [297]:
X_train=rare_encoder.fit_transform(X_train)
X_test=rare_encoder.transform(X_test)

In [298]:
X_train.city.unique(), X_test.city.unique()

(array(['city_103', 'Other', 'city_114', 'city_21', 'city_160', 'city_16'],
       dtype=object),
 array(['city_160', 'Other', 'city_103', 'city_114', 'city_21', 'city_16'],
       dtype=object))

In [299]:
#Null Imputation

#imputer = IterativeImputer()

In [300]:
#nulls = X_test.columns[X_test.isnull().any()].tolist()
#nulls

In [301]:
#for i in nulls:
    #X_train[i] = imputer.fit_transform(X_train[[i]])
    #X_train[i] = round(X_train[i])

In [302]:
#OneHotEncoder with features 'relevent_experience', 'gender', 'city', 'enrolled_university', 'education_level', 'major_discipline'

one_hot_encoder = OneHotEncoder(variables=['relevent_experience', 'gender', 'city', 'enrolled_university', 'education_level', 'major_discipline'], drop_last=False)

In [303]:
X_train=one_hot_encoder.fit_transform(X_train)
X_test=one_hot_encoder.transform(X_test)

In [304]:
#experience feature: Change to numeric

X_train['experience']=np.where(X_train['experience'] =='>20', '20', X_train['experience'])
X_train['experience']=np.where(X_train['experience'] =='<1', '0', X_train['experience'])

In [306]:
X_test['experience']=np.where(X_test['experience'] =='>20', '20', X_test['experience'])
X_test['experience']=np.where(X_test['experience'] =='<1', '0', X_test['experience'])

In [307]:
X_train['experience']=pd.to_numeric(X_train['experience'], errors='coerce')

In [308]:
X_test['experience']=pd.to_numeric(X_test['experience'], errors='coerce')

In [309]:
#last new job feature: Change to numeric

X_train['last_new_job']=np.where(X_train['last_new_job'] =='>4', '5', X_train['last_new_job'])
X_train['last_new_job']=np.where(X_train['last_new_job'] =='never', '0', X_train['last_new_job'])

In [310]:
X_test['last_new_job']=np.where(X_test['last_new_job'] =='>4', '5', X_test['last_new_job'])
X_test['last_new_job']=np.where(X_test['last_new_job'] =='never', '0', X_test['last_new_job'])

In [311]:
X_train['last_new_job']=pd.to_numeric(X_train['last_new_job'], errors='coerce')

In [312]:
X_test['last_new_job']=pd.to_numeric(X_test['last_new_job'], errors='coerce')

In [313]:
X_train.dtypes

city_development_index                         float64
experience                                       int64
last_new_job                                     int64
training_hours                                   int64
relevent_experience_Has relevent experience      int32
relevent_experience_No relevent experience       int32
gender_Male                                      int32
gender_Female                                    int32
gender_Other                                     int32
city_city_103                                    int32
city_Other                                       int32
city_city_114                                    int32
city_city_21                                     int32
city_city_160                                    int32
city_city_16                                     int32
enrolled_university_no_enrollment                int32
enrolled_university_Part time course             int32
enrolled_university_Full time course             int32
education_

In [314]:
X_test.dtypes

city_development_index                         float64
experience                                       int64
last_new_job                                     int64
training_hours                                   int64
relevent_experience_Has relevent experience      int32
relevent_experience_No relevent experience       int32
gender_Male                                      int32
gender_Female                                    int32
gender_Other                                     int32
city_city_103                                    int32
city_Other                                       int32
city_city_114                                    int32
city_city_21                                     int32
city_city_160                                    int32
city_city_16                                     int32
enrolled_university_no_enrollment                int32
enrolled_university_Part time course             int32
enrolled_university_Full time course             int32
education_

In [317]:
# Normalize data
scaler = RobustScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [318]:
# GridSearchCV with XGBoost model
xgb_model = xgb.XGBClassifier(random_state=42)
param_grid = {'objective':['binary:logistic'],
              'learning_rate': [0.001,0.05,0.1, 10], 
              'max_depth': [2,3,4,5,6],
              'min_child_weight': [11],
              'subsample': [0.8],
              'colsample_bytree': [0.7],
              'n_estimators': [1000]}
#

In [319]:
grid = GridSearchCV(estimator = xgb_model, cv=5, param_grid = param_grid , scoring = 'accuracy', verbose = 1, n_jobs = -1, refit=True)
grid.fit(X_train,y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  8.6min finished




GridSearchCV(cv=5,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=100, n_jobs...
                                     reg_alpha=None, reg_lambda=None,
                                     scale_pos_weight=None, subsample=None,
                                     tree_method=None, validate_parameters=None,
                                     verbosity=None),
  

In [320]:
print("Best Score:" + str(grid.best_score_))

Best Score:0.7902951475737868


In [321]:
print("Best Parameters: " + str(grid.best_params_))

Best Parameters: {'colsample_bytree': 0.7, 'learning_rate': 0.05, 'max_depth': 2, 'min_child_weight': 11, 'n_estimators': 1000, 'objective': 'binary:logistic', 'subsample': 0.8}


In [322]:
best_parameters = grid.best_params_
best_parameters

{'colsample_bytree': 0.7,
 'learning_rate': 0.05,
 'max_depth': 2,
 'min_child_weight': 11,
 'n_estimators': 1000,
 'objective': 'binary:logistic',
 'subsample': 0.8}

In [323]:
# XGBoost model with RFE
xgb_model = xgb.XGBClassifier(**best_parameters)
xgb_model.fit(X_train,y_train)

selector = RFE(xgb_model, 200, step=1)
selector.fit(X_train,y_train)







RFE(estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                            colsample_bylevel=1, colsample_bynode=1,
                            colsample_bytree=0.7, gamma=0, gpu_id=-1,
                            importance_type='gain', interaction_constraints='',
                            learning_rate=0.05, max_delta_step=0, max_depth=2,
                            min_child_weight=11, missing=nan,
                            monotone_constraints='()', n_estimators=1000,
                            n_jobs=4, num_parallel_tree=1, random_state=0,
                            reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
                            subsample=0.8, tree_method='exact',
                            validate_parameters=1, verbosity=None),
    n_features_to_select=200)

In [324]:
#predict results
y_predict = selector.predict(X_test)

print(y_predict)

[0. 0. 0. ... 0. 0. 0.]


In [325]:
len(y_predict)

3832

In [331]:
y_predict_train = selector.predict(X_train)

In [332]:
confusion_matrix(y_train, y_predict_train)

array([[7042,  586],
       [1428,  939]], dtype=int64)

In [336]:
auc=roc_auc_score(y_train, y_predict_train)
print(auc)

0.6599412278025791


Preparation of submission file

In [326]:
my_submission = pd.DataFrame({'enrollee_id': test.enrollee_id, 'target': y_predict})
my_submission

Unnamed: 0,enrollee_id,target
0,23603,0.0
1,22499,0.0
2,10465,0.0
3,8293,0.0
4,4246,0.0
...,...,...
3827,8880,0.0
3828,7886,0.0
3829,12279,0.0
3830,5326,0.0


In [327]:
my_submission.target.value_counts()

0.0    3651
1.0     181
Name: target, dtype: int64

In [328]:
my_submission.shape[0]

3832

In [329]:
# you could use any filename. We choose submission here
my_submission.to_csv('submission.csv', index=False)

In [None]:
#final score: 53%