<h3>Import nessesary libraries</h3>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE, SMOTENC
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.exceptions import DataConversionWarning
import warnings
warnings.filterwarnings(action='ignore', category=DataConversionWarning)
warnings.filterwarnings(action='ignore', category=FutureWarning)
np.random.seed(42)

<h3>Data preprocessing</h3>

In [2]:
df_train = pd.read_excel('Task_Python.xlsx', sheet_name='Train')
df_test = pd.read_excel('Task_Python.xlsx', sheet_name='Test')

In [3]:
X_train_pre = df_train.drop(['y'], axis=1)
y_train_pre = np.where(df_train.y == 'yes', 1, 0)
X_test_pre = df_test
columns = X_train_pre.columns
# We will use list of categorical variables later
categorical_features = []
is_categorical = X_train_pre.dtypes == 'object'
for col in X_train_pre.columns:
    if is_categorical[col]: categorical_features.append(col)

In [10]:
X_train = X_train_pre.reset_index(drop=True)
y_train = y_train_pre
# Encoding for categorical variables, as most models don't work with strings
for feature in categorical_features:
    le = LabelEncoder()
    X_train[feature]  = le.fit_transform(X_train[feature])
# Standartization, as most models are sensetive to magnitude of variables
Scaler = StandardScaler()
X_train = Scaler.fit_transform(X_train)
# Our data is imbalanced (1:9). So, we use SMOTE to synthesize more class 1 observations
smote = SMOTE()
X_train, y_train = smote.fit_sample(X_train, y_train)
X_train = pd.DataFrame(X_train, columns = columns)

<h1>Modeling</h1>

In [5]:
model_list = [
            ("Logistic_Regression", LogisticRegression()),
            ("SVC", SVC()),
            ("LDA", LinearDiscriminantAnalysis()),
            ("QDA", QuadraticDiscriminantAnalysis()),
            ("RF", RandomForestClassifier()),
            ("GBC", GradientBoostingClassifier()),
            ("XGB", XGBClassifier()),
            ("LGBM", LGBMClassifier())
        ]  
    
scoring = 'f1'
n_folds = 5
results, names  = [], [] 

for name, model  in model_list:
    kfold = StratifiedKFold(n_splits=n_folds, shuffle=True)
    cv_results = cross_val_score(model, X_train, y_train, cv= kfold,
                                 scoring=scoring, n_jobs=-1)    
    names.append(name)
    results.append(cv_results)    
    print(f'{name}: {cv_results.mean()} (+/- {cv_results.std()})')
    
results = pd.DataFrame(np.array(results).T, columns = names)

Logistic_Regression: 0.8859845275736029 (+/- 0.008221723381851141)
SVC: 0.929592859445572 (+/- 0.009276817484059006)
LDA: 0.8630721267393469 (+/- 0.010367153357787106)
QDA: 0.7859535879998627 (+/- 0.015800058660551727)
RF: 0.9449092767354174 (+/- 0.004090688529509977)
GBC: 0.9404462318529934 (+/- 0.007534999473692339)
XGB: 0.9377444149326349 (+/- 0.00908735878917265)
LGBM: 0.9533375535498585 (+/- 0.008795804097341606)


<h5>Cat Boosting</h5><br>This model has its own specific requirements (categotical variables can be left alone and passed to a model without encoding)

In [6]:
X_train = X_train_pre.reset_index(drop=True)
y_train = y_train_pre
for col in X_train.columns:
    if col not in categorical_features:
        Scaler = StandardScaler()
        X_train[col] = Scaler.fit_transform(X_train[col].values.reshape(-1,1))

smotenc = SMOTENC(categorical_features=[X_train.columns.get_loc(i) for i in categorical_features])
X_train, y_train = smotenc.fit_sample(X_train, y_train)
X_train = pd.DataFrame(X_train, columns = columns)

In [7]:
model = CatBoostClassifier(cat_features=categorical_features)

scoring = 'f1'
n_folds = 5

kfold = StratifiedKFold(n_splits=n_folds, shuffle=True)
cv_results = cross_val_score(model, X_train, y_train, cv= kfold,
                             scoring=scoring, n_jobs=-1)    

print(f'Cat_Boosting: {cv_results.mean()} (+/- {cv_results.std()})')

Cat_Boosting: 0.9415684137438074 (+/- 0.005140694316555464)


<h3>Regularization of the best model</h3><h5>Light gradient boosting machine hyperparameters tuning</h5>

In [11]:
model = LGBMClassifier()

param_grid_et =  {
        'n_estimators': [100, 300, 500],
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [5, 10, 15],
        'num_leaves': [200, 500, 900]
               }

grid_rf = GridSearchCV(estimator=model,
                               param_grid= param_grid_et,
                               scoring= 'f1',
                               cv = StratifiedKFold(n_splits=3, shuffle=True),
                               n_jobs = 4,
                               verbose = 10)
# Fit grid
grid_rf.fit(X_train, y_train)

# Best score and best parameters
print('-------Best score----------')
print(grid_rf.best_score_ )
print('-------Best params----------')
print(grid_rf.best_params_)

Fitting 3 folds for each of 81 candidates, totalling 243 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:    2.2s
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    3.1s
[Parallel(n_jobs=4)]: Done  17 tasks      | elapsed:    4.7s
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:    7.8s
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:   10.3s
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   17.3s
[Parallel(n_jobs=4)]: Done  53 tasks      | elapsed:   31.7s
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed:   40.4s
[Parallel(n_jobs=4)]: Done  77 tasks      | elapsed:  1.0min
[Parallel(n_jobs=4)]: Done  90 tasks      | elapsed:  1.1min
[Parallel(n_jobs=4)]: Done 105 tasks      | elapsed:  1.2min
[Parallel(n_jobs=4)]: Done 120 tasks      | elapsed:  1.4min
[Parallel(n_jobs=4)]: Done 137 tasks      | elapsed:  1.7min
[Parallel(n_jobs=4)]: Done 154 tasks      | elapsed:  2.0min
[Parallel(n_jobs=4)]: Done 173 tasks      | elapsed:  2.2min
[Parallel(

-------Best score----------
0.9588701833371437
-------Best params----------
{'learning_rate': 0.1, 'max_depth': 15, 'n_estimators': 300, 'num_leaves': 200}


<h1>Prediction</h1><br>

In [15]:
X_test = X_test_pre.reset_index(drop=True)
for feature in categorical_features:
    le = LabelEncoder()
    X_test[feature]  = le.fit_transform(X_test[feature])
X_test = pd.DataFrame(Scaler.transform(X_test), columns = columns)

In [17]:
# Our final model
model = LGBMClassifier(
    learning_rate=0.1,
    max_depth=15,
    n_estimators=300,
    num_leaves=200
)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)