In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas_profiling
import sklearn.metrics
from tqdm import tqdm
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate, StratifiedKFold
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import power_transform

import lightgbm as lgbm
import xgboost
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import roc_auc_score, classification_report, mean_squared_error, log_loss, make_scorer

import optuna
import optuna.integration.lightgbm as lgb

%matplotlib inline
plt.rcParams["figure.figsize"] = (15, 8)
pd.options.display.float_format = '{:.2f}'.format

In [2]:
df = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')

In [3]:
# Делаю все названия признаков, с маленькой буквы
for col in df.columns:
    if col != 'Transported':
        df[col.lower()] = df[col]
        df_test[col.lower()] = df_test[col]
        df.drop(col, axis=1, inplace=True)
        df_test.drop(col, axis=1, inplace=True)
    else:
        df[col.lower()] = df[col]
        df.drop(col, axis=1, inplace=True)
        df_test[col.lower()] = df_test[col]
        df_test.drop(col, axis=1, inplace=True)

KeyError: 'Transported'

In [4]:
numeric_df = df.select_dtypes(include=[np.number]) # Список с числовыми признаками
not_numeric_df = df.select_dtypes(exclude=[np.number]) # Список с нечисловыми признаками

In [5]:
# Заменяем все пропущенные значения в числовых столбцах на медианное значение по столбцу
df[numeric_df.columns] = df[numeric_df.columns].fillna(df.median())
df_test[numeric_df.columns] = df_test[numeric_df.columns].fillna(df_test.median())

# Заменяем все пропущенные значения в нечисловых столбцах на чаще встречающееся значение по столбцу
not_numeric_df = not_numeric_df.drop('transported', axis=1)
for col in not_numeric_df:
    df[col] = df[col].fillna(df[col].describe().top)
    df_test[col] = df_test[col].fillna(df_test[col].describe().top)

  df[numeric_df.columns] = df[numeric_df.columns].fillna(df.median())
  df_test[numeric_df.columns] = df_test[numeric_df.columns].fillna(df_test.median())


In [6]:
# Меняем тип данных для некоторых столбцов из тренировочной и тестовой выборки
df['cryosleep'] = df['cryosleep'].astype('bool')
df['age'] = df['age'].astype('int')
df['vip'] = df['vip'].astype('bool')

df_test['cryosleep'] = df_test['cryosleep'].astype('bool')
df_test['age'] = df_test['age'].astype('float64')
df_test['vip'] = df_test['vip'].astype('bool')

In [7]:
# Удаление выбросов
for col in numeric_df:
    low = df[col].quantile(0.25)
    high = df[col].quantile(0.95)
    iqr = high - low
    df[df[col] > (high + 1.5*iqr)] = np.nan
    df[df[col] < (low - 1.5*iqr)] = np.nan
    df = df[~np.isnan(df[col])]
    
    df_test[df_test[col] > (high + 1.5*iqr)] = np.nan
    df_test[df_test[col] < (low - 1.5*iqr)] = np.nan
    df_test = df_test[~np.isnan(df_test[col])]

In [8]:
# Заменя все False и True на 0 и 1
for col in df.columns:
    if True in df[col].values:
        df[col] = df[col].replace({False: 0, True: 1})
        if col != 'transported':
            df_test[col] = df_test[col].replace({False: 0, True: 1})

In [9]:
label = LabelEncoder()
onehot = OneHotEncoder()

In [10]:
df_label = label.fit_transform(df['cabin'])

In [11]:
df_label = label.fit_transform(df['cabin'])
df_test_label = label.fit_transform(df_test['cabin'])
df_onehot = onehot.fit_transform(df[['homeplanet', 'destination']])
df_test_onehot = onehot.fit_transform(df_test[['homeplanet', 'destination']])

In [12]:
columns_onehot = [*onehot.categories_[0], *onehot.categories_[1]]
data_onehot = pd.DataFrame(df_onehot.toarray(), columns=columns_onehot)
data_test_onehot = pd.DataFrame(df_test_onehot.toarray(), columns=columns_onehot)

data_label = pd.DataFrame(df_label, columns=['cabine'])
data_test_label = pd.DataFrame(df_test_label, columns=['cabine'])

In [13]:
df.drop(['cabin', 'homeplanet', 'destination'], axis=1, inplace=True)
df_test.drop(['cabin', 'homeplanet', 'destination'], axis=1, inplace=True)

In [14]:
df = pd.concat([df, data_label, data_onehot], axis=1, join='inner')
df_test = pd.concat([df_test, data_test_label, data_test_onehot], axis=1, join='inner')

In [15]:
# Меняем тип данных для преобразованных категориальных признаков
for col in ['cabine', 'Earth', 'Europa', 'Mars', '55 Cancri e', 'PSO J318.5-22', 'TRAPPIST-1e']:
    df[col] = df[col].astype('int')
    df_test[col] = df_test[col].astype('int')

In [16]:
pass_id_df_test = df_test['passengerid']

In [17]:
data = df.drop(['name', 'passengerid', 'transported'], axis=1)
data_test = df_test.drop('name', axis=1)

In [18]:
# Стандартизируем
scaler = StandardScaler()
data = scaler.fit_transform(data)
scaler.fit_transform(data_test)
data = pd.DataFrame(data, columns=['cryosleep', 'age', 'vip', 'roomservice',
       'foodcourt', 'shoppingmall', 'spa', 'vrdeck',
       'cabine', 'Earth', 'Europa', 'Mars', '55 Cancri e',
       'PSO J318.5-22', 'TRAPPIST-1e'])
data = data.abs()

data_test = pd.DataFrame(data_test, columns=['cryosleep', 'age', 'vip', 'roomservice',
                                   'foodcourt', 'shoppingmall', 'spa', 'vrdeck',
                                   'cabine', 'Earth', 'Europa', 'Mars', '55 Cancri e',
                                   'PSO J318.5-22', 'TRAPPIST-1e'])
data_test = data_test.abs()

In [19]:
# Оценка важности признаков
bestfeatures = SelectKBest(score_func=chi2, k='all')
fit = bestfeatures.fit(data, df['transported'])
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(data.columns)

In [20]:
featureScores = pd.concat([dfcolumns, dfscores], axis=1)
featureScores.columns = ['Specs', 'Score']

In [21]:
data_box_trans = power_transform(data[['roomservice', 'spa', 'vrdeck', 'foodcourt', 'shoppingmall']], method='box-cox')
data[['roomservice', 'spa', 'vrdeck', 'foodcourt', 'shoppingmall']] = data_box_trans

In [22]:
data_test_box_trans = power_transform(data_test[['roomservice', 'spa', 'vrdeck', 'foodcourt', 'shoppingmall']], method='box-cox')
data_test[['roomservice', 'spa', 'vrdeck', 'foodcourt', 'shoppingmall']] = data_test_box_trans

ValueError: The Box-Cox transformation can only be applied to strictly positive data

In [160]:
x_train, x_test, y_train, y_test = train_test_split(data, df['transported'], test_size=0.33)

<h2>Logistic Regression</h2>

In [54]:
def log_reg(trial, x_train, y_train):

    params_grid = {
        'penalty': trial.suggest_categorical('penalty', ['l2', 'none']),
        'C': trial.suggest_float('C', 0.01, 0.1)
    }

    cv = StratifiedKFold(n_splits=5, shuffle=True)
    log = LogisticRegression(**params_grid)
    scores = cross_validate(log, x_train, y_train, cv=cv, scoring=make_scorer(log_loss))

    return np.mean(scores['test_score'])

In [55]:
study = optuna.create_study(direction='maximize')
func = (lambda trial: log_reg(trial, x_train, y_train))
study.optimize(func, n_trials=100)
model_scores = []
model_scores.append(study.best_value)

[32m[I 2022-10-05 03:58:05,735][0m A new study created in memory with name: no-name-d5639da7-9a68-4bde-90b0-eafe80d44b80[0m
[32m[I 2022-10-05 03:58:05,812][0m Trial 0 finished with value: 9.411650800781976 and parameters: {'penalty': 'none', 'C': 0.031247429604320094}. Best is trial 0 with value: 9.411650800781976.[0m
[32m[I 2022-10-05 03:58:05,888][0m Trial 1 finished with value: 9.205912364254486 and parameters: {'penalty': 'l2', 'C': 0.013654483859853652}. Best is trial 0 with value: 9.411650800781976.[0m
[32m[I 2022-10-05 03:58:05,958][0m Trial 2 finished with value: 9.34308916135252 and parameters: {'penalty': 'l2', 'C': 0.023650777278719434}. Best is trial 0 with value: 9.411650800781976.[0m
[32m[I 2022-10-05 03:58:06,025][0m Trial 3 finished with value: 9.363632323332277 and parameters: {'penalty': 'none', 'C': 0.0829544144441191}. Best is trial 0 with value: 9.411650800781976.[0m
[32m[I 2022-10-05 03:58:06,105][0m Trial 4 finished with value: 9.521408182766658 

<h2>KNN</h2>

In [24]:
def knn_obj(trial, x, y):

    params_grid = {
        'n_neighbors': trial.suggest_int('n_neighbors', 1, 10),
        'weights': trial.suggest_categorical('weights', ['uniform', 'distance'])
    }
    cv = StratifiedKFold(n_splits=5, shuffle=True)
    knn = KNeighborsClassifier(**params_grid)
    scores = cross_validate(knn, x_train, y_train, scoring=make_scorer(roc_auc_score), cv=cv)

    return np.mean(scores['test_score'])

In [25]:
study = optuna.create_study(direction='maximize')
func = (lambda trial: knn_obj(trial, x_train, y_train))
study.optimize(func, n_trials=100)

[32m[I 2022-10-05 02:46:29,081][0m A new study created in memory with name: no-name-f6f82bfd-6163-4f38-b57c-c331174555ec[0m
[32m[I 2022-10-05 02:46:29,555][0m Trial 0 finished with value: 0.7221369358535324 and parameters: {'n_neighbors': 6, 'weights': 'uniform'}. Best is trial 0 with value: 0.7221369358535324.[0m
[32m[I 2022-10-05 02:46:29,935][0m Trial 1 finished with value: 0.7000878543589801 and parameters: {'n_neighbors': 4, 'weights': 'distance'}. Best is trial 0 with value: 0.7221369358535324.[0m
[32m[I 2022-10-05 02:46:30,337][0m Trial 2 finished with value: 0.7239025158832345 and parameters: {'n_neighbors': 10, 'weights': 'distance'}. Best is trial 2 with value: 0.7239025158832345.[0m
[32m[I 2022-10-05 02:46:30,673][0m Trial 3 finished with value: 0.6674504000091035 and parameters: {'n_neighbors': 1, 'weights': 'distance'}. Best is trial 2 with value: 0.7239025158832345.[0m
[32m[I 2022-10-05 02:46:31,063][0m Trial 4 finished with value: 0.7257364936770874 and 

In [26]:
model_scores.append(study.best_value)
model_scores

[0.730731128849081, 0.7508930921270341]

<h2>SVM</h2>

In [27]:
def svc_obj(trial, x, y):

    params_grid = {
        'C': trial.suggest_float('C', 0.01, 1),
        'kernel': trial.suggest_categorical('kernel', ['linear', 'rbf', 'sigmoid']),
        'gamma': trial.suggest_categorical('gamma', ['scale', 'auto'])
    }

    cv = StratifiedKFold(n_splits=5, shuffle=True)
    svc = SVC(**params_grid)
    scores = cross_validate(svc, x_train, y_train, scoring=make_scorer(roc_auc_score), cv=cv)

    return np.mean(scores['test_score'])

In [28]:
study = optuna.create_study(direction='maximize')
funcq = (lambda trial: svc_obj(trial, x_train, y_train))
study.optimize(funcq, n_trials=100)

[32m[I 2022-10-05 02:47:20,386][0m A new study created in memory with name: no-name-33a5e28b-4453-44ae-8e69-bcbf79c8db17[0m
[32m[I 2022-10-05 02:47:24,749][0m Trial 0 finished with value: 0.7584800458831957 and parameters: {'C': 0.6320673538527167, 'kernel': 'rbf', 'gamma': 'scale'}. Best is trial 0 with value: 0.7584800458831957.[0m
[32m[I 2022-10-05 02:47:27,463][0m Trial 1 finished with value: 0.7310348359700678 and parameters: {'C': 0.05304272264760601, 'kernel': 'linear', 'gamma': 'auto'}. Best is trial 0 with value: 0.7584800458831957.[0m
[32m[I 2022-10-05 02:47:31,741][0m Trial 2 finished with value: 0.7556015177637797 and parameters: {'C': 0.9112141769170116, 'kernel': 'rbf', 'gamma': 'scale'}. Best is trial 0 with value: 0.7584800458831957.[0m
[32m[I 2022-10-05 02:47:35,969][0m Trial 3 finished with value: 0.7511875206182917 and parameters: {'C': 0.3284561216887394, 'kernel': 'rbf', 'gamma': 'scale'}. Best is trial 0 with value: 0.7584800458831957.[0m
[32m[I 20

In [29]:
model_scores.append(study.best_value)

<h2>Random Forest</h2>

In [30]:
def rf_obj(trial, x, y):

    params_grid = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000, step=100),
        'max_features': trial.suggest_categorical('max_features', ['log2', 'sqrt']),
        'max_depth': trial.suggest_int('max_depth', 1, 15),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 2, 50, step=10)
    }

    rf = RandomForestClassifier(**params_grid)
    cv = StratifiedKFold(n_splits=5, shuffle=True)
    scores = cross_validate(rf, x, y, scoring=make_scorer(roc_auc_score), cv=cv)

    return np.mean(scores['test_score'])

In [31]:
study = optuna.create_study(direction='maximize')
func = lambda trial: rf_obj(trial, x_train, y_train)
study.optimize(func, n_trials=100)

[32m[I 2022-10-05 02:54:44,595][0m A new study created in memory with name: no-name-7d35eb69-11bb-4c56-aefb-fb5dc00219b7[0m
[32m[I 2022-10-05 02:54:58,216][0m Trial 0 finished with value: 0.7658831864497534 and parameters: {'n_estimators': 1000, 'max_features': 'sqrt', 'max_depth': 12, 'min_samples_leaf': 12}. Best is trial 0 with value: 0.7658831864497534.[0m
[32m[I 2022-10-05 02:55:01,673][0m Trial 1 finished with value: 0.7670106675787067 and parameters: {'n_estimators': 300, 'max_features': 'sqrt', 'max_depth': 8, 'min_samples_leaf': 32}. Best is trial 1 with value: 0.7670106675787067.[0m
[32m[I 2022-10-05 02:55:08,211][0m Trial 2 finished with value: 0.7667710495915152 and parameters: {'n_estimators': 600, 'max_features': 'sqrt', 'max_depth': 7, 'min_samples_leaf': 42}. Best is trial 1 with value: 0.7670106675787067.[0m
[32m[I 2022-10-05 02:55:15,836][0m Trial 3 finished with value: 0.7689548185484663 and parameters: {'n_estimators': 600, 'max_features': 'log2', 'max

In [32]:
model_scores.append(study.best_value)

In [33]:
model_scores

[0.730731128849081, 0.7508930921270341, 0.7609686755514365, 0.7759932305468273]

<h2>Light GBM</h2>

In [None]:
from sklearn.metrics import accuracy_score

def objective(trial):
    train_x, test_x, train_y, test_y = train_test_split(data, df['transported'], test_size=0.25)
    dtrain = lgbm.Dataset(train_x, label=train_y)
    dtest = lgbm.Dataset(test_x, label=test_y)

    param = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
        'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
    }

    gbm = lgbm.train(param, dtrain, valid_sets=[dtrain, dtest])
    preds = gbm.predict(test_x)
    pred_labels = np.rint(preds)
    accuracy = roc_auc_score(test_y, pred_labels)
    return accuracy

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

In [35]:
model_scores.append(study.best_value)

In [37]:
pd.DataFrame(data=model_scores, index=['LogisticRegression', 'KNN', 'SVM', 'RF', 'LGBM'])

Unnamed: 0,0
LogisticRegression,0.73
KNN,0.75
SVM,0.76
RF,0.78
LGBM,0.8


In [163]:
from sklearn.metrics import log_loss

dtrain = lgbm.Dataset(x_train, label=y_train)
dtest = lgbm.Dataset(data_test)
model = lgbm.train(study.best_params, dtrain,  valid_sets=[dtrain, dtest])

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1629
[LightGBM] [Info] Number of data points in the train set: 5035, number of used features: 15
[LightGBM] [Info] Start training from score 0.523932
[1]	training's l2: 0.235748	valid_1's l2: 0.252627
[2]	training's l2: 0.222474	valid_1's l2: 0.220651
[3]	training's l2: 0.213081	valid_1's l2: 0.199954
[4]	training's l2: 0.203894	valid_1's l2: 0.181764
[5]	training's l2: 0.196313	valid_1's l2: 0.155355
[6]	training's l2: 0.190529	valid_1's l2: 0.131483
[7]	training's l2: 0.184359	valid_1's l2: 0.121427
[8]	training's l2: 0.180448	valid_1's l2: 0.118953
[9]	training's l2: 0.176335	valid_1's l2: 0.10718
[10]	training's l2: 0.172943	valid_1's l2: 0.0979851
[11]	training's l2: 0.170446	valid_1's l2: 0.0878547
[12]	training's l2: 0.167724	valid_1's l2: 0.07893
[13]	training's l2: 0.165609	valid_1's l2: 0.0840766
[14]	training's l2: 0.164055	valid_1's l2: 0.0890355
[15]	training's l2: 0.162068	valid_1's l2:

In [165]:
preds = model.predict(data_test)
len(np.rint(preds))

3735

In [44]:
study.best_params

{'lambda_l1': 0.0005715022633567332,
 'lambda_l2': 0.24797074796683777,
 'num_leaves': 74,
 'feature_fraction': 0.837113696668609,
 'bagging_fraction': 0.8877717337699274,
 'bagging_freq': 5,
 'min_child_samples': 5}