In [487]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
import pandas as pd

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from category_encoders import TargetEncoder

from sklearn.metrics import mean_squared_error as mse

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings("ignore")

In [488]:
# Load the training and test tables from CSV files in the working directory.
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

# Ordered list of preprocessing functions; it is reset here so a full re-run starts empty.
transformer_list = []
# Display the raw training frame.
df_train

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False


#### Обработка датасетов

In [489]:
# Most frequent HomePlanet / Destination in the training data, used as fill values.
homeplanet_top = df_train['HomePlanet'].describe()['top']
destination_top = df_train['Destination'].describe()['top']


def fillna_string(df: pd.DataFrame):
    """Fill missing HomePlanet/Destination values and replace the known names with integer codes.

    Missing entries receive the module-level ``homeplanet_top`` / ``destination_top``.
    Codes: Earth=0, Mars=1, Europa=2 and 55 Cancri e=0, PSO J318.5-22=1, TRAPPIST-1e=2.
    Values outside these lists are left as they are. Returns a modified copy.
    """
    out = df.copy()

    fill_values = {'HomePlanet': homeplanet_top, 'Destination': destination_top}
    labels_by_column = {
        'HomePlanet': ('Earth', 'Mars', 'Europa'),
        'Destination': ('55 Cancri e', 'PSO J318.5-22', 'TRAPPIST-1e'),
    }

    for column, labels in labels_by_column.items():
        out[column] = out[column].fillna(fill_values[column])
        # The position of a label in its tuple is its integer code.
        for code, label in enumerate(labels):
            out[column] = out[column].replace(label, code)

    return out


# Register the step once (identity check against the function object).
if fillna_string not in transformer_list:
    transformer_list.append(fillna_string)

In [490]:
# Most frequent CryoSleep / VIP value in the training data, used as fill values.
cryosleep_top = df_train['CryoSleep'].describe()['top']
vip_top = df_train['VIP'].describe()['top']


def fillna_bool(df: pd.DataFrame):
    """Fill missing CryoSleep/VIP values with the training mode and coerce every entry with ``bool``.

    Returns a modified copy; the input frame is left untouched.
    """
    out = df.copy()

    for column, fill_value in (('CryoSleep', cryosleep_top), ('VIP', vip_top)):
        out[column] = out[column].fillna(fill_value).apply(bool)

    return out


# Register the step once (identity check against the function object).
if fillna_bool not in transformer_list:
    transformer_list.append(fillna_bool)

In [491]:
# Mean passenger age in the training data, used as the fill value for Age.
age_mean = df_train['Age'].mean()


def fillna_float(df: pd.DataFrame):
    """Fill missing numeric values: Age with the training mean, spending columns with 0.

    Returns a modified copy; the input frame is left untouched.
    """
    out = df.copy()

    out['Age'] = out['Age'].fillna(age_mean)

    for column in ('RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'):
        out[column] = out[column].fillna(0)

    return out


# Register the step once (identity check against the function object).
if fillna_float not in transformer_list:
    transformer_list.append(fillna_float)

In [492]:
def split_passenger(df: pd.DataFrame):
    """Split ``PassengerId`` into a numeric group number and an in-group index.

    Adds two float columns: ``Group_number`` (the text before the first ``_``)
    and ``Group_index`` (the text after it). A missing id yields NaN in both.
    Returns a modified copy; the input frame is left untouched.
    """
    df = df.copy()

    # Vectorised split. The previous ``Series.agg(lambda ...)`` form only worked
    # because agg fell back to element-wise application, which pandas has deprecated.
    parts = df['PassengerId'].str.split('_')

    df['Group_number'] = parts.str[0].astype(float)
    df['Group_index'] = parts.str[1].astype(float)

    return df

if split_passenger not in transformer_list:
    transformer_list.append(split_passenger)

In [493]:
def group_size(df: pd.DataFrame):
    """Attach ``Group_size``: the largest ``Group_index`` found within each ``Group_number``.

    Requires the ``Group_number`` and ``Group_index`` columns. The result has a
    fresh 0..n-1 index, ``Group_number`` as its first column and ``Group_size``
    as its last. The input frame is not modified.
    """
    # Select the column before aggregating: taking max() over every column first
    # is wasted work and can fail on columns whose values cannot be ordered.
    gs = df.groupby('Group_number')['Group_index'].max().rename('Group_size')

    return pd.merge(df.set_index('Group_number'), gs, left_index=True, right_index=True).reset_index()

if group_size not in transformer_list:
    transformer_list.append(group_size)

In [494]:
def split_cabins(df: pd.DataFrame):
    """Split ``Cabin`` (``deck/num/side``) into three columns and drop the original.

    Adds ``Cabin_deck`` and ``Cabin_side`` as text and ``Cabin_num`` as float.
    A missing cabin yields NaN in all three. Returns a modified copy.
    """
    df = df.copy()

    # Vectorised split. The previous ``Series.agg(lambda ...)`` form only worked
    # because agg fell back to element-wise application, which pandas has deprecated.
    parts = df['Cabin'].str.split('/')

    df['Cabin_deck'] = parts.str[0]
    df['Cabin_num'] = parts.str[1].astype(float)
    df['Cabin_side'] = parts.str[2]

    del df['Cabin']

    return df

if split_cabins not in transformer_list:
    transformer_list.append(split_cabins)

In [495]:
def amenities_sum(df: pd.DataFrame):
    """Add ``AmenitiesSum``, the row-wise total of the five spending columns.

    The columns are added left to right with ``+``, so a NaN in any of them
    makes the total NaN. Returns a modified copy.
    """
    out = df.copy()

    spend_columns = ('RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck')
    total = out[spend_columns[0]]
    for column in spend_columns[1:]:
        total = total + out[column]

    out['AmenitiesSum'] = total

    return out

if amenities_sum not in transformer_list:
    transformer_list.append(amenities_sum)

In [496]:
def prepare_features(df: pd.DataFrame):
    """Drop the ``Name`` column and make ``PassengerId`` the index.

    Returns a new frame; the input frame is left untouched.
    """
    without_name = df.drop(columns='Name')
    return without_name.set_index('PassengerId')

if prepare_features not in transformer_list:
    transformer_list.append(prepare_features)

In [497]:
def fillna_cabin(df: pd.DataFrame, cabin_deck_top = 'N', cabin_num_mean = -1, cabin_side_top = 'N'):
    """Fill missing ``Cabin_deck`` / ``Cabin_num`` / ``Cabin_side`` values.

    Each fill value may be supplied by the caller. When an argument is left at
    its sentinel default ('N' for the two categorical columns, -1 for the
    numeric one) the value is derived from ``df`` itself: the most frequent
    entry for deck and side, the mean for the number.
    Returns a modified copy; the input frame is left untouched.
    """
    out = df.copy()

    # All three fill values are resolved before any column is filled.
    deck_fill = out['Cabin_deck'].describe()['top'] if cabin_deck_top == 'N' else cabin_deck_top
    num_fill = out['Cabin_num'].mean() if cabin_num_mean == -1 else cabin_num_mean
    side_fill = out['Cabin_side'].describe()['top'] if cabin_side_top == 'N' else cabin_side_top

    for column, fill in (('Cabin_deck', deck_fill), ('Cabin_num', num_fill), ('Cabin_side', side_fill)):
        out[column] = out[column].fillna(fill)

    return out

In [498]:
# Run every registered preprocessing step, in registration order, on a copy of the training data.
df_train_transformed = df_train.copy()

for f in transformer_list:
    df_train_transformed = f(df_train_transformed)

# Called with the sentinel defaults, so the cabin fill values are derived from the training frame itself.
df_train_transformed = fillna_cabin(df_train_transformed)
    
# Cabin statistics of the (already filled) training frame; the test cell passes them to fillna_cabin.
cabin_deck_top = df_train_transformed['Cabin_deck'].describe().top
cabin_num_mean = df_train_transformed['Cabin_num'].mean()
cabin_side_top = df_train_transformed['Cabin_side'].describe().top

encoder = LabelEncoder()

# NOTE(review): the same encoder object is re-fitted for Cabin_side, so the
# Cabin_deck mapping is no longer available from `encoder` after this cell.
df_train_transformed['Cabin_deck'] = encoder.fit_transform(df_train_transformed['Cabin_deck'])
df_train_transformed['Cabin_side'] = encoder.fit_transform(df_train_transformed['Cabin_side'])
df_train_transformed

Unnamed: 0_level_0,Group_number,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Group_index,Group_size,Cabin_deck,Cabin_num,Cabin_side,AmenitiesSum
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
0001_01,1.0,2,False,2,39.0,False,0.0,0.0,0.0,0.0,0.0,False,1.0,1.0,1,0.0,0,0.0
0002_01,2.0,0,False,2,24.0,False,109.0,9.0,25.0,549.0,44.0,True,1.0,1.0,5,0.0,1,736.0
0003_01,3.0,2,False,2,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False,1.0,2.0,0,0.0,1,10383.0
0003_02,3.0,2,False,2,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False,2.0,2.0,0,0.0,1,5176.0
0004_01,4.0,0,False,2,16.0,False,303.0,70.0,151.0,565.0,2.0,True,1.0,1.0,5,1.0,1,1091.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9276_01,9276.0,2,False,0,41.0,True,0.0,6819.0,0.0,1643.0,74.0,False,1.0,1.0,0,98.0,0,8536.0
9278_01,9278.0,0,True,1,18.0,False,0.0,0.0,0.0,0.0,0.0,False,1.0,1.0,6,1499.0,1,0.0
9279_01,9279.0,0,False,2,26.0,False,0.0,0.0,1872.0,1.0,0.0,True,1.0,1.0,6,1500.0,1,1873.0
9280_01,9280.0,2,False,0,32.0,False,0.0,1049.0,0.0,353.0,3235.0,False,1.0,2.0,4,608.0,1,4637.0


In [499]:
# Run the same preprocessing steps on a copy of the test data.
df_test_transformed = df_test.copy()

for f in transformer_list:
    df_test_transformed = f(df_test_transformed)

# Fill cabin gaps with the statistics computed on the training frame.
df_test_transformed = fillna_cabin(df_test_transformed, cabin_deck_top, cabin_num_mean, cabin_side_top)

# Encode with the TRAINING category set. Re-fitting an encoder on the test data
# assigns codes from whatever categories happen to appear there, so the same
# deck/side could get a different integer than in the training frame.
# A label unseen in training now raises instead of being silently mis-coded.
train_cabin_parts = df_train['Cabin'].dropna().str.split('/')
deck_encoder = LabelEncoder().fit(train_cabin_parts.str[0])
side_encoder = LabelEncoder().fit(train_cabin_parts.str[2])

# Keep `encoder` bound to the Cabin_side encoder, as this cell did before.
encoder = side_encoder

df_test_transformed['Cabin_deck'] = deck_encoder.transform(df_test_transformed['Cabin_deck'])
df_test_transformed['Cabin_side'] = side_encoder.transform(df_test_transformed['Cabin_side'])
df_test_transformed

Unnamed: 0_level_0,Group_number,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Group_index,Group_size,Cabin_deck,Cabin_num,Cabin_side,AmenitiesSum
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0013_01,13.0,0,True,2,27.00000,False,0.0,0.0,0.0,0.0,0.0,1.0,1.0,6,3.000000,1,0.0
0018_01,18.0,0,False,2,19.00000,False,0.0,9.0,0.0,2823.0,0.0,1.0,1.0,5,4.000000,1,2832.0
0019_01,19.0,2,True,0,31.00000,False,0.0,0.0,0.0,0.0,0.0,1.0,1.0,2,0.000000,1,0.0
0021_01,21.0,2,False,2,38.00000,False,0.0,6652.0,0.0,181.0,585.0,1.0,1.0,2,1.000000,1,7418.0
0023_01,23.0,0,False,2,20.00000,False,10.0,0.0,635.0,0.0,0.0,1.0,1.0,5,5.000000,1,645.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9266_02,9266.0,0,True,2,34.00000,False,0.0,0.0,0.0,0.0,0.0,2.0,2.0,6,1496.000000,1,0.0
9269_01,9269.0,0,False,2,42.00000,False,0.0,847.0,17.0,10.0,144.0,1.0,1.0,5,600.367671,1,1018.0
9271_01,9271.0,1,True,0,28.82793,False,0.0,0.0,0.0,0.0,0.0,1.0,1.0,3,296.000000,0,0.0
9273_01,9273.0,2,False,2,28.82793,False,0.0,2680.0,0.0,0.0,523.0,1.0,1.0,3,297.000000,0,3203.0


#### Создание модели

In [500]:
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [501]:
# Separate the target from the features.
y_main = df_train_transformed["Transported"]
X_main = df_train_transformed.drop(columns=["Transported"])

# shuffle=False: the first 60% of rows become the training part, the last 40% the validation part.
X_train, X_valid, y_train, y_valid = train_test_split(X_main, y_main, test_size=0.40, random_state=13, shuffle=False)
X_test = df_test_transformed

# The scaler is fitted on the training part only and then applied to all three parts.
scaler = StandardScaler().fit(X_train, y_train)

# Re-wrap the scaled arrays so the index and column names are kept.
X_train, X_valid, X_test = (
    pd.DataFrame(scaler.transform(part), index = part.index, columns = part.columns)
    for part in (X_train, X_valid, X_test)
)

In [502]:
def get_predict(model, X_train, X_valid, X_test, y_train, y_valid, index = '', catboost = False):
    """Fit ``model``, print its train/validation F1 scores and return positive-class probabilities.

    Parameters
    ----------
    model : estimator providing ``fit``, ``predict`` and ``predict_proba``.
    X_train, y_train : data the model is fitted on.
    X_valid, y_valid : hold-out data used for the validation score.
    X_test : features for which test probabilities are produced.
    index : str
        Prefix printed before the model name.
    catboost : bool
        When true, each predicted label is compared with the string 'True' and
        overwritten in place with a boolean before scoring.

    Returns
    -------
    tuple
        ``(valid_proba, test_proba, valid_f1)``: column 1 of ``predict_proba``
        for the validation and test sets (as Series), and the validation F1.
    """
    def _overwrite_with_bools(labels):
        # In place, element by element: 'True' -> True, anything else -> False.
        for pos in range(len(labels)):
            labels[pos] = True if labels[pos] == 'True' else False

    model = model.fit(X_train, y_train)

    train_labels = model.predict(X_train)
    valid_labels = model.predict(X_valid)

    if catboost:
        _overwrite_with_bools(train_labels)
        _overwrite_with_bools(valid_labels)

    train_labels = train_labels.astype(bool)
    valid_labels = valid_labels.astype(bool)

    train_f1 = f1_score(y_train, train_labels)
    valid_f1 = f1_score(y_valid, valid_labels)

    # Average of the two scores, weighted by the number of rows in each part.
    n_train, n_valid = len(X_train), len(X_valid)
    weighted_f1 = (train_f1 * n_train + valid_f1 * n_valid) / (n_train + n_valid)

    # Class name taken from the repr of the type, e.g. "<class 'pkg.mod.Name'>" -> "Name".
    model_name = str(type(model)).split('.')[-1][:-2]
    print(f"{index} {model_name}\tF1 train: {train_f1:.3f} valid: {valid_f1:.3f} mean: {weighted_f1:.3f}")

    valid_proba = model.predict_proba(X_valid)
    test_proba = model.predict_proba(X_test)

    # Column 1 of predict_proba is kept for each set.
    return pd.DataFrame(valid_proba)[1], pd.DataFrame(test_proba)[1], valid_f1

In [503]:
# Accumulators filled by the following cells, one entry per fitted base model:
# the model object, its validation probabilities and its test probabilities.
model_list = []
valid_list = []
test_list = []

In [504]:
# Base model 1: logistic regression with default settings.
model = LogisticRegression()
valid_pred, test_pred, score = get_predict(model, X_train, X_valid, X_test, y_train, y_valid)

# Keep the model and its probability outputs for the stacking step.
model_list.append(model)
valid_list.append(valid_pred)
test_list.append(test_pred)

 LogisticRegression	F1 train: 0.789 valid: 0.800 mean: 0.793


In [505]:
# Maps a validation F1 score to the n_neighbors that produced it; {0: 0} is the starting baseline.
best_model = {0 : 0}

for n in range(2, 20):
    model = KNeighborsClassifier(n)
    valid_pred, test_pred, score = get_predict(model, X_train, X_valid, X_test, y_train, y_valid, index = f'[n = {n}]')

    # Only a strict improvement over the best score so far is recorded.
    if score > max(best_model):
        best_model[score] = n

top_score = max(best_model)
best_neighbors = best_model[top_score]

print(f'Лучшая итерация c кол-вом соседей равным {best_neighbors} с скором {top_score:.3f}')

[n = 2] KNeighborsClassifier	F1 train: 0.852 valid: 0.735 mean: 0.805
[n = 3] KNeighborsClassifier	F1 train: 0.866 valid: 0.762 mean: 0.824
[n = 4] KNeighborsClassifier	F1 train: 0.825 valid: 0.761 mean: 0.799
[n = 5] KNeighborsClassifier	F1 train: 0.833 valid: 0.771 mean: 0.808
[n = 6] KNeighborsClassifier	F1 train: 0.806 valid: 0.770 mean: 0.792
[n = 7] KNeighborsClassifier	F1 train: 0.817 valid: 0.775 mean: 0.800
[n = 8] KNeighborsClassifier	F1 train: 0.792 valid: 0.769 mean: 0.783
[n = 9] KNeighborsClassifier	F1 train: 0.804 valid: 0.778 mean: 0.794
[n = 10] KNeighborsClassifier	F1 train: 0.790 valid: 0.773 mean: 0.784
[n = 11] KNeighborsClassifier	F1 train: 0.802 valid: 0.780 mean: 0.793
[n = 12] KNeighborsClassifier	F1 train: 0.793 valid: 0.777 mean: 0.787
[n = 13] KNeighborsClassifier	F1 train: 0.800 valid: 0.780 mean: 0.792
[n = 14] KNeighborsClassifier	F1 train: 0.794 valid: 0.779 mean: 0.788
[n = 15] KNeighborsClassifier	F1 train: 0.798 valid: 0.778 mean: 0.790
[n = 16] KNeig

In [506]:
# Base model 2: k-nearest neighbours with the neighbour count chosen by the search above.
model = KNeighborsClassifier(best_neighbors)
valid_pred, test_pred, score = get_predict(model, X_train, X_valid, X_test, y_train, y_valid)

# Keep the model and its probability outputs for the stacking step.
model_list.append(model)
valid_list.append(valid_pred)
test_list.append(test_pred)

 KNeighborsClassifier	F1 train: 0.784 valid: 0.781 mean: 0.783


In [507]:
# Maps a validation F1 score to the (max_depth, min_samples_leaf) pair that produced it;
# {0: 0} is the starting baseline.
best_model = {0 : 0}

for d in range(3, 9):
    for l in range(20, 30):
        model = DecisionTreeClassifier(max_depth = d, min_samples_leaf = l)
        valid_pred, test_pred, score = get_predict(model, X_train, X_valid, X_test, y_train, y_valid, f'[d = {d} | l = {l}]')

        # Only a strict improvement over the best score so far is recorded.
        if score > max(best_model):
            best_model[score] = (d, l)

# The stray, unused `RandomForestClassifier()` expression that used to sit here was removed.
top_score = max(best_model)
best_depth, best_min_samples_leaf = best_model[top_score]

print(f'Лучшая итерация depth = {best_depth}, min_samples_leaf = {best_min_samples_leaf} с скором {top_score:.3f}')

[d = 3 | l = 20] DecisionTreeClassifier	F1 train: 0.776 valid: 0.786 mean: 0.780
[d = 3 | l = 21] DecisionTreeClassifier	F1 train: 0.776 valid: 0.786 mean: 0.780
[d = 3 | l = 22] DecisionTreeClassifier	F1 train: 0.776 valid: 0.786 mean: 0.780
[d = 3 | l = 23] DecisionTreeClassifier	F1 train: 0.776 valid: 0.786 mean: 0.780
[d = 3 | l = 24] DecisionTreeClassifier	F1 train: 0.776 valid: 0.786 mean: 0.780
[d = 3 | l = 25] DecisionTreeClassifier	F1 train: 0.776 valid: 0.786 mean: 0.780
[d = 3 | l = 26] DecisionTreeClassifier	F1 train: 0.776 valid: 0.786 mean: 0.780
[d = 3 | l = 27] DecisionTreeClassifier	F1 train: 0.776 valid: 0.786 mean: 0.780
[d = 3 | l = 28] DecisionTreeClassifier	F1 train: 0.776 valid: 0.786 mean: 0.780
[d = 3 | l = 29] DecisionTreeClassifier	F1 train: 0.776 valid: 0.786 mean: 0.780
[d = 4 | l = 20] DecisionTreeClassifier	F1 train: 0.783 valid: 0.794 mean: 0.787
[d = 4 | l = 21] DecisionTreeClassifier	F1 train: 0.783 valid: 0.794 mean: 0.787
[d = 4 | l = 22] DecisionTre

In [508]:
# Base model 3: decision tree with the depth / leaf size chosen by the search above.
model = DecisionTreeClassifier(max_depth = best_depth, min_samples_leaf = best_min_samples_leaf)
valid_pred, test_pred, score = get_predict(model, X_train, X_valid, X_test, y_train, y_valid)

# Keep the model and its probability outputs for the stacking step.
model_list.append(model)
valid_list.append(valid_pred)
test_list.append(test_pred)

 DecisionTreeClassifier	F1 train: 0.799 valid: 0.809 mean: 0.803


In [509]:
# Maps a validation F1 score to the (n_estimators, max_depth, max_features) triple that
# produced it; {0: 0} is the starting baseline.
best_model = {0 : 0}

for e in [50, 100, 200, 300, 500]:
    for d in range(6, 11, 1):
        for f in range(6, 11, 1):
            model = RandomForestClassifier(n_jobs = -1, n_estimators = e, max_depth = d, max_features = f, random_state = 13)
            valid_pred, test_pred, score = get_predict(model, X_train, X_valid, X_test, y_train, y_valid, f'[e = {e} d = {d} f = {f}]')

            # Only a strict improvement over the best score so far is recorded.
            if score > max(best_model):
                best_model[score] = (e, d, f)

top_score = max(best_model)
best_estimators, best_depth, best_max_features = best_model[top_score]

print(f'Лучшая итерация estimators = {best_estimators} depth = {best_depth}, best_max_features = {best_max_features} с скором {top_score:.3f}')

[e = 50 d = 6 f = 6] RandomForestClassifier	F1 train: 0.817 valid: 0.816 mean: 0.816
[e = 50 d = 6 f = 7] RandomForestClassifier	F1 train: 0.822 valid: 0.819 mean: 0.821
[e = 50 d = 6 f = 8] RandomForestClassifier	F1 train: 0.821 valid: 0.817 mean: 0.820
[e = 50 d = 6 f = 9] RandomForestClassifier	F1 train: 0.825 valid: 0.820 mean: 0.823
[e = 50 d = 6 f = 10] RandomForestClassifier	F1 train: 0.825 valid: 0.820 mean: 0.823
[e = 50 d = 7 f = 6] RandomForestClassifier	F1 train: 0.839 valid: 0.821 mean: 0.832
[e = 50 d = 7 f = 7] RandomForestClassifier	F1 train: 0.838 valid: 0.817 mean: 0.830
[e = 50 d = 7 f = 8] RandomForestClassifier	F1 train: 0.838 valid: 0.817 mean: 0.830
[e = 50 d = 7 f = 9] RandomForestClassifier	F1 train: 0.840 valid: 0.821 mean: 0.833
[e = 50 d = 7 f = 10] RandomForestClassifier	F1 train: 0.843 valid: 0.823 mean: 0.835
[e = 50 d = 8 f = 6] RandomForestClassifier	F1 train: 0.859 valid: 0.814 mean: 0.841
[e = 50 d = 8 f = 7] RandomForestClassifier	F1 train: 0.858 val

In [510]:
# Base model 4: random forest with the parameters chosen by the search above.
model = RandomForestClassifier(n_jobs = -1, n_estimators = best_estimators, max_depth = best_depth, max_features = best_max_features, random_state = 13)
valid_pred, test_pred, score = get_predict(model, X_train, X_valid, X_test, y_train, y_valid)

# Keep the model and its probability outputs for the stacking step.
model_list.append(model)
valid_list.append(valid_pred)
test_list.append(test_pred)

 RandomForestClassifier	F1 train: 0.843 valid: 0.823 mean: 0.835


In [511]:
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

In [512]:
# Maps a validation F1 score to the (iterations, learning_rate, max_depth) triple that
# produced it; {0: 0} is the starting baseline.
best_model = {0 : 0}

for i in range(50, 80, 10):
    for lr in np.linspace(0.001, 0.3, 10):
        for d in range(2, 7):
            model = CatBoostClassifier(iterations = i, learning_rate = lr, max_depth = d, random_state = 13, verbose = 0)
            valid_pred, test_pred, score = get_predict(model, X_train, X_valid, X_test, y_train, y_valid, f'[i = {i} lr = {lr:.3f} d = {d}]', catboost = True)

            # Only a strict improvement over the best score so far is recorded.
            if score > max(best_model):
                best_model[score] = (i, lr, d)

top_score = max(best_model)
best_iterations, best_learning_rate, best_depth = best_model[top_score]

print(f'Лучшая итерация iterations = {best_iterations} learning_rate = {best_learning_rate}, depth = {best_depth} с скором {top_score:.3f}')

[i = 50 lr = 0.001 d = 2] CatBoostClassifier	F1 train: 0.706 valid: 0.730 mean: 0.715
[i = 50 lr = 0.001 d = 3] CatBoostClassifier	F1 train: 0.706 valid: 0.730 mean: 0.715
[i = 50 lr = 0.001 d = 4] CatBoostClassifier	F1 train: 0.730 valid: 0.748 mean: 0.737
[i = 50 lr = 0.001 d = 5] CatBoostClassifier	F1 train: 0.743 valid: 0.764 mean: 0.751
[i = 50 lr = 0.001 d = 6] CatBoostClassifier	F1 train: 0.753 valid: 0.775 mean: 0.762
[i = 50 lr = 0.034 d = 2] CatBoostClassifier	F1 train: 0.724 valid: 0.746 mean: 0.733
[i = 50 lr = 0.034 d = 3] CatBoostClassifier	F1 train: 0.741 valid: 0.763 mean: 0.750
[i = 50 lr = 0.034 d = 4] CatBoostClassifier	F1 train: 0.760 valid: 0.787 mean: 0.771
[i = 50 lr = 0.034 d = 5] CatBoostClassifier	F1 train: 0.787 valid: 0.810 mean: 0.796
[i = 50 lr = 0.034 d = 6] CatBoostClassifier	F1 train: 0.790 valid: 0.814 mean: 0.799
[i = 50 lr = 0.067 d = 2] CatBoostClassifier	F1 train: 0.753 valid: 0.774 mean: 0.761
[i = 50 lr = 0.067 d = 3] CatBoostClassifier	F1 train:

In [513]:
# Base model 5: CatBoost with the parameters chosen by the search above.
# catboost = True makes get_predict convert the predicted labels to booleans before scoring.
model = CatBoostClassifier(verbose = 0, iterations = best_iterations, learning_rate = best_learning_rate, max_depth = best_depth, random_seed = 13)
valid_pred, test_pred, score = get_predict(model, X_train, X_valid, X_test, y_train, y_valid, catboost = True)

# Keep the model and its probability outputs for the stacking step.
model_list.append(model)
valid_list.append(valid_pred)
test_list.append(test_pred)

 CatBoostClassifier	F1 train: 0.833 valid: 0.828 mean: 0.831


In [514]:
# Maps a validation F1 score to the (num_leaves, learning_rate, max_depth) triple that
# produced it; {0: 0} is the starting baseline.
best_model = {0 : 0}

for l in range(10, 21, 1):
    for lr in np.linspace(0.001, 0.3, 20):
        for d in range(2, 7):
            model = LGBMClassifier(num_leaves = l, max_depth = d, learning_rate = lr, random_state = 13)
            valid_pred, test_pred, score = get_predict(model, X_train, X_valid, X_test, y_train, y_valid, f'[l = {l} lr = {lr:.3f} d = {d}]')

            # Only a strict improvement over the best score so far is recorded.
            if score > max(best_model):
                best_model[score] = (l, lr, d)

top_score = max(best_model)
best_num_leaves, best_learning_rate, best_depth = best_model[top_score]

print(f'Лучшая итерация num_leaves = {best_num_leaves} learning_rate = {best_learning_rate}, depth = {best_depth} с скором {top_score:.3f}')

[l = 10 lr = 0.001 d = 2] LGBMClassifier	F1 train: 0.747 valid: 0.759 mean: 0.752
[l = 10 lr = 0.001 d = 3] LGBMClassifier	F1 train: 0.754 valid: 0.767 mean: 0.759
[l = 10 lr = 0.001 d = 4] LGBMClassifier	F1 train: 0.782 valid: 0.795 mean: 0.787
[l = 10 lr = 0.001 d = 5] LGBMClassifier	F1 train: 0.782 valid: 0.795 mean: 0.787
[l = 10 lr = 0.001 d = 6] LGBMClassifier	F1 train: 0.782 valid: 0.795 mean: 0.787
[l = 10 lr = 0.017 d = 2] LGBMClassifier	F1 train: 0.746 valid: 0.765 mean: 0.754
[l = 10 lr = 0.017 d = 3] LGBMClassifier	F1 train: 0.790 valid: 0.800 mean: 0.794
[l = 10 lr = 0.017 d = 4] LGBMClassifier	F1 train: 0.798 valid: 0.813 mean: 0.804
[l = 10 lr = 0.017 d = 5] LGBMClassifier	F1 train: 0.803 valid: 0.817 mean: 0.809
[l = 10 lr = 0.017 d = 6] LGBMClassifier	F1 train: 0.803 valid: 0.818 mean: 0.809
[l = 10 lr = 0.032 d = 2] LGBMClassifier	F1 train: 0.768 valid: 0.800 mean: 0.781
[l = 10 lr = 0.032 d = 3] LGBMClassifier	F1 train: 0.805 valid: 0.814 mean: 0.809
[l = 10 lr = 0.0

In [515]:
# Base model 6: LightGBM with the parameters chosen by the search above.
model = LGBMClassifier(num_leaves = best_num_leaves, max_depth = best_depth, learning_rate = best_learning_rate, random_state = 13)
valid_pred, test_pred, score = get_predict(model, X_train, X_valid, X_test, y_train, y_valid)

# Keep the model and its probability outputs for the stacking step.
model_list.append(model)
valid_list.append(valid_pred)
test_list.append(test_pred)

 LGBMClassifier	F1 train: 0.872 valid: 0.825 mean: 0.853


In [516]:
#----------------------------------------------------------------------------------------------------------------------------------------------------------
#----------------------------------------------------------------------------------------------------------------------------------------------------------
#----------------------------------------------------------------------------------------------------------------------------------------------------------

In [517]:
# Build one frame per data part with a column per base model (column i = i-th model's probabilities).
df_new_valid = valid_list[0].rename(0).to_frame()
df_new_test = test_list[0].rename(0).to_frame()

# Remaining models are attached one by one with an index-on-index merge.
for i, valid_column in enumerate(valid_list[1:], start=1):
    df_new_valid = pd.merge(df_new_valid, valid_column.rename(i), left_index=True, right_index=True)
    df_new_test = pd.merge(df_new_test, test_list[i].rename(i), left_index=True, right_index=True)

In [518]:
# Persist the stacked base-model probabilities to CSV files in the working directory.
df_new_valid.to_csv('df_new_valid.csv')
df_new_test.to_csv('df_new_test.csv')

In [519]:
# Meta-model: a linear regression fitted on the base models' validation probabilities
# against the validation target, then applied to their test probabilities.
model = LinearRegression()
model = model.fit(df_new_valid, y_valid)

pred = model.predict(df_new_test)

# Threshold the regression output at 0.5 to obtain boolean labels.
p = [True if value >= 0.5 else False for value in pred]

# Number of test rows predicted as True.
sum(p)

2185

In [520]:
# Pair the predicted labels positionally with the test PassengerId values (used as the index)
# and write them out as the submission file.
result = pd.Series(p, df_test['PassengerId'], name = 'Transported')
result.to_csv('sample_submission with valid size 0,40.csv')