In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import math
import matplotlib.pyplot as plt
import re
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier

In [2]:
# Load the preprocessed training table; the first CSV column becomes the index.
train_csv_path = 'preprocessing_df.csv'
preprocessing_df = pd.read_csv(train_csv_path, index_col=0)
preprocessing_df.head()

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,cabin_type,title
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,0,3,male,22.0,1,0,7.25,U,Mr
2,1,1,female,38.0,1,0,71.2833,C,Mrs
3,1,3,female,26.0,0,0,7.925,U,Miss
4,1,1,female,35.0,1,0,53.1,C,Mrs
5,0,3,male,35.0,0,0,8.05,U,Mr


Убедимся в отсутствии пропусков

In [3]:
# Boolean mask of rows holding at least one missing value; displaying the
# selection yields an empty frame when the data has no gaps.
has_missing = preprocessing_df.isna().any(axis='columns')
preprocessing_df.loc[has_missing]

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,cabin_type,title
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1


> Пропусков нет, можно разбивать данные на обучающую и валидационную выборки и обучать модель.

In [4]:
# Feature matrix: every column of the training table except the target.
X = preprocessing_df.drop('Survived', axis='columns')
X

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,cabin_type,title
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,3,male,22.0,1,0,7.2500,U,Mr
2,1,female,38.0,1,0,71.2833,C,Mrs
3,3,female,26.0,0,0,7.9250,U,Miss
4,1,female,35.0,1,0,53.1000,C,Mrs
5,3,male,35.0,0,0,8.0500,U,Mr
...,...,...,...,...,...,...,...,...
887,2,male,27.0,0,0,13.0000,U,Officer
888,1,female,19.0,0,0,30.0000,B,Miss
889,3,female,18.0,1,2,23.4500,U,Miss
890,1,male,26.0,0,0,30.0000,C,Mr


In [5]:
# Target vector: the 'Survived' column of the training table.
y = preprocessing_df.loc[:, 'Survived']

In [6]:
# Hold out 30% of the rows for validation. The split is shuffled with a fixed
# seed and stratified on the target.
split_options = dict(test_size=0.3, random_state=42, shuffle=True, stratify=y)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, **split_options)

# Report the shape of each part as a quick sanity check.
print(*(part.shape for part in (X_train, X_valid, y_train, y_valid)))

(623, 8) (268, 8) (623,) (268,)


Будем использовать CatBoost. Для этого необходимо указать, какие признаки являются категориальными.

In [7]:
# Print the dtype and non-null count of every training column, to see which
# columns are non-numeric.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 623 entries, 749 to 137
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Pclass      623 non-null    int64  
 1   Sex         623 non-null    object 
 2   Age         623 non-null    float64
 3   SibSp       623 non-null    int64  
 4   Parch       623 non-null    int64  
 5   Fare        623 non-null    float64
 6   cabin_type  623 non-null    object 
 7   title       623 non-null    object 
dtypes: float64(2), int64(3), object(3)
memory usage: 43.8+ KB


In [8]:
# Positional indices of the categorical columns, derived from the frame itself
# rather than hard-coded: every non-numeric column of X_train is treated as
# categorical. For the frame shown by `X_train.info()` (Sex, cabin_type, title)
# this evaluates to [1, 6, 7], and it stays correct if columns are added,
# removed or reordered upstream.
cat_features_index = [
    X_train.columns.get_loc(column_name)
    for column_name in X_train.select_dtypes(exclude='number').columns
]

In [9]:
# Hyperparameters are gathered in one mapping so they can be inspected or
# adjusted without touching the constructor call.
catboost_params = dict(
    iterations=5000,
    learning_rate=0.001,
    model_size_reg=3,
    # NOTE(review): gpu_ram_part is a GPU-training option and no task_type is
    # set here - presumably it has no effect on a CPU run; confirm.
    gpu_ram_part=0.95,
    # Relies on the eval_set handed to fit() below.
    use_best_model=True,
    loss_function='CrossEntropy',
    leaf_estimation_method='Gradient',
    bootstrap_type='Bayesian',
    score_function='Cosine',
    # Extra metric tracked during training alongside the loss.
    custom_loss='Accuracy',
    random_seed=42,
)
model = CatBoostClassifier(**catboost_params)

# Train on the training split, evaluating on the validation split. Console
# logging is silenced and the training chart widget is shown instead.
fit_options = dict(
    cat_features=cat_features_index,
    eval_set=(X_valid, y_valid),
    logging_level='Silent',
    plot=True,
)
model.fit(X_train, y_train, **fit_options)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostClassifier at 0x23e4b872280>

## Загружаем тестовый датафрейм

In [10]:
# Load the preprocessed test table; the first CSV column becomes the index.
test_csv_path = 'preprocessing_df_test.csv'
preprocessing_df_test = pd.read_csv(test_csv_path, index_col=0)
preprocessing_df_test.head()

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,cabin_type,title
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
892,3,male,34.5,0,0,7.8292,U,Mr
893,3,female,47.0,1,0,7.0,U,Mrs
894,2,male,62.0,0,0,9.6875,U,Mr
895,3,male,27.0,0,0,8.6625,U,Mr
896,3,female,22.0,1,1,12.2875,U,Mrs


In [11]:
# Run the fitted model on every row of the test frame.
# NOTE(review): the test frame is passed as loaded, so its column order is
# assumed to match the training features - confirm.
preds_test = model.predict(preprocessing_df_test)

In [12]:
# Keep the test frame's index (taken from the first CSV column on load); it is
# used as the PassengerId column of the submission.
id_object_test = preprocessing_df_test.index

In [13]:
# Assemble the submission table: one row per test passenger, pairing the
# passenger id with the model's prediction.
submission_columns = {'PassengerId': id_object_test, 'Survived': preds_test}
result_test = pd.DataFrame(data=submission_columns)
result_test.head(50)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
5,897,0
6,898,1
7,899,0
8,900,1
9,901,0


In [14]:
# Write the submission file; index=False omits the frame's row index so only
# the PassengerId and Survived columns are written.
result_test.to_csv('submission.csv', index=False) 