In [10]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit, cross_val_score
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

In [11]:
df = pd.read_csv(r'train.csv')

df['type'] = ['train' for i in df.PassengerId]
test_df = pd.read_csv(r'test.csv')
test_df['type'] = ['test' for i in test_df.PassengerId]
test_df['Age'].fillna(test_df['Age'].mean())

df = pd.concat([df, test_df])


df.drop(['Ticket', 'Fare', 'Cabin'], axis = 1, inplace = True) #remove unuseful
df['baby'] = [1 if (i <= 5) else 0 for i in df['Age']]             #add features
df['adult'] = [1 if (13 < i <= 55) else 0 for i in df['Age']]
df['child'] = [1 if ( 5 < i <=  13) else 0 for i in df['Age']]
df['elderly'] = [1 if (i > 55) else 0 for i in df['Age']]
df['alone'] = [ 1 if i == 0 and j == 0 else 0 for i, j in zip(df['SibSp'], df['Parch'])]
df['family'] = [ i + j for i, j in zip(df['SibSp'], df['Parch'])]

df = pd.concat((df, pd.get_dummies(df.Sex, prefix='s')), axis=1)
df = pd.concat((df, pd.get_dummies(df.Pclass, prefix='cl')), axis=1)

df.drop(['Sex', 'Embarked', 'Age'], axis = 1, inplace = True)

num_features = ['SibSp', 'Parch', 'Pclass']
cat_features = ['baby', 'adult', 'child', 'elderly', 's_female', 's_male', 'alone']

train_df = df[df['type'] == 'train'].copy().dropna().reset_index(drop=True)
test_df = df[df['type'] == 'test'].copy().reset_index(drop=True)

y_train = train_df.Survived

X_train_cat = train_df[cat_features].values
X_test_cat = test_df[cat_features].values
X_train_num = train_df[num_features].copy()
X_test_num = test_df[num_features].copy()

X_train = np.hstack((X_train_cat, X_train_num))
X_test = np.hstack((X_test_cat, X_test_num))

model = MLPClassifier(hidden_layer_sizes=(32,), max_iter=30)#simple NN
model.fit(X_train, y_train)
forecast = model.predict(X_test)

test_df['forecast_lr'] = forecast
a = test_df[['PassengerId','forecast_lr']].copy()
a.forecast_lr = [int(i) for i in a.forecast_lr]
a.rename(columns={'forecast_lr': 'Survived'}, inplace=True)
a.to_csv(r'simple_net_res.csv', index = False)


test_df.drop(['Survived'], axis = 1, inplace = True)




In [12]:
from sklearn.linear_model import  RidgeClassifier
from sklearn.ensemble import GradientBoostingClassifier

model =  RidgeClassifier() #простая линейная регрессия
model.fit(X_train, y_train)
forecast = model.predict(X_test)


test_df['forecast_lr'] = forecast
a = test_df[['PassengerId','forecast_lr']].copy()
a.forecast_lr = [int(i) for i in a.forecast_lr]
a.rename(columns={'forecast_lr': 'Survived'}, inplace=True)
a.to_csv(r'regression_res.csv', index = False)


In [13]:
from pytorch_tabular import TabularModel
from pytorch_tabular.models import CategoryEmbeddingModelConfig
from pytorch_tabular.config import (
    DataConfig,
    OptimizerConfig,
    TrainerConfig,
    ExperimentConfig,
)

#complex NN with embedding layers

data_config = DataConfig(target=['Survived'], continuous_cols=num_features, categorical_cols=cat_features)

trainer_config = TrainerConfig(
    auto_lr_find=True,  
    max_epochs=5,
)

optimizer_config = OptimizerConfig()

model_config = CategoryEmbeddingModelConfig(task="classification", layers="64-32-32")
tabular_model = TabularModel(data_config=data_config, model_config=model_config, optimizer_config=optimizer_config, trainer_config=trainer_config)
tabular_model.fit(train=train_df, test = test_df)


pred_df = tabular_model.predict(test_df)
pred_df

2023-06-01 13:43:18,396 - {pytorch_tabular.tabular_model:102} - INFO - Experiment Tracking is turned off
Global seed set to 42
2023-06-01 13:43:18,419 - {pytorch_tabular.tabular_model:465} - INFO - Preparing the DataLoaders
2023-06-01 13:43:18,421 - {pytorch_tabular.tabular_datamodule:286} - INFO - Setting up the datamodule for classification task
2023-06-01 13:43:18,465 - {pytorch_tabular.tabular_model:508} - INFO - Preparing the Model: CategoryEmbeddingModel
2023-06-01 13:43:18,494 - {pytorch_tabular.tabular_model:264} - INFO - Preparing the Trainer
Auto select gpus: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
2023-06-01 13:43:18,573 - {pytorch_tabular.tabular_model:558} - INFO - Auto LR Find Started
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  rank_zero_warn(
  rank_zero_warn(


Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=5` reached.
LR finder stopped early after 60 steps due to diverging loss.
Learning rate set to 0.0004365158322401656
Restoring states from the checkpoint path at C:\Users\Andrew\.lr_find_283d219d-5d58-4e73-b283-3016bd968904.ckpt
Restored all states from the checkpoint file at C:\Users\Andrew\.lr_find_283d219d-5d58-4e73-b283-3016bd968904.ckpt
2023-06-01 13:43:19,487 - {pytorch_tabular.tabular_model:560} - INFO - Suggested LR: 0.0004365158322401656. For plot and detailed analysis, use `find_learning_rate` method.
2023-06-01 13:43:19,487 - {pytorch_tabular.tabular_model:566} - INFO - Training Started
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

`Trainer.fit` stopped: `max_epochs=5` reached.


2023-06-01 13:43:21,343 - {pytorch_tabular.tabular_model:568} - INFO - Training the model completed
2023-06-01 13:43:21,343 - {pytorch_tabular.tabular_model:1207} - INFO - Loading the best model


Output()

Unnamed: 0,PassengerId,Pclass,Name,SibSp,Parch,type,baby,adult,child,elderly,...,family,s_female,s_male,cl_1,cl_2,cl_3,forecast_lr,0.0_probability,1.0_probability,prediction
0,892,3,"Kelly, Mr. James",0,0,test,0,1,0,0,...,0,0,1,0,0,1,0.0,0.842502,0.157498,0.0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",1,0,test,0,1,0,0,...,1,1,0,0,0,1,1.0,0.715690,0.284310,0.0
2,894,2,"Myles, Mr. Thomas Francis",0,0,test,0,0,0,1,...,0,0,1,0,1,0,0.0,0.578005,0.421995,0.0
3,895,3,"Wirz, Mr. Albert",0,0,test,0,1,0,0,...,0,0,1,0,0,1,0.0,0.842502,0.157498,0.0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",1,1,test,0,1,0,0,...,2,1,0,0,0,1,1.0,0.587147,0.412853,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",0,0,test,0,0,0,0,...,0,0,1,0,0,1,0.0,0.741397,0.258603,0.0
414,1306,1,"Oliva y Ocana, Dona. Fermina",0,0,test,0,1,0,0,...,0,1,0,1,0,0,1.0,0.256371,0.743629,1.0
415,1307,3,"Saether, Mr. Simon Sivertsen",0,0,test,0,1,0,0,...,0,0,1,0,0,1,0.0,0.842502,0.157498,0.0
416,1308,3,"Ware, Mr. Frederick",0,0,test,0,0,0,0,...,0,0,1,0,0,1,0.0,0.741397,0.258603,0.0


In [15]:
b = pred_df[['PassengerId','prediction']].copy()
b.prediction = b.prediction.astype(int)
b.rename(columns={'prediction': 'Survived'}, inplace=True)
b.to_csv(r'tabular_model_res.csv', index = False)