In [None]:
# Импорт необходимых библиотек

In [8]:
import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier

import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from torch.optim import Adam
from torch.optim.lr_scheduler import ReduceLROnPlateau

In [None]:
# Чтение данных из файлов 'train.csv' и 'test.csv' в объекты DataFrame

In [9]:
df = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [None]:
# Вывод первых нескольких строк DataFrame df

In [10]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [None]:
# Информация о структуре и типах данных в DataFrame df

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [None]:
# Функция для создания новых признаков на основе существующих

In [12]:
def create_features(df):
    df['Age'] = df['Age'].fillna(df['Age'].median()) # Заполнение пропущенных значений в столбце 'Age' медианным значением
    df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0]) # Заполнение пропущенных значений в столбце 'Embarked' наиболее часто встречающимся значением
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1 # Создание нового столбца 'FamilySize', содержащего информацию о размере семьи
    df['IsAlone'] = 1 # Создание нового столбца 'IsAlone', содержащего информацию о том, является ли пассажир одиноким
    df['IsAlone'].loc[df['FamilySize'] > 1] = 0
    df['Title'] = df['Name'].str.split(", ", expand=True)[1].str.split(".", expand=True)[0] # Создание нового столбца 'Title', содержащего информацию о заголовке пассажира (Mr., Mrs., и т.д.)

    df.drop(['Cabin', 'Ticket', 'Name', 'PassengerId'], axis=1, inplace=True) # Удаление ненужных столбцов 'Cabin', 'Ticket', 'Name', 'PassengerId'

In [None]:
# Применение функции create_features к DataFrame df и df_test

In [13]:
create_features(df)
create_features(df_test)

In [None]:
# Вывод первых нескольких строк DataFrame df

In [14]:
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize,IsAlone,Title
0,0,3,male,22.0,1,0,7.25,S,2,0,Mr
1,1,1,female,38.0,1,0,71.2833,C,2,0,Mrs
2,1,3,female,26.0,0,0,7.925,S,1,1,Miss
3,1,1,female,35.0,1,0,53.1,S,2,0,Mrs
4,0,3,male,35.0,0,0,8.05,S,1,1,Mr


In [None]:
# Информация о структуре и типах данных в DataFrame df

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Survived    891 non-null    int64  
 1   Pclass      891 non-null    int64  
 2   Sex         891 non-null    object 
 3   Age         891 non-null    float64
 4   SibSp       891 non-null    int64  
 5   Parch       891 non-null    int64  
 6   Fare        891 non-null    float64
 7   Embarked    891 non-null    object 
 8   FamilySize  891 non-null    int64  
 9   IsAlone     891 non-null    int64  
 10  Title       891 non-null    object 
dtypes: float64(2), int64(6), object(3)
memory usage: 76.7+ KB


In [None]:
# Кодирование категориальных признаков с помощью OrdinalEncoder

In [16]:
oe = OrdinalEncoder(unknown_value=-1, handle_unknown='use_encoded_value')
cat = df.select_dtypes(include='object').columns.tolist()
df[cat] = oe.fit_transform(df[cat])
df_test[cat] = oe.transform(df_test[cat])

In [25]:
# Удаление строк с пропущенными значениями в DataFrame df
# Заполнение пропущенных значений средними значениями в DataFrame df_test

In [17]:
df.dropna(inplace=True)
df_test.fillna(df_test.mean(), inplace=True)

In [None]:
# Разделение данных на признаки (X) и целевую переменную (y) в DataFrame df

In [18]:
X = df.drop('Survived', axis=1)
y = df['Survived']

In [None]:
# Разделение данных на обучающую и валидационную выборки с помощью train_test_split

In [19]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Масштабирование числовых признаков с помощью StandardScaler

In [20]:
scaler = StandardScaler()
X_train[['Age', 'Fare']] = scaler.fit_transform(X_train[['Age', 'Fare']])
X_valid[['Age', 'Fare']] = scaler.transform(X_valid[['Age', 'Fare']])
df_test[['Age', 'Fare']] = scaler.transform(df_test[['Age', 'Fare']])

In [None]:
# Функция для обучения модели и вывода результатов

In [21]:
def fit_print(model, X_train, y_train, X_valid, y_valid):
    model.fit(X_train, y_train) # Обучение модели
    y_pred = model.predict(X_valid) # Получение предсказаний на валидационной выборке
    print(confusion_matrix(y_valid, y_pred)) # Вывод матрицы ошибок
    print(accuracy_score(y_valid, y_pred)) # Вывод значения точности
    print(classification_report(y_valid, y_pred))# Вывод отчета о классификации

In [None]:
# Создание и обучение модели логистической регрессии

In [22]:
lr = LogisticRegression()
fit_print(lr, X_train, y_train, X_valid, y_valid)

[[89 16]
 [21 53]]
0.7932960893854749
              precision    recall  f1-score   support

           0       0.81      0.85      0.83       105
           1       0.77      0.72      0.74        74

    accuracy                           0.79       179
   macro avg       0.79      0.78      0.78       179
weighted avg       0.79      0.79      0.79       179



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
# Создание и обучение модели с использованием дамми классификатора

In [23]:
dummy = DummyClassifier()
fit_print(dummy, X_train, y_train, X_valid, y_valid)

[[105   0]
 [ 74   0]]
0.5865921787709497
              precision    recall  f1-score   support

           0       0.59      1.00      0.74       105
           1       0.00      0.00      0.00        74

    accuracy                           0.59       179
   macro avg       0.29      0.50      0.37       179
weighted avg       0.34      0.59      0.43       179



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Определение класса модели нейронной сети

In [24]:
class TitanicDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.X.iloc[idx].values, self.y.iloc[idx]


class TitanicModel(nn.Module):
    def __init__(self, input_size):
        super().__init__()
        self.fc1 = nn.Linear(input_size, 32)
        self.fc2 = nn.Linear(32, 16)
        self.fc3 = nn.Linear(16, 1)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.sigmoid(self.fc3(x))
        return x


def train(model, train_loader, valid_loader, criterion, optimizer, scheduler, num_epochs): # Функция для тренировки модели
    for epoch in range(num_epochs):
        model.train()
        train_loss = 0
        for X, y in train_loader:
            optimizer.zero_grad()
            y_pred = model(X.float())
            loss = criterion(y_pred, y.unsqueeze(1).float())
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        train_loss /= len(train_loader)

        model.eval()
        valid_loss = 0
        with torch.no_grad():
            for X, y in valid_loader:
                y_pred = model(X.float())
                loss = criterion(y_pred, y.unsqueeze(1).float())
                valid_loss += loss.item()
            valid_loss /= len(valid_loader)

        scheduler.step(valid_loss)
        print(f'Epoch {epoch+1}/{num_epochs} | Train loss: {train_loss:.4f} | Valid loss: {valid_loss:.4f}')


def predict(model, test_loader): # Функция для получения предсказаний модели
    model.eval()
    y_pred = []
    with torch.no_grad():
        for X, _ in test_loader:
            y_pred.append(model(X.float()).squeeze(1).numpy())
    return np.concatenate(y_pred)


def fit_print_torch(model, X_train, y_train, X_valid, y_valid, num_epochs=10, batch_size=32, lr=0.001):# Функция для обучения и вывода результатов модели на основе нейронной сети
    train_dataset = TitanicDataset(X_train, y_train)
    valid_dataset = TitanicDataset(X_valid, y_valid)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)

    criterion = nn.BCELoss()
    optimizer = Adam(model.parameters(), lr=lr)
    scheduler = ReduceLROnPlateau(optimizer, mode='min', patience=3, verbose=True)

    train(model, train_loader, valid_loader, criterion, optimizer, scheduler, num_epochs)
    y_pred = predict(model, valid_loader)
    print(confusion_matrix(y_valid, y_pred.round()))
    print(accuracy_score(y_valid, y_pred.round()))
    print(classification_report(y_valid, y_pred.round()))


model = TitanicModel(X_train.shape[1]) # Создание экземпляра модели нейронной сети
fit_print_torch(model, X_train, y_train, X_valid, y_valid, num_epochs=100, batch_size=32, lr=0.001) # Обучение и вывод результатов модели

Epoch 1/100 | Train loss: 0.7771 | Valid loss: 0.6608
Epoch 2/100 | Train loss: 0.6324 | Valid loss: 0.6474
Epoch 3/100 | Train loss: 0.6233 | Valid loss: 0.6378
Epoch 4/100 | Train loss: 0.6152 | Valid loss: 0.6182
Epoch 5/100 | Train loss: 0.6071 | Valid loss: 0.6006
Epoch 6/100 | Train loss: 0.5983 | Valid loss: 0.5772
Epoch 7/100 | Train loss: 0.5837 | Valid loss: 0.5533
Epoch 8/100 | Train loss: 0.5556 | Valid loss: 0.5319
Epoch 9/100 | Train loss: 0.5399 | Valid loss: 0.5171
Epoch 10/100 | Train loss: 0.5301 | Valid loss: 0.5050
Epoch 11/100 | Train loss: 0.5170 | Valid loss: 0.4899
Epoch 12/100 | Train loss: 0.5148 | Valid loss: 0.4788
Epoch 13/100 | Train loss: 0.5022 | Valid loss: 0.4714
Epoch 14/100 | Train loss: 0.4940 | Valid loss: 0.4626
Epoch 15/100 | Train loss: 0.4800 | Valid loss: 0.4571
Epoch 16/100 | Train loss: 0.4833 | Valid loss: 0.4501
Epoch 17/100 | Train loss: 0.4626 | Valid loss: 0.4475
Epoch 18/100 | Train loss: 0.4589 | Valid loss: 0.4406
Epoch 19/100 | Trai