Решаем задачу предсказания выжил ли пассажир в Титанике

Ссылка на соревнование: https://www.kaggle.com/competitions/titanic/overview

In [1]:
import numpy as np
import pandas as pd

#### Загрузка данных 

In [2]:
train_data = pd.read_csv("./train.csv")
test_data = pd.read_csv("./test.csv")
gender = pd.read_csv("./gender_submission.csv")

In [3]:
train_data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


#### Baseline
Построим Random Forest как бейзлайн модель для решения данной задачи. Так как построение случайного леса не требует много вычислительных ресурсов.

In [8]:
from sklearn.ensemble import RandomForestClassifier

Y_train = train_data["Survived"].copy()

features = ["Pclass", "Sex", "SibSp", "Parch"]
X_train = pd.get_dummies(train_data[features])
X_test = pd.get_dummies(test_data[features])

model = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
model.fit(X_train, Y_train)
predictions = model.predict(X_test)

output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
output.to_csv('submission_forest.csv', index=False)

Получили score = 0.77272. Неплохо, давайте попробуем улучшить модель



Проведем feature engineering

In [96]:
import re


def clean_data(df):
    data = df.copy()

    # Заполним Na значения в поле Age медианой
    data['Age'].fillna(data["Age"].median(), inplace=True)
    
    
    # Заполним Na значения в порту направления 
    data['Embarked'].fillna("S", inplace=True)
    
    # Создадим фичу количество родственников пассажира
    data['Family'] = data['SibSp'] + data['Parch']
    
    # Поменяем не численные признаки на численные
    g = pd.get_dummies([], data['Sex'], drop_first = True)
    e = pd.get_dummies(data['Embarked'], drop_first = True)
    p = pd.get_dummies(data['Pclass'], drop_first = True)
    data = pd.concat([g,data,e,p], axis =1)
    data['is_minor']=np.where(data['Age']<=16,1,0)
    data.columns = data.columns.astype(str)
    # вычленим Mr Miss Mrs Master Rare из имени
    data['Words_Count'] = data['Name'].apply(lambda x: len(x.split()))
    data['Title'] = data.Name.str.extract(' ([A-Za-z]+)\.', expand=False)
    data['Title'] = data['Title'].replace(['Lady', 'Countess','Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
    data['Title'] = data['Title'].replace('Mlle', 'Miss')
    data['Title'] = data['Title'].replace('Ms', 'Miss')
    data['Title'] = data['Title'].replace('Mme', 'Mrs')
    title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
    data['Title'] = data['Title'].map(title_mapping)
    data['Title'] = data['Title'].fillna(0)
       
    # Разделим на типы информацию о билетах
    data['Ticket_type'] = data['Ticket'].apply(lambda x: x[0:3])
    data['Ticket_type'] = data['Ticket_type'].astype('category')
    data['Ticket_type'] = data['Ticket_type'].cat.codes
    
    # Разделим Cabin на категории
    data['Cabin'] = data['Cabin'].fillna('U')
    data['Cabin'] = data['Cabin'].map(lambda x: re.compile("([a-zA-Z]+)").search(x).group())
    cabin_category = {'A':9, 'B':8, 'C':7, 'D':6, 'E':5, 'F':4, 'G':3, 'T':2, 'U':1}
    data['Cabin'] = data['Cabin'].map(cabin_category)


    
    # Удалим ненужные колонки
    data.drop('SibSp', axis=1, inplace=True)
    data.drop('Parch', axis=1, inplace=True)
    data.drop(["PassengerId", "Name", "Ticket",'Pclass','Embarked', 'Sex'], axis=1, inplace=True)
    return data
    
# Clean train, test data
X_train = clean_data(train_data)
X_train.drop("Survived", axis=1, inplace=True)
X_test = clean_data(test_data)
X_test['Fare'].fillna(X_test['Fare'].median(), inplace=True)

X_train.info()
Y_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Age          891 non-null    float64
 1   Fare         891 non-null    float64
 2   Cabin        891 non-null    int64  
 3   Family       891 non-null    int64  
 4   Q            891 non-null    bool   
 5   S            891 non-null    bool   
 6   2            891 non-null    bool   
 7   3            891 non-null    bool   
 8   is_minor     891 non-null    int32  
 9   Words_Count  891 non-null    int64  
 10  Title        891 non-null    int64  
 11  Ticket_type  891 non-null    int16  
dtypes: bool(4), float64(2), int16(1), int32(1), int64(4)
memory usage: 50.6 KB
<class 'pandas.core.series.Series'>
RangeIndex: 891 entries, 0 to 890
Series name: Survived
Non-Null Count  Dtype
--------------  -----
891 non-null    int64
dtypes: int64(1)
memory usage: 7.1 KB


In [82]:
from sklearn.model_selection import train_test_split
train_x, val_x, train_y, val_y = train_test_split(X_train, Y_train, random_state = 42)

In [83]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error


random_forest = RandomForestClassifier(n_estimators=200)
random_forest.fit(train_x, train_y)

rand_pred = random_forest.predict(val_x)
print("Mean Absolute Error: " + str(1 - mean_absolute_error(rand_pred, val_y)))

Mean Absolute Error: 0.8430493273542601


In [84]:
model = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
model.fit(X_train, Y_train)
predictions = model.predict(X_test)

output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
output.to_csv('submission_forest_avdvanced.csv', index=False)

### Логистическая регрессия

In [97]:
import optuna
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

print(X_train.shape, Y_train.shape)

X_train, X_val, y_train, y_val = train_test_split(X_train, Y_train, test_size=0.2, random_state=42)

def objective(trial):
    # Определяем гиперпараметры для подбора
    C = trial.suggest_loguniform('C', 1e-4, 1e2)  # Пробуем различные значения C
    penalty = trial.suggest_categorical('penalty', ['l1', 'l2'])  # L1 и L2 регуляризация
    solver = trial.suggest_categorical('solver', ['liblinear', 'saga'])  # Подходящие солверы

    # Обучаем модель с выбранными гиперпараметрами
    logreg = LogisticRegression(max_iter = 100, random_state=42, C = 0.1)
    logreg.fit(X_train, y_train)

    # Предсказания и оценка
    y_pred_prob = logreg.predict(X_val)
    score = 1 - mean_absolute_error(rand_pred, val_y)

    return score

study = optuna.create_study(direction='maximize')  # Максимизируем MSE
study.optimize(objective, n_trials=100)  # Запускаем 100 итераций


# Получаем лучшие параметры
best_params = study.best_params
best_score = study.best_value

print(f'Лучшие параметры: {best_params}')
print(f'Лучший ROC AUC: {best_score}')

# Обучаем модель с лучшими параметрами
best_model = LogisticRegression(**best_params, max_iter=200)
best_model.fit(X_train, y_train)

# Оценка на валидационном наборе
y_pred_prob_best = best_model.predict_proba(X_val)[:, 1]
roc_auc_best = roc_auc_score(y_val, y_pred_prob_best)
print(f'ROC AUC на валидационном наборе с лучшими параметрами: {roc_auc_best}')

[I 2024-10-15 15:20:19,258] A new study created in memory with name: no-name-a9cc4fe1-6133-4b12-b654-9e10c340fe92
  C = trial.suggest_loguniform('C', 1e-4, 1e2)  # Пробуем различные значения C
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[I 2024-10-15 15:20:19,317] Trial 0 finished with value: 0.8430493273542601 and parameters: {'C': 0.006242924351955979, 'penalty': 'l2', 'solver': 'liblinear'}. Best is trial 0 with value: 0.8430493273542601.
  C = trial.suggest_loguniform('C', 1e-4, 1e2)  # Пробуем различные значения C
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-lea

(891, 12) (891,)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[I 2024-10-15 15:20:19,515] Trial 3 finished with value: 0.8430493273542601 and parameters: {'C': 0.42984885829204944, 'penalty': 'l2', 'solver': 'liblinear'}. Best is trial 0 with value: 0.8430493273542601.
  C = trial.suggest_loguniform('C', 1e-4, 1e2)  # Пробуем различные значения C
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Лучшие параметры: {'C': 0.006242924351955979, 'penalty': 'l2', 'solver': 'liblinear'}
Лучший ROC AUC: 0.8430493273542601
ROC AUC на валидационном наборе с лучшими параметрами: 0.8643500643500643


In [98]:
predictions = best_model.predict(X_test)

output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
output.to_csv('submission_logreg_tuned.csv', index=False)

Получили score = 0.70813. Оставляем Random_Forest