Для измерения качества выбрана метрика F1-score. В данном задании нужно не только правильно находить выживших (полнота, recall), но и минимизировать количество ложноположительных предсказаний (точность, precision), так как они могут привести к неправильным выводам о выживших. F1-score — это гармоническое среднее точности и полноты: метрика учитывает оба показателя одновременно и поэтому более сбалансирована, чем accuracy, особенно когда доли классов не равны.

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score
from sklearn.dummy import DummyClassifier
from sklearn.preprocessing import StandardScaler

# Load the data set and encode the two categorical columns as integer codes.
data = pd.read_csv('train.csv')

data['Sex'] = data['Sex'].map({'male': 0, 'female': 1})
data['Embarked'] = data['Embarked'].map({'C': 0, 'Q': 1, 'S': 2})

numeric_cols = data.select_dtypes(include=['float64', 'int64']).columns
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']

# Split BEFORE imputing so that fill values are derived from training rows
# only; computing them on the full table would leak test-set statistics into
# the training features. The split depends only on the number of rows and
# random_state, so splitting y alone selects the same rows as splitting X, y.
y = data['Survived']
y_train, y_test = train_test_split(y, test_size=0.2, random_state=42)

# Column means for numeric gaps; 'Embarked' holds category codes, so a mean
# would produce a fractional code matching no category -- use the most
# frequent training value instead.
train_rows = data.loc[y_train.index, numeric_cols]
fill_values = train_rows.mean()
fill_values['Embarked'] = train_rows['Embarked'].mode().iloc[0]
data[numeric_cols] = data[numeric_cols].fillna(fill_values)

X = data[features]
X_train, X_test = X.loc[y_train.index], X.loc[y_test.index]

# Baseline: always predict the most frequent training class, so the real
# model's scores can be compared against a trivial reference.
dummy_clf = DummyClassifier(strategy='most_frequent', random_state=42).fit(X_train, y_train)
y_dummy_pred = dummy_clf.predict(X_test)

# Score the baseline with the same two metrics used for the real model.
dummy_accuracy, dummy_f1 = (
    accuracy_score(y_test, y_dummy_pred),
    f1_score(y_test, y_dummy_pred),
)
print(f"Dummy Classifier Accuracy: {dummy_accuracy:.4f}, F1 Score: {dummy_f1:.4f}")

# Standardize the features; the scaler is fitted on the training split only
# and then applied unchanged to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit logistic regression on the scaled training data and predict the test split.
model = LogisticRegression(max_iter=200, random_state=42).fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

# Report the same metrics as for the baseline classifier.
accuracy, f1 = accuracy_score(y_test, y_pred), f1_score(y_test, y_pred)
print(f"Logistic Regression Accuracy: {accuracy:.4f}, F1 Score: {f1:.4f}")

Dummy Classifier Accuracy: 0.5866, F1 Score: 0.0000
Logistic Regression Accuracy: 0.8101, F1 Score: 0.7639
