In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

titanic_data = pd.read_csv('titanic.csv')
print(f"Исходный размер датасета: {titanic_data.shape[0]} строк, {titanic_data.shape[1]} столбцов")
print("\nПервые 5 строк данных:")
display(titanic_data.head())
print("\nИнформация о датасете:")
display(titanic_data.info())
print("\nСтатистика по числовым столбцам:")
display(titanic_data.describe())

Исходный размер датасета: 891 строк, 12 столбцов

Первые 5 строк данных:


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S



Информация о датасете:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


None


Статистика по числовым столбцам:


Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [13]:
initial_rows = len(titanic_data)
titanic_data = titanic_data.dropna()
dropped_rows = initial_rows - len(titanic_data)
print(f"Удалено строк с пропущенными значениями: {dropped_rows} ({dropped_rows/initial_rows*100:.2f}%)")
print(f"Осталось строк: {len(titanic_data)}")

Удалено строк с пропущенными значениями: 0 (0.00%)
Осталось строк: 183


In [14]:
cols_to_drop = ['Name', 'Ticket', 'Cabin']
print(f"Удаляемые столбцы: {cols_to_drop}")
titanic_data = titanic_data.drop(cols_to_drop, axis=1)
print("\nСтолбцы после удаления:")
print(titanic_data.columns.tolist())

Удаляемые столбцы: ['Name', 'Ticket', 'Cabin']

Столбцы после удаления:
['PassengerId', 'Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']


In [15]:
titanic_data['Sex'] = titanic_data['Sex'].map({'male': 0, 'female': 1})
titanic_data['Embarked'] = titanic_data['Embarked'].map({'C': 1, 'Q': 2, 'S': 3})

print("\nДанные после перекодировки:")
display(titanic_data.head())
titanic_data = titanic_data.drop('PassengerId', axis=1)
print(titanic_data.columns.tolist())


Данные после перекодировки:


Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
1,2,1,1,1,38.0,1,0,71.2833,1
3,4,1,1,1,35.0,1,0,53.1,3
6,7,0,1,0,54.0,0,0,51.8625,3
10,11,1,3,1,4.0,1,1,16.7,3
11,12,1,1,1,58.0,0,0,26.55,3


['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']


In [16]:
initial_total = len(pd.read_csv('titanic.csv'))
final_total = len(titanic_data)
lost_percent = ((initial_total - final_total) / initial_total) * 100
print(f"Исходное количество строк: {initial_total}")
print(f"Конечное количество строк: {final_total}")
print(f"Процент потерянных данных: {lost_percent:.2f}%")

Исходное количество строк: 891
Конечное количество строк: 183
Процент потерянных данных: 79.46%


In [17]:
X = titanic_data.drop('Survived', axis=1)
y = titanic_data['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
print(f"Обучающая выборка: {X_train.shape[0]} примеров")
print(f"Тестовая выборка: {X_test.shape[0]} примеров")

Обучающая выборка: 128 примеров
Тестовая выборка: 55 примеров


In [20]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Точность модели: {accuracy:.4f}")

print("\nМатрица ошибок:")
print(confusion_matrix(y_test, y_pred))

print(classification_report(y_test, y_pred))

Точность модели: 0.6909

Матрица ошибок:
[[13  7]
 [10 25]]
              precision    recall  f1-score   support

           0       0.57      0.65      0.60        20
           1       0.78      0.71      0.75        35

    accuracy                           0.69        55
   macro avg       0.67      0.68      0.68        55
weighted avg       0.70      0.69      0.69        55



In [21]:
X_no_embarked = X.drop('Embarked', axis=1)
X_train_ne, X_test_ne, y_train_ne, y_test_ne = train_test_split(X_no_embarked, y, test_size=0.3, random_state=42)
model_ne = LogisticRegression(max_iter=1000)
model_ne.fit(X_train_ne, y_train_ne)
y_pred_ne = model_ne.predict(X_test_ne)
accuracy_ne = accuracy_score(y_test_ne, y_pred_ne)
print(f"Точность с признаком Embarked: {accuracy:.4f}")
print(f"Точность без признака Embarked: {accuracy_ne:.4f}")
print(f"Разница в точности: {accuracy - accuracy_ne:.4f}")

# Анализ коэффициентов модели
print("\nКоэффициенты модели (влияние признаков):")
for feature, coef in zip(X.columns, model.coef_[0]):
    print(f"{feature}: {coef:.4f}")

Точность с признаком Embarked: 0.6909
Точность без признака Embarked: 0.6909
Разница в точности: 0.0000

Коэффициенты модели (влияние признаков):
Pclass: -0.5906
Sex: 2.4571
Age: -0.0250
SibSp: -0.2224
Parch: -0.3973
Fare: 0.0029
Embarked: -0.1942
