## Задание 4.1


In [1]:
import pandas as pd
import numpy as np

import warnings
warnings.simplefilter("ignore", UserWarning)

import matplotlib.pyplot as plt

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import GridSearchCV, train_test_split, RandomizedSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_regression
from sklearn import metrics

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [2]:
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### Исключение признаков, которые могут привести к переобучению

In [3]:
print(f'Общее количество данных: {len(df)}')

Общее количество данных: 891


In [4]:
df.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1, inplace=True)
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


### EDA

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       714 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 55.8+ KB


Видно, что не все столбцы заполнены данными. Оценим какое количество нулей в наших данных.

In [6]:
df.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

Два признака содержат записи с нулевыми значениями. Мы можем убрать эти записи или заполнить их средним значением.

In [7]:
df_2 = df.dropna()
df_2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 712 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  712 non-null    int64  
 1   Pclass    712 non-null    int64  
 2   Sex       712 non-null    object 
 3   Age       712 non-null    float64
 4   SibSp     712 non-null    int64  
 5   Parch     712 non-null    int64  
 6   Fare      712 non-null    float64
 7   Embarked  712 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 50.1+ KB


После удаления осталось 712 записей.

In [8]:
df_2.Survived.value_counts(normalize=True)*100

0    59.550562
1    40.449438
Name: Survived, dtype: float64

Данные с нужными нам классами распределены в соотношении 60% к 40%.

In [9]:
df_2.corr()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
Survived,1.0,-0.356462,-0.082446,-0.015523,0.095265,0.2661
Pclass,-0.356462,1.0,-0.365902,0.065187,0.023666,-0.552893
Age,-0.082446,-0.365902,1.0,-0.307351,-0.187896,0.093143
SibSp,-0.015523,0.065187,-0.307351,1.0,0.383338,0.13986
Parch,0.095265,0.023666,-0.187896,0.383338,1.0,0.206624
Fare,0.2661,-0.552893,0.093143,0.13986,0.206624,1.0


### Разделение выборки на train и test

In [10]:
X = df_2.drop(["Survived"], axis=1)
y = df_2["Survived"]

### Преобразование признаков

In [11]:
ohe = OneHotEncoder()
ohe_transformed = ohe.fit_transform(X[['Sex', 'Embarked']])
print(ohe_transformed.toarray())

[[0. 1. 0. 0. 1.]
 [1. 0. 1. 0. 0.]
 [1. 0. 0. 0. 1.]
 ...
 [1. 0. 0. 0. 1.]
 [0. 1. 1. 0. 0.]
 [0. 1. 0. 1. 0.]]


In [12]:
X[ohe.categories_[0]] = ohe_transformed.toarray()[:,:2]
X[ohe.categories_[1]] = ohe_transformed.toarray()[:,2:]

In [13]:
X.drop(['Sex', 'Embarked'], axis=1, inplace=True)
X.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,female,male,C,Q,S
0,3,22.0,1,0,7.25,0.0,1.0,0.0,0.0,1.0
1,1,38.0,1,0,71.2833,1.0,0.0,1.0,0.0,0.0
2,3,26.0,0,0,7.925,1.0,0.0,0.0,0.0,1.0
3,1,35.0,1,0,53.1,1.0,0.0,0.0,0.0,1.0
4,3,35.0,0,0,8.05,0.0,1.0,0.0,0.0,1.0


In [14]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 712 entries, 0 to 890
Data columns (total 10 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Pclass  712 non-null    int64  
 1   Age     712 non-null    float64
 2   SibSp   712 non-null    int64  
 3   Parch   712 non-null    int64  
 4   Fare    712 non-null    float64
 5   female  712 non-null    float64
 6   male    712 non-null    float64
 7   C       712 non-null    float64
 8   Q       712 non-null    float64
 9   S       712 non-null    float64
dtypes: float64(7), int64(3)
memory usage: 61.2 KB


In [15]:
# Scaling train dataset
scaler = StandardScaler()
column_names = X.columns
X = scaler.fit_transform(X)
X = pd.DataFrame(X)
X.columns = column_names
X.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,female,male,C,Q,S
0,0.9086,-0.527669,0.522511,-0.506787,-0.51638,-0.756138,0.756138,-0.472618,-0.202326,0.53404
1,-1.482983,0.577094,0.522511,-0.506787,0.694046,1.322511,-1.322511,2.115874,-0.202326,-1.872519
2,0.9086,-0.251478,-0.552714,-0.506787,-0.50362,1.322511,-1.322511,-0.472618,-0.202326,0.53404
3,-1.482983,0.369951,0.522511,-0.506787,0.350326,1.322511,-1.322511,-0.472618,-0.202326,0.53404
4,0.9086,0.369951,-0.552714,-0.506787,-0.501257,-0.756138,0.756138,-0.472618,-0.202326,0.53404


### Разделение данных на train и test

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=11)

### Обучение логистической регрессии

In [17]:
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

LogisticRegression()

In [18]:
def compute_metrics(predictions, y):
    accuracy = metrics.accuracy_score(y,predictions)
    precision = metrics.precision_score(y,predictions)
    recall = metrics.recall_score(y,predictions)
    f1_score = metrics.f1_score(y,predictions)
    confusion_matrix = metrics.confusion_matrix(y,predictions)
    
    print(f'''Accuracy:  {np.round(accuracy,2)}
Precision: {np.round(precision,2)}
Recall:    {np.round(recall,2)}
F1_score:  {np.round(f1_score,2)}

confusion_matrix {confusion_matrix[0]}
                 {confusion_matrix[1]}''')

In [19]:
y_train_predictions = logreg.predict(X_train)
print('Train metrics:')
compute_metrics(y_train_predictions, y_train)

Train metrics:
Accuracy:  0.8
Precision: 0.77
Recall:    0.69
F1_score:  0.73

confusion_matrix [268  40]
                 [ 58 132]


In [20]:
y_test_predictions = logreg.predict(X_test)
print('Test metrics:')
compute_metrics(y_test_predictions, y_test)

Test metrics:
Accuracy:  0.79
Precision: 0.8
Recall:    0.71
F1_score:  0.76

confusion_matrix [99 17]
                 [28 70]


### Обучение KNN

In [21]:
knn = KNeighborsRegressor()
knn.fit(X_train, y_train)

KNeighborsRegressor()

In [22]:
y_train_predictions = knn.predict(X_train)
y_train_predictions = [1 if x >= 0.5 else 0 for x in y_train_predictions]
print('Train metrics:')
compute_metrics(y_train_predictions, y_train)

Train metrics:
Accuracy:  0.85
Precision: 0.83
Recall:    0.75
F1_score:  0.79

confusion_matrix [278  30]
                 [ 47 143]


In [23]:
y_test_predictions = knn.predict(X_test)
y_test_predictions = [1 if x >= 0.5 else 0 for x in y_test_predictions]
print('Test metrics:')
compute_metrics(y_test_predictions, y_test)

Test metrics:
Accuracy:  0.82
Precision: 0.84
Recall:    0.76
F1_score:  0.8

confusion_matrix [102  14]
                 [24 74]


Модель KNN со стандартными параметрами обучилась лучше, чем логистическая регрессия.

### Подбор гиперпараметров

Логистическая регрессия

In [24]:
params = dict(
    penalty=['l1', 'l2', 'elasticnet', None],
    solver=['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'],
    max_iter=range(80, 120),
)

In [25]:
logreg = LogisticRegression()
cv = RandomizedSearchCV(logreg, params, n_jobs=-1, cv=5)
cv.fit(X_train, y_train)

RandomizedSearchCV(cv=5, estimator=LogisticRegression(), n_jobs=-1,
                   param_distributions={'max_iter': range(80, 120),
                                        'penalty': ['l1', 'l2', 'elasticnet',
                                                    None],
                                        'solver': ['lbfgs', 'liblinear',
                                                   'newton-cg',
                                                   'newton-cholesky', 'sag',
                                                   'saga']})

In [26]:
best_params = cv.best_params_
best_params

{'max_iter': 93, 'penalty': 'l1', 'solver': 'saga'}

In [27]:
logreg = LogisticRegression(**best_params)
logreg.fit(X_train, y_train)

LogisticRegression(max_iter=93, penalty='l1', solver='saga')

In [28]:
y_train_predictions = logreg.predict(X_train)
print('Train metrics:')
compute_metrics(y_train_predictions, y_train)

Train metrics:
Accuracy:  0.81
Precision: 0.77
Recall:    0.7
F1_score:  0.73

confusion_matrix [269  39]
                 [ 57 133]


In [29]:
y_test_predictions = logreg.predict(X_test)
print('Test metrics:')
compute_metrics(y_test_predictions, y_test)

Test metrics:
Accuracy:  0.79
Precision: 0.81
Recall:    0.72
F1_score:  0.76

confusion_matrix [99 17]
                 [27 71]


KNN

In [30]:
params = dict(
    n_neighbors=range(1, 40),
    weights=['uniform', 'distance'],
    algorithm=['auto', 'ball_tree', 'kd_tree', 'brute'],
    leaf_size=range(20, 40),
    p=[1, 2],
)

In [31]:
knn = KNeighborsRegressor()
cv = RandomizedSearchCV(knn, params, n_jobs=-1, cv=5)
cv.fit(X_train, y_train)

RandomizedSearchCV(cv=5, estimator=KNeighborsRegressor(), n_jobs=-1,
                   param_distributions={'algorithm': ['auto', 'ball_tree',
                                                      'kd_tree', 'brute'],
                                        'leaf_size': range(20, 40),
                                        'n_neighbors': range(1, 40),
                                        'p': [1, 2],
                                        'weights': ['uniform', 'distance']})

In [32]:
best_params = cv.best_params_
best_params

{'algorithm': 'kd_tree',
 'leaf_size': 30,
 'n_neighbors': 25,
 'p': 1,
 'weights': 'uniform'}

In [33]:
knn = KNeighborsRegressor(**best_params)
knn.fit(X_train, y_train)

KNeighborsRegressor(algorithm='kd_tree', n_neighbors=25, p=1)

In [34]:
y_train_predictions = knn.predict(X_train)
y_train_predictions = [1 if x >= 0.5 else 0 for x in y_train_predictions]
print('Train metrics:')
compute_metrics(y_train_predictions, y_train)

Train metrics:
Accuracy:  0.81
Precision: 0.8
Recall:    0.65
F1_score:  0.72

confusion_matrix [278  30]
                 [ 67 123]


In [35]:
y_test_predictions = knn.predict(X_test)
y_test_predictions = [1 if x >= 0.5 else 0 for x in y_test_predictions]
print('Test metrics:')
compute_metrics(y_test_predictions, y_test)

Test metrics:
Accuracy:  0.8
Precision: 0.85
Recall:    0.68
F1_score:  0.76

confusion_matrix [104  12]
                 [31 67]


Вывод: после подбора гиперпараметров немного изменились итоговые характеристики обучения, метот KNN по прежнему выдает лучший результат.