In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import scikitplot as skplt

from sklearn.impute import KNNImputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score as acc
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.svm import LinearSVC, SVC
from sklearn.feature_selection import mutual_info_classif, SelectFromModel, RFE
from mlxtend.feature_selection import ExhaustiveFeatureSelector
from sklearn.linear_model import LogisticRegression, LinearRegression, Lasso
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from imblearn.over_sampling import RandomOverSampler

from plots_draw import draw_cat, draw_num
from stats_methods import sigma_method

import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt

In [2]:
train_df = pd.read_csv('train.csv', sep=',', header=0, index_col=False)
cat_cols = ['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']
num_cols = ['PassengerId', 'Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']

In [3]:
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,486,0,3,"Lefebre, Miss. Jeannie",female,,3,1,4133,25.4667,,S
1,426,0,3,"Wiseman, Mr. Phillippe",male,,0,0,A/4. 34244,7.25,,S
2,780,1,1,"Robert, Mrs. Edward Scott (Elisabeth Walton Mc...",female,43.0,0,1,24160,211.3375,B3,S
3,248,1,2,"Hamalainen, Mrs. William (Anna)",female,24.0,0,2,250649,14.5,,S
4,184,1,2,"Becker, Master. Richard F",male,1.0,2,1,230136,39.0,F4,S


In [4]:
train_df.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [5]:
train_df.describe(include='all')

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,534.0,534.0,534.0,534,534,432.0,534.0,534.0,534,534.0,120,534
unique,,,,534,2,,,,447,,102,3
top,,,,"Lefebre, Miss. Jeannie",male,,,,CA. 2343,,B96 B98,S
freq,,,,1,346,,,,6,,3,380
mean,438.878277,0.391386,2.327715,,,29.612662,0.548689,0.38015,,32.780453,,
std,256.203078,0.488518,0.839514,,,14.740699,1.206793,0.81936,,52.203248,,
min,1.0,0.0,1.0,,,0.42,0.0,0.0,,0.0,,
25%,213.25,0.0,2.0,,,20.0,0.0,0.0,,7.8958,,
50%,435.5,0.0,3.0,,,28.0,0.0,0.0,,13.0,,
75%,654.75,1.0,3.0,,,38.25,1.0,0.0,,31.275,,


In [6]:
print(f'Кол-во записей: {len(train_df)}')
train_df.isnull().sum()

Кол-во записей: 534


PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            102
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          414
Embarked         0
dtype: int64

#### Признак Cabin не имеет смысла восстанавливать, поэтому просто сносим кабину)

In [7]:
cat_cols.remove('Cabin')
train_df = train_df.dropna(axis=1, thresh=300)

#### Так же удалим столбцы которые точно не будут играть роль при прогнозирование

На счёт порта посадки я не до конца уверен, поэтому пусть пока он будет

In [8]:
cat_cols.remove('Name')
cat_cols.remove('Ticket')
num_cols.remove('PassengerId')
train_df = train_df.drop(['Name', 'Ticket', 'PassengerId'], axis=1)

#### Воспользуемся методом k-ближайших соседей для заполнения возраста

In [9]:
train_df['Age'] = train_df['Age'].fillna(train_df['Age'].mean())
# knn = KNNImputer(n_neighbors = 5)
# inputed = knn.fit_transform(train_df[num_cols])

# train_df['Age'] = inputed[:, 1]

#### Нормализируем данные

In [10]:
scaler = MinMaxScaler()

x[all_cols] = scaler.fit_transform(x[all_cols])
# train_df[num_cols] = scaler.fit_transform(train_df[num_cols])

NameError: name 'x' is not defined

#### Закодировали категориальные признаки

In [None]:
# le = LabelEncoder()
# for name in cat_cols:
#     train_df[name] = le.fit_transform(train_df[[name]])
train_df = pd.get_dummies(train_df, columns = cat_cols)
all_cols = train_df.columns[1:]
x = train_df[all_cols]
y = train_df['Survived']

#### Проведём отбор информативных признаков

##### Метод фильтрации (Inforamtion Gain)

In [None]:
importances = mutual_info_classif(x, y)
feature_importances = pd.Series(importances, train_df.columns[1:])
feature_importances.plot(kind='barh', color='teal')
plt.show()

##### Метод обёртки (recursive feature elimination)

In [None]:
lr = LinearRegression()
rfe = RFE(lr, n_features_to_select=4, step=1)
selector = rfe.fit(x, y)
selector.get_feature_names_out()

##### Встроенный метод (Регуляризация LASSO)

In [None]:
clf = Lasso(alpha=0.01)
clf.fit(x, y)
print(clf.coef_)

### Доверимся методу обёртки и удалим столбцы, которые не являются значимыми

In [None]:
# new_df = train_df.drop(columns=set(train_df.columns) - set(selector.get_feature_names_out()))
# x = new_df
x # попробовать все колонки

In [None]:
y.value_counts().plot(kind='bar')
plt.show()

### Оверсемплинг

In [None]:
# x_resampled, y_resampled = SMOTE().fit_resample(x, y)
# x = x_resampled
# y = y_resampled

ros = RandomOverSampler(random_state=0)
x_ros, y_ros= ros.fit_resample(x, y)
x = x_ros
y = y_ros

In [None]:
y.value_counts().plot(kind='bar')
plt.show()

In [None]:
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size = 0.3, random_state=0)
print(x_train)

### KNN

In [None]:
params = {'n_neighbors': range(1, 20),
          'weights': ('uniform', 'distance'),
          'metric': ('euclidean', 'manhattan', 'chebyshev'),
          'algorithm': ('auto', 'ball_tree', 'kd_tree', 'brute')}

# knn = KNeighborsClassifier()
# grid = GridSearchCV(knn, params) # указать метрику
# grid.fit(x_train, y_train)
# best_knn = grid.best_estimator_
# print(best_knn.score(x_val, y_val)) # округлить и оформить красиво


knn = KNeighborsClassifier()
 
# определение диапазона гиперпараметров
# params = {'n_neighbors': range(1, 20),
#           'weights': ['uniform', 'distance']}
 
# определение метрики оценки производительности
scoring = 'roc_auc'
 
# создание объекта GridSearchCV и обучение модели
grid = GridSearchCV(knn, params, scoring=scoring)
grid.fit(x_train, y_train)
 
# вывод лучшего набора гиперпараметров и их значения по метрике roc_auc
print('Best parameters:', grid.best_params_)
print('Best scores:', grid.best_score_)
 
# предсказание вероятностей на тестовых данных
y_pred_prob = grid.predict_proba(x_val)
best_knn = grid.best_estimator_
# вычисление метрики roc_auc
roc_auc = roc_auc_score(y_val, y_pred_prob[:,1])
print('ROC-AUC:', roc_auc)

### Логистическая регрессия

In [None]:
parameters = {'C':range(1, 20), 'penalty':(None, 'l1', 'l2', 'elasticnet'), 'solver':(['saga']), 'l1_ratio':([0.5])}

clf = LogisticRegression()
grid = GridSearchCV(clf, parameters)
grid.fit(x_train, y_train)
best_clf = grid.best_estimator_
print(best_clf.score(x_val, y_val))

### Метод опорных векторов

In [None]:
# parameters = {'kernel':('rbf', 'linear', 'poly'), 'C':(range(1, 6))}
# lsvc = SVC(probability=True)

# grid = GridSearchCV(lsvc, parameters)
# grid.fit(x_train, y_train)
# best_lsvc = grid.best_estimator_
# print(best_lsvc.score(x_val, y_val))

### Наивный байесовский классификатор

In [None]:
gaus = GaussianNB()
gaus.fit(x_train, y_train) # cross validate
print(gaus.score(x_val, y_val))

### Дискриминантный анализ

In [None]:
disc = LinearDiscriminantAnalysis()
disc.fit(x_train, y_train)
print(disc.score(x_val, y_val))

In [None]:
def roc_auc_plot(y_train, y_train_predicted, y_val, y_val_predicted, title):
    train_auc = roc_auc_score(y_train, y_train_predicted)
    test_auc = roc_auc_score(y_val, y_val_predicted)

    plt.figure(figsize=(10,7))
    plt.title(title)
    plt.plot(*roc_curve(y_train, y_train_predicted)[:2], label='train AUC={:.4f}'.format(train_auc))
    plt.plot(*roc_curve(y_val, y_val_predicted)[:2], label='test AUC={:.4f}'.format(test_auc))
    legend_box = plt.legend(fontsize='large', framealpha=1).get_frame()
    legend_box.set_facecolor("white")
    legend_box.set_edgecolor("black")
    plt.plot(np.linspace(0,1,100), np.linspace(0,1,100))
    plt.show()

In [None]:
y_train_predicted = best_knn.predict_proba(x_train)[:, 1]
y_val_predicted = best_knn.predict_proba(x_val)[:, 1]

In [None]:
roc_auc_plot(y_train, y_train_predicted, y_val, y_val_predicted, "KNN") # посмотреть встроенный метод

In [None]:
y_train_predicted = best_clf.predict_proba(x_train)[:, 1]
y_val_predicted = best_clf.predict_proba(x_val)[:, 1]
roc_auc_plot(y_train, y_train_predicted, y_val, y_val_predicted, "Logistic regression")

In [None]:
# y_train_predicted = best_lsvc.predict_proba(x_train)[:, 1]
# y_val_predicted = best_lsvc.predict_proba(x_val)[:, 1]
# roc_auc_plot(y_train, y_train_predicted, y_val, y_val_predicted, "Vectors")

In [None]:
y_train_predicted = gaus.predict_proba(x_train)[:, 1]
y_val_predicted = gaus.predict_proba(x_val)[:, 1]
roc_auc_plot(y_train, y_train_predicted, y_val, y_val_predicted, "Бейсевскийцотылва")

In [None]:
y_train_predicted = disc.predict_proba(x_train)[:, 1]
y_val_predicted = disc.predict_proba(x_val)[:, 1]
roc_auc_plot(y_train, y_train_predicted, y_val, y_val_predicted, "Дискриминация")

In [None]:
test_df = pd.read_csv('test.csv', sep=',', header=0, index_col=False)
cat_cols = ['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']
num_cols = ['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
test_df.describe(include='all')

In [None]:
cat_cols.remove('Cabin')
cat_cols.remove('Name')
cat_cols.remove('Ticket')
num_cols.remove('PassengerId')
test_id = test_df['PassengerId']
test_df = test_df.drop(['Name', 'Ticket', 'PassengerId', 'Cabin'], axis=1)

test_df['Age'] = test_df['Age'].fillna(test_df['Age'].mean())
# knn = KNNImputer(n_neighbors = 5)
# inputed = knn.fit_transform(test_df[num_cols])
# test_df['Age'] = inputed[:, 1]

test_df['Embarked'] = test_df['Embarked'].fillna(test_df['Embarked'].mode())

# for name in cat_cols:
#     test_df[name] = le.fit_transform(test_df[[name]])

test_df = pd.get_dummies(test_df, columns = cat_cols)

scaler = MinMaxScaler()
test_df[num_cols] = scaler.fit_transform(test_df[num_cols])
test_df[cat_cols] = scaler.fit_transform(test_df[cat_cols])

for name in cat_cols:
    test_df[name] = le.fit_transform(test_df[[name]])

# test_df = pd.get_dummies(test_df, columns = cat_cols)

# test_df = test_df.drop(['Age', 'Parch', 'Embarked'], axis=1)

def create_submission(id_c, pred):
    return pd.DataFrame({
        'PassengerId': id_c,
        'Survived': pred
    })

In [None]:
print(test_df)

In [None]:
sub_knn = create_submission(test_id, best_knn.predict(test_df))
sub_knn.to_csv('./knn.csv', index=False)