In [4]:
import os
import cv2
import numpy as np
import missingno as msno
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import matplotlib
plt.style.use('ggplot')
from matplotlib.pyplot import figure
import pandas as pd
from scipy.stats import mannwhitneyu

from sklearn.model_selection import train_test_split, cross_val_score, KFold  # Модули для разделения данных и кросс-валидации
from sklearn.neighbors import KNeighborsClassifier  # Классификатор K-ближайших соседей
from sklearn.metrics import accuracy_score  # Модуль для оценки точности модели



In [9]:
train_data =  pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')
features = ['Sex', 'Pclass', 'SibSp', 'Parch']
y = train_data['Survived']
X = pd.get_dummies(train_data[features])
X_test = pd.get_dummies(test_data[features])

In [18]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
model.fit(X, y) #обучаем модель
prediction = model.predict(X_test) #делаем предсказание
output = pd.DataFrame({'PassengerId':test_data.PassengerId, 'Survived':prediction})

print(f"Accuracy: {accuracy_score(X_test, output)}")


ValueError: multiclass-multioutput is not supported

In [35]:
y = train_data['Survived']
X = pd.get_dummies(train_data[features])

# Разделение данных на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Обучение и оценка модели K-ближайших соседей
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)  # Обучение модели
y_pred = knn.predict(X_test)  # Прогнозирование на тестовой выбор-ке
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")  # Вывод точно-сти модели




Accuracy: 0.7932960893854749


In [36]:
k_values = [1, 3, 5, 7, 9, 11, 13, 15]
test_sizes = [0.1, 0.2, 0.3, 0.4]
results_hold_out = []
for k in k_values:
    for test_size in test_sizes:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(X_train, y_train)
        y_pred = knn.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        results_hold_out.append((k, test_size, accuracy))
# Создание DataFrame с результатами hold-out валидации
df_hold_out = pd.DataFrame(results_hold_out, columns=['k', 'test_size', 'accuracy'])
print(df_hold_out)


     k  test_size  accuracy
0    1        0.1  0.811111
1    1        0.2  0.798883
2    1        0.3  0.750000
3    1        0.4  0.697479
4    3        0.1  0.811111
5    3        0.2  0.793296
6    3        0.3  0.761194
7    3        0.4  0.756303
8    5        0.1  0.777778
9    5        0.2  0.782123
10   5        0.3  0.757463
11   5        0.4  0.770308
12   7        0.1  0.811111
13   7        0.2  0.776536
14   7        0.3  0.753731
15   7        0.4  0.781513
16   9        0.1  0.800000
17   9        0.2  0.776536
18   9        0.3  0.757463
19   9        0.4  0.787115
20  11        0.1  0.811111
21  11        0.2  0.776536
22  11        0.3  0.753731
23  11        0.4  0.775910
24  13        0.1  0.788889
25  13        0.2  0.782123
26  13        0.3  0.794776
27  13        0.4  0.775910
28  15        0.1  0.777778
29  15        0.2  0.782123
30  15        0.3  0.794776
31  15        0.4  0.770308


In [37]:
k_values = [1, 3, 5, 7, 9, 11, 13, 15]
folds = [3, 5, 7, 10]
results_cv = []

for k in k_values:
    for fold in folds:
        knn = KNeighborsClassifier(n_neighbors=k)
        cv = KFold(n_splits=fold, shuffle=True, random_state=42)
        cv_scores = cross_val_score(knn, X, y, cv=cv)
        results_cv.append((k, fold, np.mean(cv_scores)))
# Создание DataFrame с результатами кросс-валидации
df_cv = pd.DataFrame(results_cv, columns=['k', 'fold', 'mean_accuracy'])
print(df_cv)


     k  fold  mean_accuracy
0    1     3       0.735129
1    1     5       0.709347
2    1     7       0.711509
3    1    10       0.717129
4    3     3       0.762065
5    3     5       0.738522
6    3     7       0.731774
7    3    10       0.741798
8    5     3       0.794613
9    5     5       0.787885
10   5     7       0.790135
11   5    10       0.781136
12   7     3       0.785634
13   7     5       0.789015
14   7     7       0.792358
15   7    10       0.794607
16   9     3       0.786756
17   9     5       0.790139
18   9     7       0.792376
19   9    10       0.790125
20  11     3       0.786756
21  11     5       0.785644
22  11     7       0.789010
23  11    10       0.785618
24  13     3       0.786756
25  13     5       0.791250
26  13     7       0.793500
27  13    10       0.792360
28  15     3       0.786756
29  15     5       0.785644
30  15     7       0.793474
31  15    10       0.788989


In [38]:
optimal_k = df_cv.loc[df_cv['mean_accuracy'].idxmax()]['k']
print(f"Оптимальное значение K: {optimal_k}")
# Финальная модель с оптимальным K
final_knn = KNeighborsClassifier(n_neighbors=int(optimal_k))
final_knn.fit(X_train, y_train)
y_final_pred = final_knn.predict(X_test)
print(f"Финальная точность модели: {accuracy_score(y_test, y_final_pred)}")


Оптимальное значение K: 5.0
Финальная точность модели: 0.7703081232492998
