In [2]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score


train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')  # no 'Survived' col
label_test = pd.read_csv('titanic.csv')
label_test = label_test['Survived']


# [2 Marks] [Remove Features]
train_data = train_data.drop(['Cabin', 'Name', 'Ticket'], axis=1)  # 去掉列特征Name、Ticket、Cabin
test_data = test_data.drop(['Cabin', 'Name', 'Ticket'], axis=1)  # 去掉列特征Name、Ticket、Cabin


# [6 Marks] [Missing Values] 缺失值处理

# a.Age       mean
age_imputer = SimpleImputer(strategy='mean')
train_data['Age'] = age_imputer.fit_transform(train_data[['Age']])
test_data['Age'] = age_imputer.fit_transform(test_data[['Age']])

# b.Fare      mean
fare_imputer = SimpleImputer(strategy='mean')
train_data['Fare'] = fare_imputer.fit_transform(train_data[['Fare']])
test_data['Fare'] = fare_imputer.fit_transform(test_data[['Fare']])

# c.Embarked  most_freq
embarked_most_freq = train_data['Embarked'].value_counts().idxmax()
train_data['Embarked'] = train_data['Embarked'].fillna(embarked_most_freq)
test_data['Embarked'] = test_data['Embarked'].fillna(embarked_most_freq)


# [6 Marks] [Encoding Categorical Variables] 变量编码
sex_mapping = {'female': 0, 'male': 1}
train_data['Sex'] = train_data['Sex'].map(sex_mapping)
test_data['Sex'] = test_data['Sex'].map(sex_mapping)
embarked_mapping = {'S': 0, 'C': 1, 'Q': 2}
train_data['Embarked'] = train_data['Embarked'].map(embarked_mapping)
test_data['Embarked'] = test_data['Embarked'].map(embarked_mapping)


# [6 Marks]
"""
# a. (what?why contains -1?)
train_data = data[data['Survived'] == 0]
test_data = data[data['Survived'] == 1]
"""
# b.
feature_train = train_data.drop('Survived', axis=1)
label_train = train_data['Survived']
feature_test = test_data


# c.
train_data = train_data.drop('PassengerId', axis=1)
test_id = test_data['PassengerId']
test_data = test_data.drop('PassengerId', axis=1)


# [8 Marks]
"""
k_best = 1
accuracy_best = 0
for k in range(1, 20):
    knn = KNeighborsClassifier(n_neighbors=k)

    knn.fit(feature_train, label_train)

    label_pred = knn.predict(feature_test)

    accuracy = accuracy_score(label_test, label_pred)
    if accuracy > accuracy_best:
        k_best = k
        accuracy_best = accuracy

print("k_best = ", k_best, ": "f'Accuracy:{accuracy_best:.2f}')
"k_best =  7 : Accuracy:0.67"
"""

knn = KNeighborsClassifier(n_neighbors=7)

knn.fit(feature_train, label_train)

label_pred = knn.predict(feature_test)

accuracy = accuracy_score(label_test, label_pred)

print("k = 7, accuracy = ", accuracy)

submission = pd.DataFrame({
    'PassengerId': test_id,
    'Survived': label_pred
})

# 写入 CSV 文件
submission.to_csv('submission.csv', index=False,header=True, mode='w')
print("Submission file created: 'submission.csv'")


k = 7, accuracy =  0.6674641148325359
Submission file created: 'submission.csv'
