In [46]:
import pandas as pd
import numpy as np
import time
from scipy import spatial
from sklearn.model_selection import train_test_split

In [34]:
def loadTrainData(filePath):
    """Load the Titanic training CSV and return numeric feature/label arrays.

    Parameters
    ----------
    filePath : str, path-like or file-like
        Anything accepted by ``pandas.read_csv``. The file must contain the
        columns Cabin, Ticket, Name, Survived, Pclass, Sex, Age, SibSp, Parch,
        Fare and Embarked.

    Returns
    -------
    X : numpy.ndarray, shape (n_rows, 7)
        Columns in order: Pclass, Sex, Age, SibSp, Parch, Fare, Embarked.
    y : numpy.ndarray, shape (n_rows, 1)
        The ``Survived`` column as a column vector.

    Raises
    ------
    ValueError
        If Sex or Embarked contains a value outside the known categories, or
        if Sex has missing values (they cannot be cast to an integer code).
    """
    data = pd.read_csv(filePath)
    # Cabin, Ticket and Name are not used in the analysis.
    data = data.drop(columns=['Cabin', 'Ticket', 'Name'])
    # Impute missing values: Age with the column median, Embarked with the
    # most frequent value. Plain assignment is used instead of
    # `data[col].fillna(..., inplace=True)`, which operates on a temporary
    # object under pandas Copy-on-Write and would leave the NaNs in place.
    data['Age'] = data['Age'].fillna(data['Age'].median())
    # value_counts() is already sorted by descending frequency.
    maxEmbarked = data['Embarked'].value_counts().index[0]
    data['Embarked'] = data['Embarked'].fillna(maxEmbarked)
    # Encode categories as integers: male -> 1, female -> 0; C -> 0, Q -> 1, S -> 2.
    categoryCodes = (
        ('Sex', {'male': 1, 'female': 0}),
        ('Embarked', {'C': 0, 'Q': 1, 'S': 2}),
    )
    for column, codes in categoryCodes:
        encoded = data[column].map(codes)
        # A non-null input that maps to NaN is a category we do not know.
        unknown = data[column].notna() & encoded.isna()
        if unknown.any():
            raise ValueError('unexpected %s value(s): %r'
                             % (column, data.loc[unknown, column].unique().tolist()))
        data[column] = encoded.astype(np.int64)
    features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
    tag = ['Survived']
    # Convert to plain numpy arrays.
    X = np.array(data[features])
    y = np.array(data[tag])
    return X, y

In [35]:
def loadTestData(filePath):
    """Load the Titanic test CSV and return the numeric feature matrix.

    Parameters
    ----------
    filePath : str, path-like or file-like
        Anything accepted by ``pandas.read_csv``. The file must contain the
        columns Cabin, Ticket, Name, Pclass, Sex, Age, SibSp, Parch, Fare and
        Embarked.

    Returns
    -------
    X : numpy.ndarray, shape (n_rows, 7)
        Columns in order: Pclass, Sex, Age, SibSp, Parch, Fare, Embarked.
        Missing Age and Fare values are replaced by the respective column
        median; missing Sex/Embarked values are not imputed here and stay NaN.

    Raises
    ------
    ValueError
        If Sex or Embarked contains a non-missing value outside the known
        categories.
    """
    data = pd.read_csv(filePath)
    # Cabin, Ticket and Name are not used as features.
    data = data.drop(columns=['Cabin', 'Ticket', 'Name'])
    # Median-impute the numeric columns. Plain assignment is used instead of
    # `data[col].fillna(..., inplace=True)`, which operates on a temporary
    # object under pandas Copy-on-Write and would leave the NaNs in place.
    data['Age'] = data['Age'].fillna(data['Age'].median())
    data['Fare'] = data['Fare'].fillna(data['Fare'].median())
    # Encode categories as integers: male -> 1, female -> 0; C -> 0, Q -> 1, S -> 2.
    categoryCodes = (
        ('Sex', {'male': 1, 'female': 0}),
        ('Embarked', {'C': 0, 'Q': 1, 'S': 2}),
    )
    for column, codes in categoryCodes:
        encoded = data[column].map(codes)
        # A non-null input that maps to NaN is a category we do not know.
        unknown = data[column].notna() & encoded.isna()
        if unknown.any():
            raise ValueError('unexpected %s value(s): %r'
                             % (column, data.loc[unknown, column].unique().tolist()))
        data[column] = encoded
    data['Fare'] = pd.to_numeric(data['Fare'])
    features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
    X = np.array(data[features])
    return X

In [54]:
class KNN:
    """Binary k-nearest-neighbour classifier backed by a SciPy KD-tree.

    Labels are expected to be 0 or 1. Prediction is a majority vote among the
    k nearest training points; a tie is resolved in favour of class 1.
    """

    def __init__(self):
        pass

    def fit(self, X, y):
        """Store the training data and build the KD-tree over ``X``.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
        y : array-like of 0/1 labels, one per row of ``X``; either 1-D or a
            column vector.
        """
        self.X = X
        self.y = y
        self.tree = spatial.KDTree(self.X)

    def predict(self, X, k = 9):
        """Predict a 0/1 label for every row of ``X``.

        Parameters
        ----------
        X : array-like, shape (n_queries, n_features)
            A single 1-D point is treated as one query row.
        k : int, default 9
            Number of neighbours that vote. Values larger than the number of
            training points are reduced to that number.

        Returns
        -------
        numpy.ndarray of float, shape (n_queries,)
            1.0 where class 1 has at least as many votes as class 0, else 0.0.

        Raises
        ------
        ValueError
            If ``k`` is smaller than 1.
        """
        if k < 1:
            raise ValueError('k must be at least 1, got %r' % (k,))
        X = np.atleast_2d(np.asarray(X))
        # Asking for more neighbours than there are training points makes
        # query() pad the result with an out-of-range index, so cap k.
        k = min(int(k), self.tree.n)
        # tree.query() returns the distances to, and the indices of, the k
        # nearest training points.
        _, passengersId = self.tree.query(X, k)
        # For k == 1 query() drops the neighbour axis; restore it so the vote
        # below always sees an (n_queries, k) index matrix.
        passengersId = np.asarray(passengersId).reshape(X.shape[0], k)
        # Accept labels given as a column vector as well as a flat array.
        labels = np.asarray(self.y).reshape(-1)
        votes = labels[passengersId]
        zeros = np.count_nonzero(votes == 0, axis=1)
        ones = np.count_nonzero(votes == 1, axis=1)
        # Class 0 wins only with a strict majority; ties go to class 1.
        return np.where(zeros > ones, 0.0, 1.0)


In [55]:
# Paths to the training and test CSV files.
trainFilePath = '../titanic/train.csv'
testFilePath = '../titanic/test.csv'

# Load the training features/labels and flatten the labels to 1-D.
X, y = loadTrainData(trainFilePath)
y = y.flatten()
X_test = loadTestData(testFilePath)

# Hold out 30% of the training rows for validation, stratified on the label,
# with a fixed seed so the split is the same on every run.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Fit the classifier on the training portion only.
model = KNN()
model.fit(X_train, y_train)

In [69]:
# Candidate neighbour counts: every odd k from 5 to 29.
ks = list(range(5, 30, 2))
tot = y_valid.shape[0]
accs = np.empty(len(ks))
for slot, k in enumerate(ks):
    res = model.predict(X_valid, k=k)
    # With 0/1 labels, |y - prediction| summed over rows is the error count.
    acc = 1 - np.abs(y_valid - res).sum() / tot
    accs[slot] = acc
    print('k = %d, acc = %f' % (k, acc))
# Keep the first k that reaches the highest validation accuracy.
best_k = ks[accs.argmax()]
print('best k is %d and best acc is %f.' % (best_k, accs.max()))

k = 5, acc = 0.690299
k = 7, acc = 0.679104
k = 9, acc = 0.679104
k = 11, acc = 0.671642
k = 13, acc = 0.679104
k = 15, acc = 0.686567
k = 17, acc = 0.690299
k = 19, acc = 0.690299
k = 21, acc = 0.694030
k = 23, acc = 0.682836
k = 25, acc = 0.671642
k = 27, acc = 0.649254
k = 29, acc = 0.679104
best k is 21 and best acc is 0.694030.


In [67]:
# Predict the test set with the k chosen on the validation split.
res = model.predict(X_test, k=best_k)

# Re-read the test CSV to recover the passenger ids, which are not part of
# the feature matrix.
PassengerId = np.array(pd.read_csv(testFilePath)['PassengerId'])

# Write the two-column result file; predictions are cast from float to int32.
ans = pd.DataFrame({
    'PassengerId': PassengerId,
    'Survived': res.astype(np.int32),
})
ans.to_csv('./result.csv', index=False)

# Accuracy: 65.550%

[5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29]
