In [1]:
import pandas as pd
import numpy as np

In [2]:
    # titanic训练集所有特征如下：
    # passengerId 乘客编号
    # survived 是否存活 1是 0否
    # pclass 船舱等级  1=1st 2=2nd 3=3rd
    # name 姓名
    # sex 性别
    # age 年纪
    # sibsp 🚢上的兄弟姐妹/配偶个数
    # parch 🚢上的父母，孩子
    # ticket 船票号码
    # fare 船票价格
    # cabin 船舱号
    # embarked 登船港口  C = Cherbourg, Q = Queenstown, S = Southampton

    # Cabin船舱号有大量空值，对于空值填充可能有较大误差，所以我们先不考虑cabin作为特征
    # age，由于age缺失很少，我们使用年龄的中位数进行填充
    # passengerId是一个连续的序列，与结果无关，我们不选择这个作为特征
    # ticket是船票序列，我们不分析
    # embarked和sex这两个特征是字符串，进行处理
    # 将sex中male=1，female=0
    # embarked中 c=0，q=1，s=2

In [3]:
def loadTrainData(filePath):
    """Load the Titanic training CSV and convert it to perceptron-ready arrays.

    Preprocessing steps:
      * drop the unused ``Cabin``, ``Ticket`` and ``Name`` columns;
      * fill missing ``Age`` with the column median and missing ``Embarked``
        with the most frequent port;
      * encode ``Sex`` (male=1, female=0) and ``Embarked`` (C=0, Q=1, S=2);
      * relabel ``Survived`` from {0, 1} to {-1, +1}.

    Parameters
    ----------
    filePath : str or path-like
        Location of the training CSV file.

    Returns
    -------
    X : numpy.ndarray, shape (n_samples, 7)
        Columns are Pclass, Sex, Age, SibSp, Parch, Fare, Embarked.
    y : numpy.ndarray, shape (n_samples, 1)
        Labels, +1 for survived and -1 otherwise.
    """
    data = pd.read_csv(filePath)
    # Cabin, Ticket and Name are not analysed.
    data = data.drop(columns=['Cabin', 'Ticket', 'Name'])
    # Missing values: Age -> median, Embarked -> most frequent value.
    # The filled column is assigned back instead of calling
    # ``data[col].fillna(..., inplace=True)``: that chained in-place form is
    # deprecated and silently does nothing under pandas Copy-on-Write.
    data['Age'] = data['Age'].fillna(data['Age'].median())
    # value_counts() is already sorted by descending frequency.
    maxEmbarked = data['Embarked'].value_counts().index[0]
    data['Embarked'] = data['Embarked'].fillna(maxEmbarked)
    # Encode the two string features as integers. Mapping produces a fresh
    # numeric column rather than writing ints into a string column.
    data['Sex'] = data['Sex'].map({'male': 1, 'female': 0}).astype(np.int64)
    data['Embarked'] = data['Embarked'].map({'C': 0, 'Q': 1, 'S': 2}).astype(np.int64)
    features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
    tag = ['Survived']
    # The perceptron uses +1 / -1 labels, so the negative class 0 becomes -1.
    data['Survived'] = data['Survived'].replace(0, -1)
    # Convert to arrays; selecting with a list keeps y two-dimensional (n, 1).
    X = np.array(data[features])
    y = np.array(data[tag])
    return X, y

In [4]:
def loadTestData(filePath):
    """Load the Titanic test CSV and convert it to a feature matrix.

    Applies the same preprocessing as the training loader, minus the label:
    drops ``Cabin``/``Ticket``/``Name``, fills missing ``Age`` and ``Fare``
    with the column median and missing ``Embarked`` with the most frequent
    port, then encodes ``Sex`` (male=1, female=0) and ``Embarked``
    (C=0, Q=1, S=2).

    Parameters
    ----------
    filePath : str or path-like
        Location of the test CSV file.

    Returns
    -------
    numpy.ndarray, shape (n_samples, 7)
        Columns are Pclass, Sex, Age, SibSp, Parch, Fare, Embarked.
    """
    data = pd.read_csv(filePath)
    data = data.drop(columns=['Cabin', 'Ticket', 'Name'])
    # Assign the filled columns back instead of the chained
    # ``data[col].fillna(..., inplace=True)``, which is deprecated and has no
    # effect under pandas Copy-on-Write.
    data['Age'] = data['Age'].fillna(data['Age'].median())
    data['Fare'] = data['Fare'].fillna(data['Fare'].median())
    # Fill missing ports too, so no NaN can leak into the feature matrix.
    if data['Embarked'].isna().any():
        data['Embarked'] = data['Embarked'].fillna(data['Embarked'].value_counts().index[0])
    # Encode the string features as integers.
    data['Sex'] = data['Sex'].map({'male': 1, 'female': 0}).astype(np.int64)
    data['Embarked'] = data['Embarked'].map({'C': 0, 'Q': 1, 'S': 2}).astype(np.int64)
    data['Fare'] = pd.to_numeric(data['Fare'])
    features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
    X = np.array(data[features])
    return X

In [5]:
# Standardization (z-score)
def norm(X_data, mu=None, sigma=None):
    """Standardize each column: subtract the mean and divide by the std.

    Parameters
    ----------
    X_data : array-like, shape (n_samples, n_features)
        Data to standardize.
    mu, sigma : array-like, optional
        Per-column mean and standard deviation to use. When omitted they are
        computed from ``X_data`` itself. Pass the training-set statistics here
        to put another data set on the same scale as the training data.

    Returns
    -------
    Same shape as ``X_data``
        The standardized data. Columns whose standard deviation is zero come
        out as ``X_data - mu`` (all zeros when ``mu`` is the column mean)
        instead of NaN/inf.
    """
    if mu is None:
        mu = X_data.mean(axis = 0)
    if sigma is None:
        sigma = X_data.std(axis = 0)
    # A constant column has std 0; dividing by it would produce NaN/inf, so
    # divide such columns by 1 instead.
    sigma = np.where(np.asarray(sigma) == 0, 1.0, sigma)
    X_norm = (X_data - mu) / sigma
    return X_norm

In [6]:
class Perceptron:
    """Binary perceptron in its original (primal) form.

    Training labels must be +1 / -1. ``predict`` returns 1 for the positive
    class and 0 for the negative class.

    Parameters
    ----------
    max_epochs : int, default 500
        Maximum number of passes of the training loop. Each pass applies at
        most one update (to the first misclassified sample), so this is also
        the maximum number of weight updates.
    lr : float, default 0.01
        Learning rate of each update.

    Attributes
    ----------
    w : numpy.ndarray of shape (1, n_features), or None before ``fit``
    b : float, or None before ``fit``
    """
    def __init__(self, max_epochs=500, lr=0.01):
        self.max_epochs = max_epochs
        self.lr = lr
        self.w = None
        self.b = None

    def fit(self, X, y):
        """Train on ``X`` / ``y`` and print the training accuracy.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
        y : array-like, shape (n_samples,) or (n_samples, 1)
            Labels in {+1, -1}.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, is empty, or its row count differs from the
            number of labels.
        """
        X = np.asarray(X, dtype=float)
        # Accept both (n,) and (n, 1) label arrays.
        y = np.asarray(y).ravel()
        if X.ndim != 2:
            raise ValueError('X must be a 2-D array of shape (n_samples, n_features)')
        if X.shape[0] == 0:
            raise ValueError('cannot fit on an empty training set')
        if X.shape[0] != y.shape[0]:
            raise ValueError('X and y must contain the same number of samples')
        # Initialize w = 0, b = 0.
        w = np.zeros((1, X.shape[1]))
        b = 0.0
        for _ in range(self.max_epochs):
            # Functional margin of every sample under the current parameters.
            margins = y * (X @ w.ravel() + b)
            wrong = np.flatnonzero(margins <= 0)
            if wrong.size == 0:
                # No misclassified sample left: converged.
                break
            # Update on the first misclassified sample only, then rescan.
            i = wrong[0]
            w = w + self.lr * y[i] * X[i]
            b = b + self.lr * y[i]
        # Count the correctly classified training samples.
        correct = int(np.count_nonzero(y * (X @ w.ravel() + b) > 0))
        self.w, self.b = w, b
        print('acc =', correct / y.shape[0])

    def predict(self, X):
        """Classify the rows of ``X``.

        Returns
        -------
        numpy.ndarray of float, shape (n_samples,)
            1.0 where ``w . x + b > 0`` and 0.0 otherwise.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if self.w is None:
            raise RuntimeError('Perceptron.predict called before fit')
        X = np.asarray(X, dtype=float)
        scores = X @ np.ravel(self.w) + self.b
        return np.where(scores > 0, 1.0, 0.0)

In [7]:
# Load and preprocess the training set, standardize the features, and train
# the perceptron. fit() prints the accuracy on the training data.
trainFilePath = '../titanic/train.csv'
X_train, y_train = loadTrainData(trainFilePath)
# From here on X_train holds the standardized features, not the raw ones.
X_train = norm(X_train)
model = Perceptron()
model.fit(X_train, y_train)

acc = 0.7115600448933782


In [8]:
# Predict on the test set and write the Kaggle submission file.
testFilePath = '../titanic/test.csv'
PassengerId = pd.read_csv(testFilePath)['PassengerId']
X_test = loadTestData(testFilePath)
# Standardize the test features with the TRAINING mean/std. The model was
# fitted on features scaled by the training statistics, so scaling the test
# set by its own statistics would feed it features on a different scale.
# X_train was overwritten with its standardized version earlier, so the raw
# training features are reloaded to recompute the statistics.
X_train_raw, _ = loadTrainData(trainFilePath)
train_mu = X_train_raw.mean(axis = 0)
train_sigma = X_train_raw.std(axis = 0)
X_test = (X_test - train_mu) / train_sigma
res = model.predict(X_test)
ans = pd.DataFrame({'PassengerId' : PassengerId, 'Survived' : res.astype(np.int32)})
saveFilePath = './result.csv'
ans.to_csv(saveFilePath, index = False)

# Kaggle accuracy recorded for the original submission: 71.531%
# (scored when the test set was still scaled with its own statistics;
# re-submit result.csv to refresh this figure).