In [1]:
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split

In [2]:
def loadTrainData(filePath):
    data = pd.read_csv(filePath)
    # 去除Cabin、Ticket、Name三列，这三列不做分析
    data.drop(columns = ['Cabin', 'Ticket', 'Name'], axis = 1, inplace = True)
    # 处理缺失值：Age使用平均值填充，Embarked使用出现最多的填充
    data['Age'].fillna(value=data['Age'].median(), inplace = True)
    maxEmbarked = data['Embarked'].value_counts().sort_values(ascending = False).index[0]
    data['Embarked'].fillna(value = maxEmbarked, inplace = True)
    # 修改male为1，female为0
    data.loc[data['Sex']=='male', 'Sex'] = 1
    data.loc[data['Sex']=='female', 'Sex'] = 0
    # 修改C为0，Q为1，S为2
    data.loc[data['Embarked']=='C', 'Embarked'] = 0
    data.loc[data['Embarked']=='Q', 'Embarked'] = 1
    data.loc[data['Embarked']=='S', 'Embarked'] = 2
    # data[['Sex','Embarked']] = data[['Sex','Embarked']].apply(pd.to_numeric)
    data[['Sex','Embarked']] = data[['Sex','Embarked']].astype(np.int64)
    features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
    tag = ['Survived']
    # 转为矩阵
    X = np.array(data[features])
    y = np.array(data[tag]).flatten()
    return X, y

In [3]:
def loadTestData(filePath):
    data = pd.read_csv(filePath)
    data.drop(columns = ['Cabin', 'Ticket', 'Name'], inplace = True)
    # print(data.isnull().sum().sort_values(ascending = False))
    data['Age'].fillna(value = data['Age'].median(), inplace = True)
    data['Fare'].fillna(value = data['Fare'].median(), inplace = True)
    data.loc[data['Sex']=='male', 'Sex'] = 1
    data.loc[data['Sex']=='female', 'Sex'] = 0
    data.loc[data['Embarked']=='C', 'Embarked'] = 0
    data.loc[data['Embarked']=='Q', 'Embarked'] = 1
    data.loc[data['Embarked']=='S', 'Embarked'] = 2
    data[['Sex','Embarked', 'Fare']] = data[['Sex','Embarked', 'Fare']].apply(pd.to_numeric)
    features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
    X = np.array(data[features])
    return X

In [4]:
# 标准化
def norm(X_data):
    mu = X_data.mean(axis = 0)
    sigma = X_data.std(axis = 0)
    X_norm = (X_data - mu) / sigma
    return X_norm

In [5]:
class Logistic_Regression:
    def __init__(self):
        pass
    def sigmoid(self, x):
        return 1 / (1 + np.exp(-x))
    def fit(self, X, y, epochs = 500, lr = 0.01):
        '''
            epochs: 迭代次数
            lr: 学习率
        '''
        self.X = np.concatenate((X,np.ones((X.shape[0],1))), axis = 1).T
        self.w = np.zeros((self.X.shape[0], 1))
        self.y = y.reshape((y.shape[0], 1))
        for _ in range(epochs):
#             for循环求和形式：
#             ls = np.zeros((self.X.shape[0], 1))
#             for i in range(self.X.shape[1]):
#                 xi = self.X.T[i].T.reshape(8,1)
#                 yi = self.y[i].reshape(1,1)
#                 ls += (yi - self.sigmoid(self.w.T.dot(xi))) * xi
#             self.w = self.w + lr * ls
#             矩阵计算形式：
#             梯度下降更新参数
            self.w = self.w + lr * (self.X * (self.y.T - self.sigmoid(self.w.T.dot(self.X)))).sum(axis = 1).reshape(8,1)
    def predict(self, X):
        X_h = np.concatenate((X,np.ones((X.shape[0],1))), axis = 1).T
        prob = self.sigmoid(self.w.T.dot(X_h)).flatten()
        y_pred = np.array(list(map(lambda x : 1 if x >= 0.5 else 0, prob)), dtype = np.int32)
        return y_pred

In [6]:
trainFilePath = '../titanic/train.csv'
X, y = loadTrainData(trainFilePath)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify = y)
X_train, X_valid = norm(X_train), norm(X_valid)


In [7]:
model = Logistic_Regression()
model.fit(X_train, y_train, epochs = 100, lr = 0.001)

In [8]:
y_pred = model.predict(X_valid)
acc = (y_pred==y_valid).sum() / y_valid.shape[0]
print('acc = %f.' % (acc))

acc = 0.793296.


In [9]:
testFilePath = '../titanic/test.csv'
PassengerId = pd.read_csv(testFilePath)['PassengerId']
X_test = loadTestData(testFilePath)
X_test = norm(X_test)
y_pred = model.predict(X_test)
ans = pd.DataFrame({'PassengerId' : PassengerId, 'Survived' : y_pred.astype(np.int32)})
saveFilePath = './result.csv'
ans.to_csv(saveFilePath, index = False)

# kaggle 测试集准确率 77.272%