In [3]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn import metrics

class titanic():
    def __init__(self, train_path, test_path):
        self.train = pd.read_csv(train_path)
        self.test = pd.read_csv(test_path)
        self.train_x, self.test_x, self.train_y, self.test_y, self.predict_x = self.data_clean()
        self.result = self.training()
        self.evaluation()
        self.predict()

    def data_clean(self):
        # 使用中位数填充训练集和测试集的‘Age'列
        self.train['Age'].fillna(self.train['Age'].median(), inplace = True)
        self.test['Age'].fillna(self.test['Age'].median(), inplace = True)
        # 使用中位数填充测试集的'Fare'列
        self.test['Fare'].fillna(self.test['Fare'].median(), inplace = True)
        # 使用出现最多的港口'S'来填充训练集的'Embarked'列
        self.train['Embarked'].fillna('S', inplace = True)
        # 删除训练集和测试集中数据量过少的‘Cabin'列
        self.train.drop(['Cabin'], axis = 1, inplace = True)
        self.test.drop(['Cabin'], axis = 1, inplace = True)
        # 定义模型特征
        features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
        # 定义训练集特征数据
        train_data = self.train[features]
        # 定义训练集标签
        train_target = self.train['Survived']
        # 定义预测数据特征
        predict_data = self.test[features]
        # 提取特征,产生非稀疏矩阵
        vec = DictVectorizer(sparse = False)
        # 将训练集数据与预测数据进行预处理，转化为特征向量
        train_data = vec.fit_transform(train_data.to_dict(orient = 'record'))
        predict_data = vec.fit_transform(predict_data.to_dict(orient = 'record'))
        # 将训练集数据再次细分为训练集和测试集，此处并不包括预测集，细分目的是为了便于评估模型结果
        train_x, test_x, train_y, test_y = train_test_split(train_data, train_target, test_size = 0.25)
        # Z-score标准化训练集、测试集以及预测集
        sd = preprocessing.StandardScaler()
        train_sd_x = sd.fit_transform(train_x)
        test_sd_x = sd.transform(test_x)
        predict_sd_x = sd.transform(predict_data)
        return train_sd_x, test_sd_x, train_y, test_y, predict_sd_x

    def training(self):
        # 初始化训练结果
        result_all = []
        # 使用CART决策树算法
        from sklearn.tree import DecisionTreeClassifier
        dt = DecisionTreeClassifier()
        dt.fit(self.train_x, self.train_y)
        result_dt = dt.predict(self.test_x)
        result_all.append(result_dt)
        # 使用逻辑回归算法
        from sklearn.linear_model import LogisticRegression
        lr = LogisticRegression(solver = 'lbfgs')
        lr.fit(self.train_x, self.train_y)
        result_lr = lr.predict(self.test_x)
        result_all.append(result_lr)
        # 使用LDA算法
        from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
        lda = LinearDiscriminantAnalysis()
        lda.fit(self.train_x, self.train_y)
        result_lda = lda.predict(self.test_x)
        result_all.append(result_lda)
        # 使用朴素贝叶斯算法
        from sklearn.naive_bayes import BernoulliNB
        nb = BernoulliNB()
        nb.fit(self.train_x, self.train_y)
        result_nb = nb.predict(self.test_x)
        result_all.append(result_nb)
        # 使用SVM算法
        from sklearn import svm
        svec = svm.SVC()
        svec.fit(self.train_x, self.train_y)
        result_svec = svec.predict(self.test_x)
        result_all.append(result_svec)
        # 使用KNN算法
        from sklearn.neighbors import KNeighborsClassifier
        knn = KNeighborsClassifier()
        knn.fit(self.train_x, self.train_y)
        result_knn = knn.predict(self.test_x)
        result_all.append(result_knn)
        # 使用AdaBoost算法
        from sklearn.ensemble import AdaBoostClassifier
        ada = AdaBoostClassifier()
        ada.fit(self.train_x, self.train_y)
        result_ada = ada.predict(self.test_x)
        result_all.append(result_ada)
        # 使用XGBoost算法
        from xgboost import XGBClassifier
        xg = XGBClassifier()
        xg.fit(self.train_x, self.train_y)
        result_xg = xg.predict(self.test_x)
        result_all.append(result_xg)
        # 返回所有算法训练结果
        return result_all

    def evaluation(self):
        # 初始化评估结果
        all = []
        # 评估上述每种算法的指标，将其放入dict中
        for i in self.result:
            eval = {}
            eval['accuracy'] = metrics.accuracy_score(i, self.test_y) * 100
            eval['precision'] = metrics.precision_score(i, self.test_y) * 100
            eval['recall'] = metrics.recall_score(i, self.test_y) * 100
            eval['F1'] = metrics.f1_score(i, self.test_y) * 100
            # 将算法评估指标的dict放入评估结果list
            all.append(eval)
        # 将评估结果list转化为DataFrame展示
        print (pd.DataFrame(all, index=['DecisionTree', 'LogisticRegression', 'LDA', 'NaiveBayes', 'SVM', 'KNN', 'AdaBoost', 'XGBoost']))

    def predict(self):
        # 使用TPOT预测数据
        from tpot import TPOTClassifier
        tp = TPOTClassifier(generations = 5, population_size = 20, verbosity = 2)
        tp.fit(self.train_x, self.train_y)
        result_tp = tp.predict(self.predict_x)
        # 将预测结果添加至预测集数据
        self.test['Survived'] = result_tp
        # 输出预测结果
        self.test['Survived'].replace([0, 1], ['遇难', '幸存'], inplace = True)
        predict_result = self.test
        print (predict_result)

if __name__ == '__main__':
    titanic = titanic('C:/Users/Administrator/Desktop/RS/L2Data/titan/train.csv', 
                      'C:/Users/Administrator/Desktop/RS/L2Data/titan/test.csv')



                     accuracy  precision     recall         F1
DecisionTree        74.887892  66.666667  65.060241  65.853659
LogisticRegression  82.062780  72.839506  76.623377  74.683544
LDA                 81.165919  71.604938  75.324675  73.417722
NaiveBayes          80.269058  74.074074  72.289157  73.170732
SVM                 84.304933  69.135802  84.848485  76.190476
KNN                 81.165919  70.370370  76.000000  73.076923
AdaBoost            82.511211  74.074074  76.923077  75.471698
XGBoost             81.614350  67.901235  78.571429  72.847682


HBox(children=(IntProgress(value=0, description='Optimization Progress', max=120, style=ProgressStyle(descript…

Generation 1 - Current best internal CV score: 0.8158861498688678
Generation 2 - Current best internal CV score: 0.8158861498688678
Generation 3 - Current best internal CV score: 0.8158861498688678
Generation 4 - Current best internal CV score: 0.8158861498688678
Generation 5 - Current best internal CV score: 0.8218454095437598

Best pipeline: XGBClassifier(input_matrix, learning_rate=0.1, max_depth=7, min_child_weight=2, n_estimators=100, nthread=1, subsample=0.7500000000000001)
     PassengerId  Pclass                                          Name  \
0            892       3                              Kelly, Mr. James   
1            893       3              Wilkes, Mrs. James (Ellen Needs)   
2            894       2                     Myles, Mr. Thomas Francis   
3            895       3                              Wirz, Mr. Albert   
4            896       3  Hirvonen, Mrs. Alexander (Helga E Lindqvist)   
..           ...     ...                                           ... 