In [147]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [148]:
# import dat
path = './data/titanic.csv'
data = pd.read_csv(path)

In [149]:
# data.head()
# data.info()
# data.descrive()

##  第一步：数据清理

In [150]:
# convert male to 0, female to 1
data['sex'] = data['sex'].apply(lambda x: 1 if x=='female' else 0)

# age column fill nan with median()
data['age'].fillna(data['age'].median(), inplace = True)

# add family columns
data['family'] = data['sibsp'] + data['parch']

# fare column fill nan with median()
data['fare'].fillna(data['fare'].median(), inplace = True)

# extract necessary features, ignoring unnucessary features
features = ['survived','pclass', 'sex', 'age', 'family', 'fare']
data = data[features]

x_train, x_test, y_train, y_test = train_test_split(data.drop(columns=["survived"]), data['survived'], test_size=0.3, shuffle=True)

In [151]:
x_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 393 entries, 67 to 546
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   pclass  393 non-null    int64  
 1   sex     393 non-null    int64  
 2   age     393 non-null    float64
 3   family  393 non-null    int64  
 4   fare    393 non-null    float64
dtypes: float64(2), int64(3)
memory usage: 18.4 KB


## 模型一：逻辑回归模型

In [152]:
from sklearn.linear_model import LogisticRegression

logistic_model = LogisticRegression()
logistic_model.fit(x_train, y_train)
predictions = logistic_model.predict(x_test)

In [153]:
true_predict, false_predict = (predictions == y_test).value_counts()
print(f'Logistic Model Accuracy: {true_predict/len(predictions)}')

Logistic Model Accuracy: 0.8040712468193384


## 模型二：支持向量机

In [162]:
from sklearn.svm import SVC

svm_model = SVC(degree=5)
svm_model.fit(x_train, y_train)
predictions = svm_model.predict(x_test)

In [163]:
true_predict, false_predict = (predictions == y_test).value_counts()
print(f'SVM Model Accuracy: {true_predict/len(predictions)}')

SVM Model Accuracy: 0.6641221374045801


## 模型三：决策树模型

In [156]:
from sklearn.tree import DecisionTreeClassifier

decision_tree_model = DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=1)
decision_tree_model.fit(x_train, y_train)
predictions = decision_tree_model.predict(x_test)

In [157]:
true_predict, false_predict = (predictions == y_test).value_counts()
print(f'Decision Tree Model Accuracy: {true_predict/len(predictions)}')

Decision Tree Model Accuracy: 0.811704834605598


## 模型四：随机森林模型

In [158]:
from sklearn.ensemble import RandomForestClassifier

random_forest_model = RandomForestClassifier(criterion='entropy', n_estimators=10, max_depth=5, random_state=1)
random_forest_model.fit(x_train, y_train)
predictions = random_forest_model.predict(x_test)

In [159]:
true_predict, false_predict = (predictions == y_test).value_counts()
print(f'Random Forest Model Accuracy: {true_predict/len(predictions)}')

Random Forest Model Accuracy: 0.8193384223918575


### 结论: 不同模型的预测准确率在65%-81%之间，，除了支持向量机 其余三个模型都可以达到80%的准确率