# 泰坦尼克号生存预测
* 通过观察数据特征，预测哪些人可以生存下来

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets
%matplotlib inline

## 1、观察数据集

In [3]:
data_train = pd.read_csv("data/titanic/train.csv")
# 打印信息
data_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 66.2+ KB


In [4]:
# 数据缺失程度
data_train.groupby('Survived').count()

Unnamed: 0_level_0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,549,549,549,549,424,549,549,549,549,68,549
1,342,342,342,342,290,342,342,342,342,136,340


In [6]:
data_train.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## 1.1、处理数据缺失问题
* 处理数值型:
      * 扔掉缺失数据
      * 按某个统计量(平均值、中位数、定值等)补全
      * 拿模型预测缺失值
* 处理类别/文本型:
      * 做转换处理

In [34]:
# 补全Age列缺失问题
def set_missing_ages(p_df):
    p_df.loc[(p_df.Age.isnull()),'Age'] =  p_df.Age.dropna().mean()
    return p_df

df = set_missing_ages(data_train)

In [40]:
# 归一化数据 年龄和费用一列
# 定义去均值和归一化函数
def meanAndStd(arr):
    return (arr-np.mean(arr))/np.std(arr)

import sklearn.preprocessing as preprocessing
scaler = preprocessing.StandardScaler()
scaler.fit_transform()
# ['Age']-pd['Age'].std()
# data_train['Fare_scaled'] = scaler.fit_transform(data_train['Fare'])

array([[-1.],
       [ 1.]])

In [47]:
# 处理类别/文本型
df.loc[(pd.Cabin.notnull()),'Cabin'] = 'Yes'
df.loc[(pd.Cabin.isnull()),'Cabin'] = 'No'

train_df = df.filter(regex='Survived|Age*|SibSp|Parch|Fare')
train_df

Unnamed: 0,Survived,Age,SibSp,Parch,Fare
0,0,22.000000,1,0,7.2500
1,1,38.000000,1,0,71.2833
2,1,26.000000,0,0,7.9250
3,1,35.000000,1,0,53.1000
4,0,35.000000,0,0,8.0500
5,0,29.699118,0,0,8.4583
6,0,54.000000,0,0,51.8625
7,0,2.000000,3,1,21.0750
8,1,27.000000,0,2,11.1333
9,1,14.000000,1,0,30.0708


## 2、使用模型

In [51]:
Y = train_df.Survived
X = train_df[['Age','SibSp','Parch','Fare']]

from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(C=1000.0, random_state=0)
lr.fit(X, Y)

LogisticRegression(C=1000.0, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=0,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

## 3、预测

In [54]:
lr.predict([[22.000000,1,0,7.2500],[38.000000,1,0,71.2833],[58.000000,0,0,26.5500]])

array([0, 0, 0], dtype=int64)