In [1]:
# pandas
import pandas as pd
from pandas import Series, DataFrame

In [2]:
# numpy, matplotlib, seaborn
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline

In [3]:
# machine learning
from sklearn.linear_model import LogisticRegression

In [4]:
train_df = pd.read_csv("./train.csv")
test_df = pd.read_csv("./test.csv")

In [5]:
# what does the data look like
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
test_df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [7]:
# Variable     Definition            Key
# survival     Survival              0 = No, 1 = Yes
# pclass       Ticket class          1 = 1st, 2 = 2nd, 3 = 3rd
# sex          Sex
# Age          Age in years
# sibsp        # of siblings / spouses aboard the Titanic
# parch        # of parents / children aboard the Titanic
# ticket       Ticket number
# fare         Passenger fare
# cabin        Cabin number
# embarked     Port of Embarkation   C = Cherbourg, Q = Queenstown, S = Southampton

In [8]:
# 移除那些我认为和预测没有关系的column
# embarked 从哪里登船我认为和预测是否生是无关的
train_df = train_df.drop(['Embarked'], axis=1)
test_df = test_df.drop(['Embarked'], axis=1)
# ticket number 票根的序列号
train_df = train_df.drop(['Ticket'], axis=1)
test_df = test_df.drop(['Ticket'], axis=1)

In [9]:
# 查看每个列中是否存在空值
train_df.isnull().any()

PassengerId    False
Survived       False
Pclass         False
Name           False
Sex            False
Age             True
SibSp          False
Parch          False
Fare           False
Cabin           True
dtype: bool

In [10]:
test_df.isnull().any()

PassengerId    False
Pclass         False
Name           False
Sex            False
Age             True
SibSp          False
Parch          False
Fare            True
Cabin           True
dtype: bool

# 对空值的处理

In [11]:
# age
# 如果存在空值，查看空值行占该列的比例
# age for training DataFrame
train_df[train_df['Age'].isnull().values == True].shape[0] / train_df.shape[0]

0.19865319865319866

In [12]:
# age fro test DataFrame
test_df[test_df['Age'].isnull().values == True].shape[0] / test_df.shape[0]

0.20574162679425836

In [13]:
# 由于age的空值在train_df和test_df中的比例都不是很高，所以我决定以随机值来填充age，使这一列没有空值
average_age_train = train_df["Age"].mean()
std_age_train = train_df["Age"].std()
count_nan_age_train = train_df["Age"].isnull().sum()
average_age_test    = test_df["Age"].mean()
std_age_test        = test_df["Age"].std()
count_nan_age_test  = test_df["Age"].isnull().sum()
rand_train = np.random.randint(average_age_train - std_age_train, average_age_train + std_age_train, 
                              size=count_nan_age_train)
rand_test = np.random.randint(average_age_test - std_age_test, average_age_test + std_age_test, 
                             size=count_nan_age_test)

In [14]:
train_df["Age"][np.isnan(train_df["Age"])] = rand_train
test_df["Age"][np.isnan(test_df["Age"])] = rand_test

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [15]:
# 这里比较填充之前和填充之后的年龄分布的过程被我省略，正常情况下应该去比较，如果分布类似，证明填充是有效的

In [None]:
# Cabin
#