In [None]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()
import numpy as np
import re

from sklearn import tree
from sklearn.linear_model import LogisticRegression

### 將檔案讀入為Pandas DataFrame

In [None]:
# Load the Titanic CSV files; both paths are relative to the notebook's working directory.
dfTrain=pd.read_csv("../datasets/titanic/titanic_train.csv") # training data
dfTest=pd.read_csv("../datasets/titanic/titanic_test.csv")   # test data

In [None]:
# Summary of the training frame (column dtypes and non-null counts).
dfTrain.info()

### 看每個欄位是否有重複值？

In [None]:
# (number of rows, number of columns) of the training frame.
dfTrain.shape

In [None]:
# Number of distinct values in every column; NaN is counted as one value
# because Series.unique() keeps it.
dfTrain.apply(lambda col: len(col.unique()))

In [None]:
# Distinct values that occur in the Pclass column.
set(dfTrain["Pclass"])

In [None]:
# Shape shown again: shape[0] (the row count) is the denominator of the ratio plotted in the next cell.
dfTrain.shape

In [None]:
# Distinct-value count of each column divided by the row count, as a bar chart.
(
    dfTrain.apply(lambda col: len(col.unique()))
    / len(dfTrain)
).plot(rot=45, kind='bar')

* 上圖中，若欄位所對應的y值小，則代表該欄位的值有高度重複的現象。也就是說，該欄位可能為類別型變數。而若y值=1,則代表該欄位無重複值，有可能為索引或是連續型變數。

### 看欄位是否有空值？

In [None]:
# Per column: True when the column contains at least one missing value.
dfTrain.isnull().any()

In [None]:
# Bar chart of the number of missing values in each column.
(
    dfTrain
    .isnull()
    .sum()
    .plot(title='number of missing values', kind='bar', rot=45)
)

由上圖得知，Age, Cabin和Embarked這三個欄位含有空值。

In [None]:
# Row-by-column map of missing cells: 1 where the value is missing, 0 otherwise.
cmap=sns.light_palette("navy")  # reverse defaults to False
sns.heatmap(
    dfTrain.isnull().astype(np.int8),
    cmap=cmap,
    yticklabels=False,
)

### 探究：性別(Sex), 艙等(Pclass)和年齡(Age)，是否會影響生還與否(Survived)？

In [None]:
def trans(x):
    """Bucket an age into a coarse label.

    Parameters
    ----------
    x : number
        Age value; may be NaN.

    Returns
    -------
    str or float
        "children" when ``x <= 12``, "non_children" when ``x > 12``, and NaN
        when neither comparison holds (which is the case for a NaN input).
    """
    if x<=12:
        return "children"
    if x>12:
        return "non_children"
    # Both comparisons are False only for NaN, so the missing value is passed
    # through. np.nan is used because the np.NaN alias was removed in NumPy 2.0.
    return np.nan

# Add the AgeInfo label column to both the training and the test frame.
for frame in (dfTrain, dfTest):
    frame["AgeInfo"]=frame["Age"].apply(trans)

In [None]:
# Survival statistics for every (Pclass, Sex) combination.
# String aliases replace np.mean/np.std/np.sum because pandas >= 2.1 deprecates
# passing NumPy reducers to agg(); the resulting columns keep the names
# "mean", "std" and "sum". `len` stays a callable so its column is still "len".
dfTmp=(
    dfTrain.groupby(["Pclass","Sex"])["Survived"]
    .agg(["mean","std","sum",len])
    .reset_index()
)
dfTmp

In [None]:
# One panel per passenger class, each showing the mean of Survived by sex.
fig,axes=plt.subplots(nrows=1,ncols=3,sharey=True,figsize=(10,3))
groups=dfTmp.groupby("Pclass")

for idx,(name,group) in enumerate(groups):
    ax=axes[idx]
    ax.bar(x=group["Sex"],
           height=group["mean"],
           color=["darkgreen","darkblue"])
    ax.set_title("Pclass = %i"%name)

* 無論何種艙等，女性生還率皆高於男性至少一倍以上。

利用Seaborn，可簡單的執行一行指令即得到上圖：

In [None]:
# Mean of Survived by sex, one column of panels per passenger class.
sns.catplot(
    x="Sex",
    y="Survived",
    col="Pclass",
    kind="bar",
    data=dfTrain,
)

In [None]:
# Same figure as the previous cell, with the FacetGrid kept in `g`.
g=sns.catplot(
    x="Sex",
    y="Survived",
    col="Pclass",
    kind="bar",
    data=dfTrain,
)

In [None]:
# Mean of Survived by sex and AgeInfo label, one column of panels per passenger class.
g=sns.catplot(
    x="Sex",
    y="Survived",
    hue="AgeInfo",
    col="Pclass",
    kind="bar",
    data=dfTrain,
)

In [None]:
# Passenger counts per class, split by sex.
# `x` is passed by keyword: seaborn >= 0.12 accepts only `data` positionally,
# so a positional "Pclass" would be bound to `data` and raise a TypeError.
g=sns.countplot(x="Pclass",hue="Sex",data=dfTrain)

In [None]:
# Passenger counts per class, split by the AgeInfo label.
# `x` is passed by keyword: seaborn >= 0.12 accepts only `data` positionally,
# so a positional "Pclass" would be bound to `data` and raise a TypeError.
g=sns.countplot(x="Pclass",hue="AgeInfo",data=dfTrain)

In [None]:
# famSize = SibSp + Parch on both frames (the sum has no +1 for the passenger).
for frame in (dfTrain, dfTest):
    frame["famSize"]=frame["SibSp"]+frame["Parch"]

In [None]:
# Passenger counts per class, split by famSize.
# `x` is passed by keyword: seaborn >= 0.12 accepts only `data` positionally,
# so a positional "Pclass" would be bound to `data` and raise a TypeError.
g=sns.countplot(x="Pclass",hue="famSize",data=dfTrain)

* 三等艙獨自旅行的人最多；與其他艙等相比，三等艙也有較多人數較多的家庭。

In [None]:
# Number of non-null Survived entries for each famSize value, as a bar chart.
ax=(
    dfTrain[["famSize","Survived"]]
    .groupby("famSize")
    .count()
    .plot(kind="bar")
)

* 單身一人，沒有家庭的人佔大多數。有超過兩個親人的人不多。

In [None]:
# Mean of Survived for each famSize value, drawn without error bars.
# NOTE(review): seaborn >= 0.12 deprecates `ci` in favour of `errorbar=None`;
# `ci=None` is kept so the cell still runs on older releases.
g=sns.catplot(
    data=dfTrain,
    x="famSize",
    y="Survived",
    kind="bar",
    ci=None,
)
g.set_ylabels("Survival Rate").set_xlabels("Family Size")

* 小家庭(1-3人)較容易生還。

In [None]:
# Mean of Survived for each famSize value, split by sex, without error bars.
# NOTE(review): seaborn >= 0.12 deprecates `ci` in favour of `errorbar=None`;
# `ci=None` is kept so the cell still runs on older releases.
g=sns.catplot(
    data=dfTrain,
    x="famSize",
    y="Survived",
    hue='Sex',
    kind="bar",
    ci=None,
)
g.set_ylabels("Survival Rate").set_xlabels("Family Size")

* 同行親屬數（famSize）$\leq 3$ 時，男性生還率隨同行親屬數增加而上升。

In [None]:
# Mean of Survived for each famSize value, split by AgeInfo label, using only
# rows where famSize, Survived and AgeInfo are all present.
# NOTE(review): seaborn >= 0.12 deprecates `ci` in favour of `errorbar=None`;
# `ci=None` is kept so the cell still runs on older releases.
g=sns.catplot(
    data=dfTrain[["famSize","Survived","AgeInfo"]].dropna(how="any"),
    x="famSize",
    y="Survived",
    hue='AgeInfo',
    kind="bar",
    ci=None,
)
g.set_ylabels("Survival Rate").set_xlabels("Family Size")

* 小孩生還率較非小孩高。

    但家庭太大則不一定。不過，家庭大時，小孩樣本數很少，所以也許沒有參考性。

In [None]:
# Row counts per famSize value, split by AgeInfo label (rows missing either are dropped).
# `x` is passed by keyword: seaborn >= 0.12 accepts only `data` positionally,
# so a positional "famSize" would be bound to `data` and raise a TypeError.
g=sns.countplot(x="famSize",hue='AgeInfo',
                data=dfTrain[["famSize","AgeInfo"]].dropna(how="any"))

---

### 座位(Cabin)

In [None]:
# Total number of Cabin entries and how many of them are missing.
cabin=dfTrain["Cabin"]
print("座艙資料筆數=\t", len(cabin))
print("座艙空值數=\t", cabin.isnull().sum())

In [None]:
# Distinct raw Cabin values.
dfTrain["Cabin"].unique()

座位號碼太多，目前我想要只保留字母。其實也許數字大小也有用，之後或可考慮利用數字大小。

In [None]:
def extractCabinLabel(name):
    """Return the first ASCII letter found in a cabin string.

    Parameters
    ----------
    name : str or missing value
        Raw cabin value such as "C85". Anything that is not a string
        (e.g. NaN for a missing cabin) is treated as missing.

    Returns
    -------
    str or float
        The first character in A-Z / a-z that occurs in ``name``, or NaN when
        ``name`` is not a string or contains no letter.
    """
    # Missing cabins arrive as non-string values; handle them explicitly rather
    # than relying on a bare `except` that would also hide unrelated errors.
    if not isinstance(name, str):
        return np.nan
    # "[A-z]" also matches the six punctuation characters between "Z" and "a"
    # ([ \ ] ^ _ `), so the class is spelled out as letters only.
    matched=re.search(r"[A-Za-z]",name)
    # np.nan is used because the np.NaN alias was removed in NumPy 2.0.
    return matched.group(0) if matched else np.nan

# Overwrite the raw Cabin column of dfTrain with the extracted label
# (this line transforms dfTrain only).
dfTrain["Cabin"]=dfTrain["Cabin"].apply(extractCabinLabel)

In [None]:
# Distinct values of Cabin (after label extraction) and of Embarked.
for column in ("Cabin","Embarked"):
    print(dfTrain[column].unique())

In [None]:
# For each Embarked value, print how many rows have no Cabin label.
groups=dfTrain[["Cabin","Embarked"]].groupby("Embarked")
for port,rows in groups:
    missingCabins=rows["Cabin"].isnull().sum()
    print(port,missingCabins)

* 很多從S港口登船的乘客缺少Cabin資料，因此無法得知他們的座位。

### 探究座位(Cabin)是否影響生還與否(Survived)

我們要問的是，座位是否為影響生還率的因子(factor)之一。故以下使用Seaborn內建的sns.catplot()（舊版名稱為sns.factorplot()）來探究：

In [None]:
# Violin plot of Survived for each cabin label, using rows where both values are present.
sns.catplot(
    data=dfTrain[["Cabin","Survived"]].dropna(),  # how="any" is the default
    x="Cabin",
    y="Survived",
    order=list("ABCDEFGT"),
    kind="violin",
)

我們直接來計算每個座位區的生存率：

In [None]:
# Mean of Survived per cabin label, as a bar chart.
(
    dfTrain[["Cabin","Survived"]]
    .dropna(how="any")
    .groupby("Cabin")
    .mean()
    .plot(rot=0,kind="bar")
)

In [None]:
# Violin plot of Survived per cabin label, split by passenger class.
# `height` is used instead of `size`: seaborn renamed the keyword in 0.9 and
# `size` is only a deprecated alias there.
g=sns.catplot(x="Cabin",y="Survived",hue="Pclass",
              data=dfTrain[["Cabin","Survived","Pclass"]].dropna(how="any"),
              kind="violin",
              order=["A","B","C","D","E","F","G","T"],height=5,aspect=2)
g.fig.suptitle("Survived v.s. Cabin")
# Leave room above the axes so the figure title does not overlap them.
g.fig.subplots_adjust(top=0.9)

* 由上圖可見，座位順序由A至G移動時，艙等等級隨之下降。

---

### 探究生還與否(Survived)和其他連續變數的相依性(correlation)

In [None]:
# Pairwise correlation of the numeric/bool columns only. dfTrain also holds
# string columns (for example AgeInfo and the Cabin label), and DataFrame.corr()
# in pandas >= 2.0 raises on those instead of silently skipping them.
corDf=dfTrain.select_dtypes(include=["number","bool"]).corr()
corDf["Survived"]

In [None]:
# Absolute correlation with Survived, strongest first.
np.abs(corDf["Survived"]).sort_values(ascending=False)

* 由上表可見，數值型變數中，與Survived較為相關的變數有Pclass（有序類別，以數值編碼）和Fare。

Correlation可畫成熱圖：

In [None]:
# Annotated heatmap of the correlation matrix; colours saturate at 0.8.
plt.figure(figsize=(10, 10))

g=sns.heatmap(
    corDf,
    annot=True,
    square=True,
    cmap='YlGnBu',
    vmax=.8,
    linewidths=0.01,
    linecolor="white",
)
plt.title('Correlation between features');

---