In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# データの読み込み, 表示

In [2]:
train = pd.read_csv("../input/titanic/train.csv")
test = pd.read_csv("../input/titanic/test.csv")

In [3]:
train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [4]:
test

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


# データの準備, 連結

### 訓練データとテストデータをまとめる

In [5]:
df = pd.concat([train, test], sort=False, axis="rows")

In [6]:
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


# 欠損値を確認

In [7]:
df.isnull().sum()

PassengerId       0
Survived        418
Pclass            0
Name              0
Sex               0
Age             263
SibSp             0
Parch             0
Ticket            0
Fare              1
Cabin          1014
Embarked          2
dtype: int64

### 生存率, 年齢, 客室番号, 出港地 に欠損値が存在する

# 型の確認

In [8]:
df.dtypes

PassengerId      int64
Survived       float64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

### 名前, 性別, チケット, 客室番号, 出港地 は数値型へ変換する

# 名前から敬称を取り出し変換, 特徴量として扱う

In [9]:
df["Salutation"] = df["Name"].str.extract("([A-Za-z]+)\.", expand=False)                                  # pandas の文字列を正規表現で分割

df["Salutation"].replace(["Mme", "Ms"], "Mrs", inplace=True)                                              # Mrs に変換
df["Salutation"].replace("Mlle", "Miss", inplace=True)                                                    # Miss に変換
df["Salutation"].replace(["Capt", "Col", "Dr", "Major", "Rev"], "Officer", inplace=True)                  # Officer カテゴリにまとめる
df["Salutation"].replace(["Countess", "Don", "Dona", "Jonkheer", "Lady", "Sir"], "Royalty", inplace=True) # Royalty カテゴリにまとめる

# 名前の型変換

In [10]:
df.loc[df["Name"].str.contains("Mr. ") == True, "Name"] = 0
df.loc[df["Name"].str.contains("Miss. ") == True, "Name"] = 1
df.loc[df["Name"].str.contains("Mrs. ") == True, "Name"] = 2
df.loc[df["Name"].str.contains("Master. ") == True, "Name"] = 3
df.loc[df["Name"].str.contains("Dr. ") == True, "Name"] = 3
df.loc[df["Name"].str.contains("Rev. ") == True, "Name"] = 4
df.loc[df["Name"].str.contains("Col. ") == True, "Name"] = 5
df.loc[df["Name"].str.contains("Major. ") == True, "Name"] = 6
df.loc[df["Name"].str.contains("Jonkheer. ") == True, "Name"] = 7
df.loc[df["Name"].str.contains("Mme. ") == True, "Name"] = 8
df.loc[df["Name"].str.contains("Capt. ") == True, "Name"] = 9
df.loc[df["Name"].str.contains("Ms. ") == True, "Name"] = 10
df.loc[df["Name"].str.contains("Mlle. ") == True, "Name"] = 11
df.loc[df["Name"].str.contains("Don. ") == True, "Name"] = 12
df.loc[df["Name"].str.contains("Countess. ") == True, "Name"] = 13
df.loc[df["Name"].str.contains("Sir. ") == True, "Name"] = 14
df.loc[df["Name"].str.contains("Dona. ") == True, "Name"] = 15

df["Name"].value_counts()

0     761
1     258
2     197
3      69
4       8
5       4
10      2
6       2
15      1
14      1
13      1
12      1
11      1
9       1
8       1
7       1
Name: Name, dtype: int64

# 性別の型変換

In [11]:
df["Sex"] = df["Sex"].replace({"male":0, "female":1})

df["Sex"].value_counts()

0    843
1    466
Name: Sex, dtype: int64

# チケットの型変換

### 複雑なため, カラムを削除する方針とする.

# 客室番号の型変換

### 欠損値は0で補完

In [12]:
df["Cabin"] = df["Cabin"].fillna(0)
df.loc[df["Cabin"].str.contains("C") == True, "Cabin"] = 1
df.loc[df["Cabin"].str.contains("B") == True, "Cabin"] = 2
df.loc[df["Cabin"].str.contains("D") == True, "Cabin"] = 3
df.loc[df["Cabin"].str.contains("E") == True, "Cabin"] = 4
df.loc[df["Cabin"].str.contains("A") == True, "Cabin"] = 5
df.loc[df["Cabin"].str.contains("F") == True, "Cabin"] = 6
df.loc[df["Cabin"].str.contains("G") == True, "Cabin"] = 7
df.loc[df["Cabin"].str.contains("T") == True, "Cabin"] = 8

df["Cabin"].value_counts()

0    1014
1      94
2      65
3      46
4      44
5      22
6      18
7       5
8       1
Name: Cabin, dtype: int64

# 出港地の型変換

### 欠損値は最頻値 S で補完

In [13]:
df["Embarked"] = df["Embarked"].fillna("S")

df["Embarked"].value_counts()

S    916
C    270
Q    123
Name: Embarked, dtype: int64

In [14]:
df.loc[df["Embarked"] == "S", "Embarked"] = 0
df.loc[df["Embarked"] == "C", "Embarked"] = 1
df.loc[df["Embarked"] == "Q", "Embarked"] = 2

df["Embarked"].value_counts()

0    916
1    270
2    123
Name: Embarked, dtype: int64

# 年齢の欠損値を補完

### 年齢予測用モデルを作成し, 欠損しているデータの年齢をランダムフォレストにより予測

In [15]:
df_age = df.loc[:, ["Age", "Sex", "SibSp", "Parch", "Salutation"]]
df_age = pd.get_dummies(df_age, columns=["Sex", "Salutation"])

df_age_notnull = df_age[df_age["Age"].notnull()] # 学習データ
df_age_null = df_age[df_age["Age"].isnull()]     # 欠損値補完が必要な学習データ

X = df_age_notnull.iloc[:, 1:]                   # 説明変数
y = df_age_notnull.iloc[:, 0]                    # 目的変数

pipeline = Pipeline([("scl", StandardScaler()),  # 標準化を Pipeline により置き換え
                    ("est", RandomForestRegressor(random_state=0))])
pipeline.fit(X, y)
age_predicted = pipeline.predict(df_age_null.iloc[:, 1:])

df.loc[(df["Age"].isnull()), "Age"] = age_predicted

# 料金の欠損値を補完

### 欠損値は0で補完

In [16]:
df["Fare"] = df["Fare"].fillna(0)

# 訓練データとテストデータを分割

In [17]:
train = df.iloc[:891]
train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Salutation
0,1,0.0,3,0,0,22.000000,1,0,A/5 21171,7.2500,0,0,Mr
1,2,1.0,1,2,1,38.000000,1,0,PC 17599,71.2833,1,1,Mrs
2,3,1.0,3,1,1,26.000000,0,0,STON/O2. 3101282,7.9250,0,0,Miss
3,4,1.0,1,2,1,35.000000,1,0,113803,53.1000,1,0,Mrs
4,5,0.0,3,0,0,35.000000,0,0,373450,8.0500,0,0,Mr
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0.0,2,4,0,27.000000,0,0,211536,13.0000,0,0,Officer
887,888,1.0,1,1,1,19.000000,0,0,112053,30.0000,2,0,Miss
888,889,0.0,3,1,1,7.960018,1,2,W./C. 6607,23.4500,0,0,Miss
889,890,1.0,1,0,0,26.000000,0,0,111369,30.0000,1,1,Mr


In [18]:
X_test = df.iloc[891:]
X_test = X_test.drop(["Survived", "Ticket", "Salutation"], axis="columns")
X_test

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,892,3,0,0,34.500000,0,0,7.8292,0,2
1,893,3,2,1,47.000000,1,0,7.0000,0,0
2,894,2,0,0,62.000000,0,0,9.6875,0,2
3,895,3,0,0,27.000000,0,0,8.6625,0,0
4,896,3,2,1,22.000000,1,1,12.2875,0,0
...,...,...,...,...,...,...,...,...,...,...
413,1305,3,0,0,31.639415,0,0,8.0500,0,0
414,1306,1,15,1,39.000000,0,0,108.9000,1,1
415,1307,3,0,0,38.500000,0,0,7.2500,0,0
416,1308,3,0,0,31.639415,0,0,8.0500,0,0


# 説明変数, 目的変数を定義

In [19]:
X_train = train.drop(["Survived", "Ticket", "Salutation"], axis="columns")
y_train = train["Survived"].astype(int)

print(X_train.head())
print(y_train.head())

   PassengerId  Pclass Name  Sex   Age  SibSp  Parch     Fare Cabin Embarked
0            1       3    0    0  22.0      1      0   7.2500     0        0
1            2       1    2    1  38.0      1      0  71.2833     1        1
2            3       3    1    1  26.0      0      0   7.9250     0        0
3            4       1    2    1  35.0      1      0  53.1000     1        0
4            5       3    0    0  35.0      0      0   8.0500     0        0
0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64


# ランダムフォレストを用いた訓練データの学習

In [20]:
model = RandomForestClassifier(n_estimators=200, random_state=71)
model.fit(X_train, y_train)

RandomForestClassifier(n_estimators=200, random_state=71)

# 定義したモデルを予測

In [21]:
y_pred = model.predict(X_test)

# 正解ラベルを読み込む

In [22]:
y_true = pd.read_csv("../input/titanic/gender_submission.csv", index_col="PassengerId")
y_true

Unnamed: 0_level_0,Survived
PassengerId,Unnamed: 1_level_1
892,0
893,1
894,0
895,0
896,1
...,...
1305,0
1306,1
1307,0
1308,0


# 予測結果

In [23]:
print("正答率： {}".format(accuracy_score(y_true=y_true, y_pred=y_pred)))

正答率： 0.8229665071770335


# ファイルの出力

### PassengerId と 予測結果を結合し提出用の CSV ファイルとして出力する

In [24]:
submission = pd.DataFrame({"PassengerId":X_test["PassengerId"], "Survived":y_pred})
submission.to_csv("submission.csv", index=False)