# 参考資料
**書籍**


**Kaggle**


**自分で作成したファイル**


**GCI**
- 0811_Mayumi_Nakano.ipynb

## ライブラリ・データセットのインポート

In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder

In [2]:
FILE_PATH = ''

train = pd.read_csv(FILE_PATH + 'train.csv')
test = pd.read_csv(FILE_PATH + 'test.csv')

In [3]:
test['Survived'] = np.nan
df = pd.concat([train, test])
df.reset_index(drop=True)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
1304,1305,,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
1305,1306,,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
1306,1307,,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
1307,1308,,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


## 前処理

### 欠損値補完

欠損値のあるカラム
* train, test: 'Cabin', 'Age'
* train: 'Embarked'
* test: 'Fare'

* Age: Sex, Pclassでグループ分けして、それの**中央値**で補完
* Fare: (引用: https://3pysci.com/kaggle-titanic-8/)
* Cabin: Missingを表す**"M"**で補完。後の特徴量エンジニアリングに用いる。
* Embarked: **最頻値**である**S**で埋める

In [4]:
# Age
medians = df.groupby(['Sex', 'Pclass'])['Age'].median()
df['Age'] = df.groupby(['Sex', 'Pclass'])['Age'].transform(lambda x: x.fillna(x.median()))

# Fare
df["Fare"] = df["Fare"].fillna(float(7.8875))

# Cabin
df['Cabin'] = df["Cabin"].fillna("M")

# Embarked
df["Embarked"] = df["Embarked"].fillna("S")

## 特徴量エンジニアリング

* 特徴量の追加
    - 家族に関する特徴量を追加
        - FamilySize を追加
        - Family_Survival（家族の死亡率） を追加

* 値のグループ化
    - Ticket のグループ化
    - Name（敬称, 苗字） のグループ化
    - Cabin のグループ化
    - Fare(+ラベルエンコーディング)
    - Age(+ラベルエンコーディング)

* ラベルエンコーディング
    - Embarked
    - Sex
    - Fare(+値のグループ化)
    - Age(+値のグループ化)

### 特徴量の追加

#### 家族に関する特徴量を追加

In [5]:
df["FamilySize"] = df["Parch"] + df["SibSp"] + 1

# （Family_Survival列作成のコードに関して引用: https://www.kaggle.com/code/konstantinmasich/titanic-0-82-0-83）
# 名字を表す列Last_Nameを取得
df['Last_Name'] = df['Name'].apply(lambda x: str.split(x, ",")[0])
# 家族の死亡率を表す列Family_Survivalをデフォルトで作成。（0:生存, 1:死亡）
DEFAULT_SURVIVAL_VALUE = 0.5
df['Family_Survival'] = DEFAULT_SURVIVAL_VALUE
# Last_NameとFareでdfの特定列をグループ化。Last_NameとFareが一致していたら同じ家族である可能性が高い。grpはキーでgrp_dfはそのグループに属するdataframe
for grp, grp_df in df[['Survived','Name', 'Last_Name', 'Fare', 'Ticket', 'PassengerId', 'SibSp', 'Parch', 'Age', 'Cabin']].groupby(['Last_Name', 'Fare']):
    # もしグループ内に1人以上いる場合
    if (len(grp_df) != 1):
        # 特定の家族グループ内の各乗客に対してループを開始
        for ind, row in grp_df.iterrows():
            # 自分以外の家族メンバーの乗客の死亡率
            smax = grp_df.drop(ind)['Survived'].max()
            smin = grp_df.drop(ind)['Survived'].min()
            passID = row['PassengerId']
            # もし家族内で少なくとも1人死んでいたらFmaily_Survivalを1にする
            if (smax == 1.0):
                df.loc[df['PassengerId'] == passID, 'Family_Survival'] = 1
            elif (smin==0.0):
                df.loc[df['PassengerId'] == passID, 'Family_Survival'] = 0

print("Number of passengers with family survival information:",
      df.loc[df['Family_Survival']!=0.5].shape[0])

Number of passengers with family survival information: 420


#### Name（敬称, 苗字） に関する特徴量

In [6]:
# Nameから敬称を取り出し、Salutationとして新たに追加
title_list = []
def extract_title(name):
    for name_split in name.split(): # 名前を分割
        if name_split.endswith("."): # もし分割した名前の終わりがピリオドで
            if name_split != "L.": #もし分割した名前が"L."でなかったら
                title_list.append(name_split)
                return name_split  # 敬称を返す
    return "" # 敬称が見つからなかった場合、空文字を返す
df['Salutation'] = df['Name'].apply(extract_title)

# 各敬称(Salutation)を数字でグループ分けしてhonor列に新たに追加（引用: https://banga-heavy.com/kaggle%E3%82%BF%E3%82%A4%E3%82%BF%E3%83%8B%E3%83%83%E3%82%AF%E3%83%87%E3%83%BC%E3%82%BF%E3%81%A781-100-lightgbmxoptunax%E4%BA%A4%E5%B7%AE%E6%A4%9C%E8%A8%BC/）
df["honor"] = df["Salutation"].copy()
df["honor"]=df["honor"].replace(("Don.","Rev.","Jonkheer.","Capt."),1)
df["honor"]=df["honor"].replace(("Mr."),2)
df["honor"]=df["honor"].replace(("Master.","Dr.","Major.","Col."),3)
df["honor"]=df["honor"].replace(("Miss.",),4)
df["honor"]=df["honor"].replace(("Mrs.","Mme.","Ms.","Lady.","Sir.","Mlle.","Countess.","Dona."),5)

# Nameの頭文字と長さを取得し、Nameini, Namelenとして新たに追加。Nameiniはラベルエンコーディング（引用: https://banga-heavy.com/kaggle%E3%82%BF%E3%82%A4%E3%82%BF%E3%83%8B%E3%83%83%E3%82%AF%E3%83%87%E3%83%BC%E3%82%BF%E3%81%A781-100-lightgbmxoptunax%E4%BA%A4%E5%B7%AE%E6%A4%9C%E8%A8%BC/）
df["Nameini"]=df["Name"].map(lambda x: str(x)[0])
df["Nameini"]=LabelEncoder().fit_transform(df["Nameini"])
df["Namelen"]=df["Name"].map(lambda x: len(str(x)))

### 値のグループ化

#### Ticket

In [7]:
# Ticketの頭文字(Ticketini)と文字列の長さ(Ticketlen)を追加 (参考: https://lp-tech.net/articles/0QUUd?page=2)
df['Ticketini'] = df['Ticket'].apply(lambda x: str(x)[0])
# 死んでいる確率が低いものから順に0, 1, 2と番号付け
df['Ticketini'] = np.where((df['Ticketini']).isin(['1', '2', "9", "C", "F", 'P', 'S']), df['Ticketini'], np.where((df['Ticketini']).isin(["3", '4', '5', '6', '7',"8", "A", 'L','W']), 2,2))
df['Ticketini'] = df['Ticketini'].replace("1",0).replace("2",1).replace("9",0).replace("C",1).replace("F",0).replace("P",0).replace("S",1)
df["Ticketlen"] = df["Ticket"].map(lambda x: len(str(x)))

# dfをTicketでグループ分け
for _, grp_df in df.groupby('Ticket'):
    # グループに1人以上の乗客がいる時（チケットが同じなら同じCabin/ 家族?）
    if (len(grp_df) != 1):
        for ind, row in grp_df.iterrows():
            # Family_Survival の値が 0（誰かが生きている）または 0.5（デフォルト値）であるとき
            if (row['Family_Survival'] == 0) | (row['Family_Survival']== 0.5):
                # 自分以外の家族メンバーの乗客の死亡率
                smax = grp_df.drop(ind)['Survived'].max()
                smin = grp_df.drop(ind)['Survived'].min()
                passID = row['PassengerId']
                # もし家族内で少なくとも1人死んでいたらFmaily_Survivalを1にする
                if (smax == 1.0):
                    df.loc[df['PassengerId'] == passID, 'Family_Survival'] = 1
                elif (smin==0.0):
                    df.loc[df['PassengerId'] == passID, 'Family_Survival'] = 0
print("Number of passenger with family/group survival information: " +str(df[df['Family_Survival']!=0.5].shape[0]))

Number of passenger with family/group survival information: 546


#### Cabin

In [8]:
# Cabinの頭文字を取得 (Mはもともと欠損していたことを表す。)
df["Cabin_init"] = df["Cabin"].map(lambda x:str(x)[0])
# Cabinの頭文字がTの人はPclass = 1だったので、同様にPclass = 1であるAに置き換え。
df.loc[df["Cabin_init"] == "T", "Cabin_init"] = "A"
# 死ぬ確率が同等だったものでグループ化
df.loc[df["Cabin_init"].isin(["A", "C"]), "Cabin_init"] = "AC"
df.loc[df["Cabin_init"].isin(["F", "G"]), "Cabin_init"] = "FG"
df.loc[df["Cabin_init"].isin(["B", "D", "E"]), "Cabin_init"] = "BDE"
# 死ぬ確率が低かったものから順に番号付け
df['Cabin_init'] = df['Cabin_init'].replace("BDE",0).replace("AC",1).replace("FG",2).replace("M",3)

#### Fare, Age

In [9]:
# Fare
df['FareBin'] = pd.qcut(df['Fare'], 13)

# Age
df['AgeBin'] = pd.qcut(df['Age'], 10)

### ラベルエンコーディング

titanic_f_stkz_e_ai.ipynb

In [10]:
label_encoder = LabelEncoder()
label_encoder_cols = ["Sex", "Embarked", "Fare", "Age"]
df[label_encoder_cols] = df[label_encoder_cols].apply(label_encoder.fit_transform)

In [11]:
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,...,Family_Survival,Salutation,honor,Nameini,Namelen,Ticketini,Ticketlen,Cabin_init,FareBin,AgeBin
0,1,0.0,3,"Braund, Mr. Owen Harris",1,32,1,0,A/5 21171,20,...,0.5,Mr.,2,1,23,2,9,3,"(-0.001, 7.25]","(21.0, 22.0]"
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,58,1,0,PC 17599,237,...,0.5,Mrs.,5,2,51,0,8,1,"(56.496, 83.475]","(34.0, 40.0]"
2,3,1.0,3,"Heikkinen, Miss. Laina",0,39,0,0,STON/O2. 3101282,50,...,0.5,Miss.,4,7,22,1,16,3,"(7.896, 8.05]","(25.0, 26.0]"
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,54,1,0,113803,217,...,0.0,Mrs.,5,5,44,0,6,1,"(34.075, 56.496]","(34.0, 40.0]"
4,5,0.0,3,"Allen, Mr. William Henry",1,54,0,0,373450,52,...,0.5,Mr.,2,0,24,2,6,3,"(7.896, 8.05]","(34.0, 40.0]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,,3,"Spector, Mr. Woolf",1,38,0,0,A.5. 3236,52,...,0.5,Mr.,2,18,18,2,9,3,"(7.896, 8.05]","(22.0, 25.0]"
414,1306,,1,"Oliva y Ocana, Dona. Fermina",0,60,0,0,PC 17758,261,...,1.0,Dona.,5,14,28,0,8,1,"(83.475, 512.329]","(34.0, 40.0]"
415,1307,,3,"Saether, Mr. Simon Sivertsen",1,59,0,0,SOTON/O.Q. 3101262,20,...,0.5,Mr.,2,18,28,1,18,3,"(-0.001, 7.25]","(34.0, 40.0]"
416,1308,,3,"Ware, Mr. Frederick",1,38,0,0,359309,52,...,0.5,Mr.,2,22,19,2,6,3,"(7.896, 8.05]","(22.0, 25.0]"


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 24 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   PassengerId      1309 non-null   int64   
 1   Survived         891 non-null    float64 
 2   Pclass           1309 non-null   int64   
 3   Name             1309 non-null   object  
 4   Sex              1309 non-null   int64   
 5   Age              1309 non-null   int64   
 6   SibSp            1309 non-null   int64   
 7   Parch            1309 non-null   int64   
 8   Ticket           1309 non-null   object  
 9   Fare             1309 non-null   int64   
 10  Cabin            1309 non-null   object  
 11  Embarked         1309 non-null   int64   
 12  FamilySize       1309 non-null   int64   
 13  Last_Name        1309 non-null   object  
 14  Family_Survival  1309 non-null   float64 
 15  Salutation       1309 non-null   object  
 16  honor            1309 non-null   int64   
 

In [13]:
train = df[:891]
test = df[891:]
# cols = df.columns.drop(['PassengerId', 'Survived', 'Name', 'Age', 'Ticket', 'Cabin', 'Last_Name', 'Salutation'])
cols=['Pclass', 'SibSp','Parch',
       'Embarked', 'Sex', 'FamilySize',
       'Ticketini', 'Ticketlen', 'honor',
       'Cabin_init', 'Namelen', 'Nameini',
       'Family_Survival', 'FareBin', 'AgeBin']
x = train[cols]
t = train.iloc[:, 1:2]
test = test[cols]
print(x.columns)

Index(['Pclass', 'SibSp', 'Parch', 'Embarked', 'Sex', 'FamilySize',
       'Ticketini', 'Ticketlen', 'honor', 'Cabin_init', 'Namelen', 'Nameini',
       'Family_Survival', 'FareBin', 'AgeBin'],
      dtype='object')


In [14]:
# のちのモデル学習で用いるために.values変換
X_values = x.values
y_values=t.values