In [1]:
import pandas as pd

##### 데이터 피처 설명
- pclass : Passenger Class, 승객 등급
- survived : 생존 여부
- name : 승객 이름
- sex : 승객 성별
- age : 승객 나이
- sibsp : 탑승 한 형제/배우자 수
- parch : 탑승 한 부모/자녀 수
- ticket : 티켓 번호
- fare : 승객 지불 요금
- cabin : 선실 이름
- embarked : 승선항 (C = 쉘 부르그, Q = 퀸즈타운, S = 사우스 햄튼)

### 데이터 로드

In [2]:
train = pd.read_csv("data/titanic_train1.csv", index_col="PassengerId")
train.shape

(891, 11)

In [3]:
train.head()
# train.tail()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### Column 별 데이터 분석

###### 전체 생존률

In [4]:
# 전체 생존률
print(train["Survived"].mean() * 100)

survived_rate = train["Survived"].mean()
survived_rate = survived_rate * 100
print(f"생존률 = {survived_rate:.1f}%")

38.38383838383838
생존률 = 38.4%


In [5]:
# 0 : 사망자 총 인원수
# 1 : 생존자 총 인원수
train["Survived"].value_counts()

0    549
1    342
Name: Survived, dtype: int64

In [6]:
# 비율
train["Survived"].value_counts() / len(train["Survived"])

0    0.616162
1    0.383838
Name: Survived, dtype: float64

In [7]:
train["Survived"].head()

PassengerId
1    0
2    1
3    1
4    1
5    0
Name: Survived, dtype: int64

##### Survived 이름 지정

In [8]:
train.loc[train["Survived"] == 0, "Survived(Status)"] = "Perish"
train.loc[train["Survived"] == 1, "Survived(Status)"] = "Survived"

In [9]:
train[["Survived","Survived(Status)"]].head()

Unnamed: 0_level_0,Survived,Survived(Status)
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0,Perish
2,1,Survived
3,1,Survived
4,1,Survived
5,0,Perish


In [10]:
train["Survived(Status)"].value_counts()

Perish      549
Survived    342
Name: Survived(Status), dtype: int64

##### Pclass별 생존률

In [11]:
pd.pivot_table(data=train, index="Pclass", values="Survived")

Unnamed: 0_level_0,Survived
Pclass,Unnamed: 1_level_1
1,0.62963
2,0.472826
3,0.242363


In [12]:
# Pclass label 이름 지정
train.loc[train["Pclass"] == 1, "Pclass(Status)"] = "First Class"
train.loc[train["Pclass"] == 2, "Pclass(Status)"] = "Business"
train.loc[train["Pclass"] == 3, "Pclass(Status)"] = "Economy"

In [13]:
train[["Pclass","Pclass(Status)"]].head()

Unnamed: 0_level_0,Pclass,Pclass(Status)
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,3,Economy
2,1,First Class
3,3,Economy
4,1,First Class
5,3,Economy


In [14]:
# Pclass(Status)별 생존률
pd.pivot_table(data=train,index="Pclass(Status)",values="Survived")

Unnamed: 0_level_0,Survived
Pclass(Status),Unnamed: 1_level_1
Business,0.472826
Economy,0.242363
First Class,0.62963


##### Embarked 별 생존률

In [15]:
# Embarked 이름 정의
train.loc[train["Embarked"] == "S", "Embarked(Status)"] = "Southampton"
train.loc[train["Embarked"] == "C", "Embarked(Status)"] = "Cherbourg"
train.loc[train["Embarked"] == "Q", "Embarked(Status)"] = "Queenstown"

train[["Embarked","Embarked(Status)"]].head()

Unnamed: 0_level_0,Embarked,Embarked(Status)
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,S,Southampton
2,C,Cherbourg
3,S,Southampton
4,S,Southampton
5,S,Southampton


In [16]:
# Embarked(status) 별 생존률
pd.pivot_table(data=train, index="Embarked(Status)", values="Survived")

Unnamed: 0_level_0,Survived
Embarked(Status),Unnamed: 1_level_1
Cherbourg,0.553571
Queenstown,0.38961
Southampton,0.336957


##### 변수 Embarked : one hot encoding

In [17]:
train["Embarked(C)"] = train["Embarked"] == "C"
train["Embarked(S)"] = train["Embarked"] == "S"
train["Embarked(Q)"] = train["Embarked"] == "Q"
train[["Embarked", "Embarked(Status)", "Embarked(C)", "Embarked(S)", "Embarked(Q)"]].head(10)

Unnamed: 0_level_0,Embarked,Embarked(Status),Embarked(C),Embarked(S),Embarked(Q)
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,S,Southampton,False,True,False
2,C,Cherbourg,True,False,False
3,S,Southampton,False,True,False
4,S,Southampton,False,True,False
5,S,Southampton,False,True,False
6,Q,Queenstown,False,False,True
7,S,Southampton,False,True,False
8,S,Southampton,False,True,False
9,S,Southampton,False,True,False
10,C,Cherbourg,True,False,False


##### 변수 Pclass : one hot encoding

In [18]:
train["Pclass(F)"] = train["Pclass"] == 1
train["Pclass(B)"] = train["Pclass"] == 2
train["Pclass(E)"] = train["Pclass"] == 3
train[["Pclass", "Pclass(Status)", "Pclass(F)", "Pclass(B)", "Pclass(E)"]].head(10)

Unnamed: 0_level_0,Pclass,Pclass(Status),Pclass(F),Pclass(B),Pclass(E)
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,3,Economy,False,False,True
2,1,First Class,True,False,False
3,3,Economy,False,False,True
4,1,First Class,True,False,False
5,3,Economy,False,False,True
6,3,Economy,False,False,True
7,1,First Class,True,False,False
8,3,Economy,False,False,True
9,3,Economy,False,False,True
10,2,Business,False,True,False


##### 나이대 별 생존률

In [19]:
# Age 기술통계값
train["Age"].describe()

count    714.000000
mean      29.699118
std       14.526497
min        0.420000
25%       20.125000
50%       28.000000
75%       38.000000
max       80.000000
Name: Age, dtype: float64

In [20]:
# 각 Pclass의 객실 별 승객의 평균 나이
print(train[train["Pclass"] == 1]["Age"].mean())
print(train[train["Pclass"] == 2]["Age"].mean())
print(train[train["Pclass"] == 3]["Age"].mean())

38.233440860215055
29.87763005780347
25.14061971830986


In [21]:
# 또는
train.groupby(["Pclass"])["Age"].mean()

Pclass
1    38.233441
2    29.877630
3    25.140620
Name: Age, dtype: float64

In [22]:
# 나이대 생성
train.loc[train["Age"] < 15, "AgeType"] = "Young"
train.loc[(train["Age"] >= 15) & (train["Age"] < 30), "AgeType"] = "Medium"
train.loc[train["Age"] >= 30, "AgeType"] = "Old"

train[["Age", "AgeType"]].head(10)

Unnamed: 0_level_0,Age,AgeType
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,22.0,Medium
2,38.0,Old
3,26.0,Medium
4,35.0,Old
5,35.0,Old
6,,
7,54.0,Old
8,2.0,Young
9,27.0,Medium
10,14.0,Young


In [23]:
# 나이대 별 생존률
pd.pivot_table(data=train, index="AgeType", values="Survived")

Unnamed: 0_level_0,Survived
AgeType,Unnamed: 1_level_1
Medium,0.362745
Old,0.406061
Young,0.576923


In [24]:
# 또는
train.groupby("AgeType")["Survived"].mean()

AgeType
Medium    0.362745
Old       0.406061
Young     0.576923
Name: Survived, dtype: float64

In [25]:
# Age 빈칸과 그렇지 않은 칸 각각의 생존률
train.loc[train["AgeType"].isnull(), "AgeBlank"] = "Blank"
train.loc[train["AgeType"].notnull(), "AgeBlank"] = "Not Blank"

train[["Age","AgeBlank"]]

Unnamed: 0_level_0,Age,AgeBlank
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,22.0,Not Blank
2,38.0,Not Blank
3,26.0,Not Blank
4,35.0,Not Blank
5,35.0,Not Blank
...,...,...
887,27.0,Not Blank
888,19.0,Not Blank
889,,Blank
890,26.0,Not Blank


In [26]:
train.groupby("AgeBlank")["Survived"].mean()

AgeBlank
Blank        0.293785
Not Blank    0.406162
Name: Survived, dtype: float64

##### Age 결측치 채우기

In [27]:
# Pclass 등급 별 평균으로 대체
train.groupby("Pclass")["Age"].mean()

Pclass
1    38.233441
2    29.877630
3    25.140620
Name: Age, dtype: float64

In [28]:
train["Age(fill)"] = train["Age"]

In [29]:
train[["Pclass","Age","Age(fill)"]]

Unnamed: 0_level_0,Pclass,Age,Age(fill)
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,3,22.0,22.0
2,1,38.0,38.0
3,3,26.0,26.0
4,1,35.0,35.0
5,3,35.0,35.0
...,...,...,...
887,2,27.0,27.0
888,1,19.0,19.0
889,3,,
890,1,26.0,26.0


In [30]:
train.loc[(train["Age"].isnull()) & (train["Pclass"]==1), "Age(fill)"] = train[train["Pclass"] == 1]["Age"].mean()
train.loc[(train["Age"].isnull()) & (train["Pclass"]==2), "Age(fill)"] = train[train["Pclass"] == 2]["Age"].mean()
train.loc[(train["Age"].isnull()) & (train["Pclass"]==3), "Age(fill)"] = train[train["Pclass"] == 3]["Age"].mean()

train[["Pclass","Age","Age(fill)"]]

Unnamed: 0_level_0,Pclass,Age,Age(fill)
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,3,22.0,22.00000
2,1,38.0,38.00000
3,3,26.0,26.00000
4,1,35.0,35.00000
5,3,35.0,35.00000
...,...,...,...
887,2,27.0,27.00000
888,1,19.0,19.00000
889,3,,25.14062
890,1,26.0,26.00000


In [31]:
train.loc[train["Age"].isnull()][["Pclass","Age","Age(fill)"]]

Unnamed: 0_level_0,Pclass,Age,Age(fill)
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
6,3,,25.14062
18,2,,29.87763
20,3,,25.14062
27,3,,25.14062
29,3,,25.14062
...,...,...,...
860,3,,25.14062
864,3,,25.14062
869,3,,25.14062
879,3,,25.14062


In [32]:
# 또는
train['Age(fill)'] = train['Age']
train['Age(fill)'].fillna(train.groupby('Pclass')['Age(fill)'].transform('mean'), inplace=True)

train.groupby('Pclass')['Age(fill)'].transform("mean")

PassengerId
1      25.140620
2      38.233441
3      25.140620
4      38.233441
5      25.140620
         ...    
887    29.877630
888    38.233441
889    25.140620
890    38.233441
891    25.140620
Name: Age(fill), Length: 891, dtype: float64

In [33]:
train.loc[train["Age"].isnull(), ["Pclass","Age", "Age(fill)"]].head(10)

Unnamed: 0_level_0,Pclass,Age,Age(fill)
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
6,3,,25.14062
18,2,,29.87763
20,3,,25.14062
27,3,,25.14062
29,3,,25.14062
30,3,,25.14062
32,1,,38.233441
33,3,,25.14062
37,3,,25.14062
43,3,,25.14062


##### Single에 따른 생존률

In [34]:
# Single : Sibsp & Parch 모두 0일때
train["Single"] = (train["SibSp"] == 0) & (train["Parch"] == 0)

In [35]:
train[["SibSp","Parch","Single"]]

Unnamed: 0_level_0,SibSp,Parch,Single
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,0,False
2,1,0,False
3,0,0,True
4,1,0,False
5,0,0,True
...,...,...,...
887,0,0,True
888,0,0,True
889,1,2,False
890,0,0,True


In [36]:
# Single 에 따른 생존률
pd.pivot_table(data=train, index = "Single", values= "Survived")

Unnamed: 0_level_0,Survived
Single,Unnamed: 1_level_1
False,0.50565
True,0.303538


##### FamilySize 별 생존률

In [37]:
# FamilySize = Sibsp + Parch + 자기자신(1)
train["FamilySize"] = train["SibSp"] + train["Parch"] + 1

train[["SibSp","Parch","FamilySize"]]

Unnamed: 0_level_0,SibSp,Parch,FamilySize
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,0,2
2,1,0,2
3,0,0,1
4,1,0,2
5,0,0,1
...,...,...,...
887,0,0,1
888,0,0,1
889,1,2,4
890,0,0,1


In [38]:
# FamilySize 별 생존률
pd.pivot_table(data= train, index = "FamilySize", values = "Survived")

Unnamed: 0_level_0,Survived
FamilySize,Unnamed: 1_level_1
1,0.303538
2,0.552795
3,0.578431
4,0.724138
5,0.2
6,0.136364
7,0.333333
8,0.0
11,0.0


In [39]:
# FamilySize 구간 정의
train.loc[train["FamilySize"] == 1, "FamilyType"] = "Single"
train.loc[(train["FamilySize"] >= 2) & (train["FamilySize"] < 5) , "FamilyType"] = "Nuclear"
train.loc[train["FamilySize"] >= 5, "FamilyType"] = "Big"

train[["FamilySize","FamilyType"]].head(10)

Unnamed: 0_level_0,FamilySize,FamilyType
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,2,Nuclear
2,2,Nuclear
3,1,Single
4,2,Nuclear
5,1,Single
6,1,Single
7,1,Single
8,5,Big
9,3,Nuclear
10,2,Nuclear


In [40]:
# FamilySize 구간 별 생존률
pd.pivot_table(data= train, index = "FamilyType", values = "Survived")

Unnamed: 0_level_0,Survived
FamilyType,Unnamed: 1_level_1
Big,0.16129
Nuclear,0.578767
Single,0.303538


##### 변수 FamilySize : one hot encoding

In [41]:
train["FamilyType(Single)"] = train["FamilySize"] == 1
train["FamilyType(Nuclear)"] = (train["FamilySize"] >= 2) & (train["FamilySize"] <= 4)
train["FamilyType(Big)"] = train["FamilySize"] >= 5

train[["FamilySize","FamilyType(Single)","FamilyType(Nuclear)","FamilyType(Big)"]]

Unnamed: 0_level_0,FamilySize,FamilyType(Single),FamilyType(Nuclear),FamilyType(Big)
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,2,False,True,False
2,2,False,True,False
3,1,True,False,False
4,2,False,True,False
5,1,True,False,False
...,...,...,...,...
887,1,True,False,False
888,1,True,False,False
889,4,False,True,False
890,1,True,False,False


In [42]:
# 또는

train["FamilySize(Single)"] = train["FamilyType"] == "Single"
train["FamilySize(Nuclear)"] = train["FamilyType"] == "Nuclear"
train["FamilySize(Big)"] = train["FamilyType"] == "Big"

train[["FamilySize", "FamilyType(Single)", "FamilyType(Nuclear)", "FamilyType(Big)"]].head(10)

Unnamed: 0_level_0,FamilySize,FamilyType(Single),FamilyType(Nuclear),FamilyType(Big)
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,2,False,True,False
2,2,False,True,False
3,1,True,False,False
4,2,False,True,False
5,1,True,False,False
6,1,True,False,False
7,1,True,False,False
8,5,False,False,True
9,3,False,True,False
10,2,False,True,False


### 이름 분석 및 생존률
성(First name) **','** 호칭(Title) **'.'** 이름(SurName)  
ex) Braund, Mr. Owen Harris  
성 : Braund  
호칭 : Mr  
이름 : Owen Harris

In [43]:
name = "Braund, Mr. Owen Harris"
name.split(',')[1].split('.')[0]

' Mr'

In [44]:
# 호칭(Title) 색출하기위한 함수 정의
def get_title(name):
    return name.split(',')[1].split('.')[0]

In [45]:
train["Title"] = train["Name"].apply(get_title)

In [46]:
train[["Name", "Title"]].head(10)

Unnamed: 0_level_0,Name,Title
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,"Braund, Mr. Owen Harris",Mr
2,"Cumings, Mrs. John Bradley (Florence Briggs Th...",Mrs
3,"Heikkinen, Miss. Laina",Miss
4,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",Mrs
5,"Allen, Mr. William Henry",Mr
6,"Moran, Mr. James",Mr
7,"McCarthy, Mr. Timothy J",Mr
8,"Palsson, Master. Gosta Leonard",Master
9,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",Mrs
10,"Nasser, Mrs. Nicholas (Adele Achem)",Mrs


In [47]:
# 호칭별 생존률
pd.pivot_table(train, index="Title", values="Survived") 

Unnamed: 0_level_0,Survived
Title,Unnamed: 1_level_1
Capt,0.0
Col,0.5
Don,0.0
Dr,0.428571
Jonkheer,0.0
Lady,1.0
Major,0.5
Master,0.575
Miss,0.697802
Mlle,1.0


In [48]:
# 호칭별 인원 수
train["Title"].value_counts()

 Mr              517
 Miss            182
 Mrs             125
 Master           40
 Dr                7
 Rev               6
 Mlle              2
 Major             2
 Col               2
 the Countess      1
 Capt              1
 Ms                1
 Sir               1
 Lady              1
 Mme               1
 Don               1
 Jonkheer          1
Name: Title, dtype: int64

In [49]:
# 호칭 별 사망 생존 수
pd.crosstab(train["Title"], train["Survived"], margins=True)

Survived,0,1,All
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Capt,1,0,1
Col,1,1,2
Don,1,0,1
Dr,4,3,7
Jonkheer,1,0,1
Lady,0,1,1
Major,1,1,2
Master,17,23,40
Miss,55,127,182
Mlle,0,2,2


In [50]:
# Mr, Miss, Mrs, Master 을 뺀 나머지 값은 Others로 바꿈
def get_title_status(title):
    title = title.strip()
    if title == "Mr" or title == "Miss" or title == "Mrs" or title == "Master":
        return title
    else: 
        return "Others"

In [51]:
train["Title(Status)"] = train["Title"].apply(get_title_status)

train[["Name", "Title", "Title(Status)"]].tail()

Unnamed: 0_level_0,Name,Title,Title(Status)
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
887,"Montvila, Rev. Juozas",Rev,Others
888,"Graham, Miss. Margaret Edith",Miss,Miss
889,"Johnston, Miss. Catherine Helen ""Carrie""",Miss,Miss
890,"Behr, Mr. Karl Howell",Mr,Mr
891,"Dooley, Mr. Patrick",Mr,Mr


In [52]:
# strip() : 문자 앞 공백 삭제
train["Title"][1].strip() 

'Mr'

In [53]:
# 호칭 별 생존률
pd.pivot_table(data=train, index = "Title(Status)" , values = "Survived")

Unnamed: 0_level_0,Survived
Title(Status),Unnamed: 1_level_1
Master,0.575
Miss,0.697802
Mr,0.156673
Mrs,0.792
Others,0.444444


##### 기혼 미혼자 별 생존률

In [54]:
# 기혼자 : Mr, Mrs
# 미혼자 : Master, Miss
train["Title"] = train["Title"].str.strip()

train.loc[train["Title"] == "Mr", "Married"] = True
train.loc[train["Title"] == "Master", "Married"] = False
train.loc[train["Title"] == "Mrs", "Married"] = True
train.loc[train["Title"] == "Miss", "Married"] = False

In [55]:
train[["Name","Title","Married"]]

Unnamed: 0_level_0,Name,Title,Married
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,"Braund, Mr. Owen Harris",Mr,True
2,"Cumings, Mrs. John Bradley (Florence Briggs Th...",Mrs,True
3,"Heikkinen, Miss. Laina",Miss,False
4,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",Mrs,True
5,"Allen, Mr. William Henry",Mr,True
...,...,...,...
887,"Montvila, Rev. Juozas",Rev,
888,"Graham, Miss. Margaret Edith",Miss,False
889,"Johnston, Miss. Catherine Helen ""Carrie""",Miss,False
890,"Behr, Mr. Karl Howell",Mr,True


In [56]:
# 또는 

train.loc[train["Title"].isin(["Mr", "Mrs"]), "Married"] = True  #isin() : 값이 포함된 것을 추출
train.loc[train["Title"].isin(["Master", "Miss"]), "Married"] = False

train[["Name", "Title", "Married"]].head(10)

Unnamed: 0_level_0,Name,Title,Married
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,"Braund, Mr. Owen Harris",Mr,True
2,"Cumings, Mrs. John Bradley (Florence Briggs Th...",Mrs,True
3,"Heikkinen, Miss. Laina",Miss,False
4,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",Mrs,True
5,"Allen, Mr. William Henry",Mr,True
6,"Moran, Mr. James",Mr,True
7,"McCarthy, Mr. Timothy J",Mr,True
8,"Palsson, Master. Gosta Leonard",Master,False
9,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",Mrs,True
10,"Nasser, Mrs. Nicholas (Adele Achem)",Mrs,True


In [57]:
# Married 별 생존률
pd.pivot_table(data=train, index = "Married" , values = "Survived")

Unnamed: 0_level_0,Survived
Married,Unnamed: 1_level_1
False,0.675676
True,0.280374
