# [ 타이타닉 데이터 설명 ]
* 주요 feature - Pclass, Age, SibSp, Parch, Fare
* 예측 target label - Survived

### [ 컬럼 정보 확인하기 ]
* PassengerId : 승객 번호
* Survived : 생존여부(1: 생존, 0 : 사망)
* Pclass : 승선권 클래스(1 : 1st, 2 : 2nd ,3 : 3rd)
* Name : 승객 이름
* Sex : 승객 성별
* Age : 승객 나이
* SibSp : 동반한 형제자매, 배우자 수
* Parch : 동반한 부모, 자식 수
* Ticket : 티켓의 고유 넘버
* Fare : 티켓의 요금(탑승료)
* Cabin : 객실 번호
* Embarked : 승선한 항구명(C : Cherbourg, Q : Queenstown, S : Southampton)

In [2]:
import numpy as np
import pandas as pd

## 1. csv 데이터 파일 읽어오기

In [3]:
# Load the Titanic training set; the PassengerId column becomes the row index.
raw_data = pd.read_csv(
    './data/titanic/train.csv',
    index_col='PassengerId',
)
# Preview the first five rows.
raw_data.head(n=5)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## 2. 데이터프레임 구조 확인

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
raw_data.describe()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [5]:
# (number of rows, number of columns) of the loaded frame.
raw_data.shape

(891, 11)

In [6]:
# Print the index range, per-column non-null counts and dtypes, and memory usage.
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
Survived    891 non-null int64
Pclass      891 non-null int64
Name        891 non-null object
Sex         891 non-null object
Age         714 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Ticket      891 non-null object
Fare        891 non-null float64
Cabin       204 non-null object
Embarked    889 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB


## 3. 데이터프레임의 전체 컬럼 및 결측값(NaN) 존재 컬럼 확인

In [7]:
# List every column label (PassengerId is the index, so it is not included).
raw_data.columns

Index(['Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket',
       'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [8]:
# Non-null value count per column.
raw_data.count() 
# Columns whose count is below the total row count contain missing data.

Survived    891
Pclass      891
Name        891
Sex         891
Age         714
SibSp       891
Parch       891
Ticket      891
Fare        891
Cabin       204
Embarked    889
dtype: int64

In [9]:
# Number of missing (NaN) values in each column.
raw_data.isnull().sum()

Survived      0
Pclass        0
Name          0
Sex           0
Age         177
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       687
Embarked      2
dtype: int64

## 4. 결측값(NaN) 처리하기
* 'Age' 컬럼의 결측값(NaN)을 반올림한 평균값으로 대체하기
* 'Cabin' 결측값 NaN 을 공백으로 교체하기

In [10]:
# Impute missing values by assigning the filled column back to the frame.
# Calling fillna(inplace=True) on a column selection is chained assignment:
# recent pandas emits a FutureWarning for it and, with Copy-on-Write enabled,
# the original DataFrame is left unchanged.
# Age: NaN -> overall mean age rounded to the nearest whole number.
raw_data['Age'] = raw_data['Age'].fillna(round(raw_data['Age'].mean()))
# Cabin: NaN -> a single-space placeholder string.
raw_data['Cabin'] = raw_data['Cabin'].fillna(' ')
# Re-check the remaining missing-value count per column.
raw_data.isnull().sum()

Survived    0
Pclass      0
Name        0
Sex         0
Age         0
SibSp       0
Parch       0
Ticket      0
Fare        0
Cabin       0
Embarked    2
dtype: int64

In [11]:
# For passengers whose age is missing, fill in the mean age of their own
# ticket class (Pclass). groupby/transform yields each row's class mean, so
# every class is handled in one statement without hard-coding 1, 2 and 3.
# NOTE(review): the preceding cell already replaces every missing Age with the
# overall mean, so no NaN remains here and this statement changes nothing
# unless that earlier fill is removed or this cell is run first.
raw_data['Age'] = raw_data['Age'].fillna(
    raw_data.groupby('Pclass')['Age'].transform('mean')
)


## 5. 전체 생존률 구하기
* 전체 생존률 = (생존자수/전체승객인원)*100

In [12]:
# Number of passengers per Survived value (0 = died, 1 = survived).
raw_data['Survived'].value_counts()

0    549
1    342
Name: Survived, dtype: int64

In [13]:
# Share of all passengers per Survived value, in percent with two decimals.
(raw_data['Survived'].value_counts() / len(raw_data) * 100).round(2)

0    61.62
1    38.38
Name: Survived, dtype: float64

In [14]:
# Overall survival rate in percent: the sum of the 0/1 Survived flag over the row count.
round(raw_data.Survived.sum() / raw_data.shape[0] * 100, 2)

38.38

## 6. 각 객실 등급별 분포 수 알아보기

In [15]:
# Passenger head count for ticket classes 1, 2 and 3 (True values of each mask).
p1, p2, p3 = (
    int(raw_data['Pclass'].eq(grade).sum())
    for grade in (1, 2, 3)
)

print("1등급 : %d 명" % p1)
print("2등급 : %d 명" % p2)
print("3등급 : %d 명" % p3)

1등급 : 216 명
2등급 : 184 명
3등급 : 491 명


## 7. 1등칸 / 2등칸 / 3등칸 객실 승객의 생존자 수와 비율

In [16]:
# Survivors (Survived == 1) in each ticket class.
p1_alive, p2_alive, p3_alive = (
    int((raw_data['Pclass'].eq(grade) & raw_data['Survived'].eq(1)).sum())
    for grade in (1, 2, 3)
)

# Survival rate per class in percent; p1, p2 and p3 are the class sizes
# computed in the previous cell.
per_p1_alive = p1_alive / p1 * 100
per_p2_alive = p2_alive / p2 * 100
per_p3_alive = p3_alive / p3 * 100

print("1등칸 생존자 수: {0}명, 1등칸 생존자 비율 : {1:.2f}%".format(p1_alive, per_p1_alive))
print("2등칸 생존자 수: {0}명, 2등칸 생존자 비율 : {1:.2f}%".format(p2_alive, per_p2_alive))
print("3등칸 생존자 수: {0}명, 3등칸 생존자 비율 : {1:.2f}%".format(p3_alive, per_p3_alive))

1등칸 생존자 수: 136명, 1등칸 생존자 비율 : 62.96%
2등칸 생존자 수: 87명, 2등칸 생존자 비율 : 47.28%
3등칸 생존자 수: 119명, 3등칸 생존자 비율 : 24.24%


## 8. 남자와 여자의 생존률 구하기

In [17]:
# Head count of male and female passengers.
m, f = (
    int(raw_data['Sex'].eq(sex).sum())
    for sex in ('male', 'female')
)

# Survivors (Survived == 1) among each sex.
m_alive, f_alive = (
    int((raw_data['Sex'].eq(sex) & raw_data['Survived'].eq(1)).sum())
    for sex in ('male', 'female')
)

# Survival rate per sex in percent.
per_m_alive = m_alive / m * 100
per_f_alive = f_alive / f * 100

print("남자 생존률 : %.2f %%" % per_m_alive)
print("여자 생존률 : %.2f %%" % per_f_alive)

남자 생존률 : 18.89 %
여자 생존률 : 74.20 %


## 9. 3등칸 객실 남자 승객의 생존자 수와 비율

In [18]:
# Third-class male passengers: head count, survivors, and survival rate in percent.
p3_m = int((raw_data['Pclass'].eq(3) & raw_data['Sex'].eq('male')).sum())
p3_m_alive = int(
    (
        raw_data['Pclass'].eq(3)
        & raw_data['Sex'].eq('male')
        & raw_data['Survived'].eq(1)
    ).sum()
)
per_p3_m_alive = p3_m_alive / p3_m * 100

print(
    "3등칸 남자 승객:{0}명 \n3등칸 남자생존자:{1}명 \n3등칸 남자생존자 비율:{2:.2f}%".format(
        p3_m, p3_m_alive, per_p3_m_alive
    )
)

3등칸 남자 승객:347명 
3등칸 남자생존자:47명 
3등칸 남자생존자 비율:13.54%


## 10. 탑승한 곳에 따른 생존률 구하기

In [19]:
# Missing embarkation ports are set to 'S', the most frequent value.
raw_data['Embarked'] = raw_data['Embarked'].fillna('S')

# Passengers per port of embarkation (C, Q, S).
c, q, s = (
    int(raw_data['Embarked'].eq(port).sum())
    for port in ('C', 'Q', 'S')
)

# Survivors (Survived == 1) per port of embarkation.
c_alive, q_alive, s_alive = (
    int((raw_data['Embarked'].eq(port) & raw_data['Survived'].eq(1)).sum())
    for port in ('C', 'Q', 'S')
)

# Survival rate per port in percent.
per_c_alive = c_alive / c * 100
per_q_alive = q_alive / q * 100
per_s_alive = s_alive / s * 100

print("Cherbourg 생존자 수: {0}명, Cherbourg 생존률 : {1:.2f}%".format(c_alive, per_c_alive))
print("Queenstown 생존자 수: {0}명, Queenstown 생존률 : {1:.2f}%".format(q_alive, per_q_alive))
print("Southampton 생존자 수: {0}명, Southampton 생존률 : {1:.2f}%".format(s_alive, per_s_alive))

Cherbourg 생존자 수: 93명, Cherbourg 생존률 : 55.36%
Queenstown 생존자 수: 30명, Queenstown 생존률 : 38.96%
Southampton 생존자 수: 219명, Southampton 생존률 : 33.90%


In [21]:
# Pairwise correlation between the numeric columns (DataFrame.corr default method).
# Non-numeric columns (Name, Sex, Ticket, Cabin, Embarked) are dropped explicitly:
# from pandas 2.0 DataFrame.corr() no longer skips them silently and raises on
# string data, while select_dtypes works on old and new pandas alike.
raw_data.select_dtypes(include=[np.number]).corr()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
Survived,1.0,-0.338481,-0.070657,-0.035322,0.081629,0.257307
Pclass,-0.338481,1.0,-0.329727,0.083081,0.018443,-0.5495
Age,-0.070657,-0.329727,1.0,-0.23244,-0.18033,0.090632
SibSp,-0.035322,0.083081,-0.23244,1.0,0.414838,0.159651
Parch,0.081629,0.018443,-0.18033,0.414838,1.0,0.216225
Fare,0.257307,-0.5495,0.090632,0.159651,0.216225,1.0
