In [None]:
# Mount Google Drive at /content/drive so files stored there can be read by path.
# The google.colab package is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Numerical, tabular and plotting libraries used throughout the notebook.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Plot appearance: matplotlib's 'seaborn-v0_8' style sheet, then seaborn's
# settings with fonts scaled by 2.5.
plt.style.use('seaborn-v0_8')
sns.set(font_scale = 2.5)

# missingno draws the missing-value matrix / bar charts further down.
import missingno as msno

# NOTE(review): this suppresses every warning for the rest of the session,
# which also hides deprecation notices from the plotting libraries.
import warnings
warnings.filterwarnings('ignore')

# IPython magic: render matplotlib figures inline in the cell output.
%matplotlib inline

## 순서

1. 데이터셋 확인

2. EDA

3. feature engineering

4. 모델 만들기

5. 모델 학습 및 예측

6. 모델 평가

In [None]:
# Directory holding the Titanic CSV files on the mounted Google Drive.
# The trailing slash matters: file names are appended with plain string
# concatenation (path + "train.csv").
# NOTE(review): absolute, user-specific path — adjust when running elsewhere.
path = '/content/drive/MyDrive/Kaggle/Kaggle_Competition/Code_Practice/dataset/titanic/'

In [None]:
# 1. Check the dataset

# Read the train and test splits from the dataset directory in one pass.
df_train , df_test = (
    pd.read_csv(path + csv_name) for csv_name in ("train.csv" , "test.csv")
)

In [None]:
# Preview the first five rows of the training set.
df_train.head(n = 5)

In [None]:
# Summary statistics for both splits.
# A notebook cell only renders the value of its LAST expression, so a bare
# `df_train.describe()` followed by `df_test.describe()` silently drops the
# training summary. display() (provided by the IPython kernel) renders both.
display(df_train.describe() , df_test.describe())

In [None]:
# 1.1 Null data check
# Print, for every training column, the percentage of entries that are NaN.
nan_report = 'column: {:>10}\t Percent of NaN value {:.2f}%'
for col in df_train.columns:
    column_values = df_train[col]
    missing_ratio = column_values.isnull().sum() / column_values.shape[0]
    msg = nan_report.format(col , 100 * missing_ratio)
    print(msg)

In [None]:
# Same NaN-percentage report as for the training set, here for the test set.
nan_report = 'column: {:>10}\t Percent of NaN value {:.2f}%'
for col in df_test.columns:
    column_values = df_test[col]
    missing_ratio = column_values.isnull().sum() / column_values.shape[0]
    msg = nan_report.format(col , 100 * missing_ratio)
    print(msg)

In [None]:
# missingno nullity matrix for the whole training frame.
# (Slicing with .iloc[: , :] selects every row and column, so the frame is
# passed directly.)
msno.matrix(
    df = df_train ,
    figsize = (8 , 8) ,
    color = (0.8 , 0.5 , 0.2) ,
)

In [None]:
# missingno bar chart for the whole training frame.
# (Slicing with .iloc[: , :] selects every row and column, so the frame is
# passed directly.)
msno.bar(
    df = df_train ,
    figsize = (8 , 8) ,
    color = (0.8 , 0.5 , 0.2) ,
)

In [None]:
# missingno bar chart for the whole test frame.
# (Slicing with .iloc[: , :] selects every row and column, so the frame is
# passed directly.)
msno.bar(
    df = df_test ,
    figsize = (8 , 8) ,
    color = (0.8 , 0.5 , 0.2) ,
)

In [None]:
# 1.2 Check the target label

# Two side-by-side panels: share of each Survived value (pie) and raw counts.
f , ax = plt.subplots(nrows = 1 , ncols = 2 , figsize = (18 , 8))
pie_ax , count_ax = ax

# Left panel: pie chart; the second wedge is pulled out slightly.
survived_counts = df_train['Survived'].value_counts()
survived_counts.plot.pie(
    ax = pie_ax ,
    explode = [0 , 0.1] ,
    autopct = '%1.1f%%' ,
    shadow = True
)
pie_ax.set_title('Pie plot - Survived')
pie_ax.set_ylabel('')

# Right panel: count plot of the same column.
sns.countplot(x = df_train['Survived'] , ax = count_ax)
count_ax.set_title('Count plot - Survived')

plt.show()

In [None]:
# 2. EDA

# 2.1 Pclass

# Number of non-null Survived entries in each passenger class.
df_train.groupby('Pclass')[['Survived']].count()

In [None]:
# Sum of the Survived column within each passenger class.
df_train.groupby('Pclass')[['Survived']].sum()

In [None]:
# Pclass x Survived contingency table with row/column totals, colour-graded.
pclass_vs_survived = pd.crosstab(
    index = df_train['Pclass'] ,
    columns = df_train['Survived'] ,
    margins = True
)
pclass_vs_survived.style.background_gradient(cmap = 'summer_r')

In [None]:
# Mean of Survived per passenger class, highest first, drawn as a bar chart.
survival_rate_by_pclass = (
    df_train.groupby('Pclass')[['Survived']]
    .mean()
    .sort_values(by = 'Survived' , ascending = False)
)
survival_rate_by_pclass.plot.bar()

In [None]:
# Vertical offset used for both panel titles.
y_position = 1.02
f , ax = plt.subplots(nrows = 1 , ncols = 2 , figsize = (18 , 8))
count_ax , survival_ax = ax

# Left panel: passengers per class. The three colours are applied in the
# order value_counts() returns the classes (presumably most frequent first —
# confirm against the data if the colour-to-class mapping matters).
pclass_counts = df_train['Pclass'].value_counts()
pclass_counts.plot.bar(
    ax = count_ax ,
    color = ['#CD7F32' , '#FFDF00' , '#D3D3D3']
)
count_ax.set_title('Number of Passengers By Pclass' , y = y_position)
count_ax.set_ylabel('Count')

# Right panel: per-class counts split by the Survived value.
sns.countplot(data = df_train , x = 'Pclass' , hue = 'Survived' , ax = survival_ax)
survival_ax.set_title('Pclass : Survived vs Dead' , y = y_position)

plt.show()

In [None]:
# 2.2 Sex
f , ax = plt.subplots(nrows = 1 , ncols = 2 , figsize = (18 , 8))
rate_ax , count_ax = ax

# Left panel: mean of Survived for each Sex value.
survival_rate_by_sex = df_train.groupby('Sex')[['Survived']].mean()
survival_rate_by_sex.plot.bar(ax = rate_ax)
rate_ax.set_title('Survived vs Sex')

# Right panel: per-Sex counts split by the Survived value.
sns.countplot(data = df_train , x = 'Sex' , hue = 'Survived' , ax = count_ax)
count_ax.set_title('Sex : Survived vs Dead')

plt.show()

In [None]:
# Mean of Survived per Sex as a flat table (Sex kept as a column), highest first.
sex_and_survived = df_train[['Sex' , 'Survived']]
sex_survival_rate = sex_and_survived.groupby('Sex' , as_index = False).mean()
sex_survival_rate.sort_values(by = 'Survived' , ascending = False)

In [None]:
# Sex x Survived contingency table with row/column totals, colour-graded.
sex_vs_survived = pd.crosstab(
    index = df_train['Sex'] ,
    columns = df_train['Survived'] ,
    margins = True
)
sex_vs_survived.style.background_gradient(cmap = 'summer_r')

In [None]:
# 2.3 Both Sex and Pclass
# Point plot of Survived against Pclass, one line per Sex value.
point_ax = sns.pointplot(x = 'Pclass' , y = 'Survived' , hue = 'Sex' , data = df_train)
point_ax.legend(loc = 'upper right')