## Titanic Project

### Importing Libraries

In [7]:
import numpy as np
import pandas as pd
import seaborn as sns

### Load Dataset

In [9]:
df = sns.load_dataset('titanic')

### Check Data Types

In [11]:
df.dtypes

survived          int64
pclass            int64
sex              object
age             float64
sibsp             int64
parch             int64
fare            float64
embarked         object
class          category
who              object
adult_male         bool
deck           category
embark_town      object
alive            object
alone              bool
dtype: object

In [15]:
{col: df[col].nunique() for col in df if df[col].dtype == object}

{'sex': 2, 'embarked': 3, 'who': 3, 'embark_town': 3, 'alive': 2}

### Changing Data Types

In [70]:
df = df.astype(
    {
        'alive': 'category',
        'sex': 'category',
        'embarked': 'category',
        'who': 'category',
        'embark_town': 'category'
    }
)

In [71]:
df.sex = df.sex.cat.codes
df.alive = df.alive.cat.codes

In [58]:
df.dtypes

survived          int64
pclass            int64
sex                int8
age             float64
sibsp             int64
parch             int64
fare            float64
embarked       category
class          category
who            category
adult_male         bool
deck           category
embark_town    category
alive              int8
alone              bool
dtype: object

### Check Correlations

In [73]:
df

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,1,22.0,1,0,7.2500,S,Third,man,True,,Southampton,0,False
1,1,1,0,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,1,False
2,1,3,0,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,1,True
3,1,1,0,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,1,False
4,0,3,1,35.0,0,0,8.0500,S,Third,man,True,,Southampton,0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,1,27.0,0,0,13.0000,S,Second,man,True,,Southampton,0,True
887,1,1,0,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,1,True
888,0,3,0,,1,2,23.4500,S,Third,woman,False,,Southampton,0,False
889,1,1,1,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,1,True


In [110]:
df.corr(numeric_only=True).style.highlight_max(color='Green')

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,adult_male,alive,alone
survived,1.0,-0.338481,-0.543351,-0.077221,-0.035322,0.081629,0.257307,-0.55708,1.0,-0.203367
pclass,-0.338481,1.0,0.1319,-0.369226,0.083081,0.018443,-0.5495,0.094035,-0.338481,0.135207
sex,-0.543351,0.1319,1.0,0.093254,-0.114631,-0.245489,-0.182333,0.908578,-0.543351,0.303646
age,-0.077221,-0.369226,0.093254,1.0,-0.308247,-0.189119,0.096067,0.280328,-0.077221,0.19827
sibsp,-0.035322,0.083081,-0.114631,-0.308247,1.0,0.414838,0.159651,-0.253586,-0.035322,-0.584471
parch,0.081629,0.018443,-0.245489,-0.189119,0.414838,1.0,0.216225,-0.349943,0.081629,-0.583398
fare,0.257307,-0.5495,-0.182333,0.096067,0.159651,0.216225,1.0,-0.182024,0.257307,-0.271832
adult_male,-0.55708,0.094035,0.908578,0.280328,-0.253586,-0.349943,-0.182024,1.0,-0.55708,0.404744
alive,1.0,-0.338481,-0.543351,-0.077221,-0.035322,0.081629,0.257307,-0.55708,1.0,-0.203367
alone,-0.203367,0.135207,0.303646,0.19827,-0.584471,-0.583398,-0.271832,0.404744,-0.203367,1.0


In [112]:
df.drop(columns='alive')

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alone
0,0,3,1,22.0,1,0,7.2500,S,Third,man,True,,Southampton,False
1,1,1,0,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,False
2,1,3,0,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,True
3,1,1,0,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,False
4,0,3,1,35.0,0,0,8.0500,S,Third,man,True,,Southampton,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,1,27.0,0,0,13.0000,S,Second,man,True,,Southampton,True
887,1,1,0,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,True
888,0,3,0,,1,2,23.4500,S,Third,woman,False,,Southampton,False
889,1,1,1,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,True


### Describe Data

In [114]:
df.describe()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,alive
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0,891.0
mean,0.383838,2.308642,0.647587,29.699118,0.523008,0.381594,32.204208,0.383838
std,0.486592,0.836071,0.47799,14.526497,1.102743,0.806057,49.693429,0.486592
min,0.0,1.0,0.0,0.42,0.0,0.0,0.0,0.0
25%,0.0,2.0,0.0,20.125,0.0,0.0,7.9104,0.0
50%,0.0,3.0,1.0,28.0,0.0,0.0,14.4542,0.0
75%,1.0,3.0,1.0,38.0,1.0,0.0,31.0,1.0
max,1.0,3.0,1.0,80.0,8.0,6.0,512.3292,1.0


>  Dataset is reletively imbalanced!

In [117]:
sum(df.embark_town.str.get(0) == df.embarked) / df.shape[0]

0.9977553310886644

In [119]:
df.drop(columns='embark_town', inplace=True)

KeyError: "['embark_town'] not found in axis"

### Missing Values