<a href="https://colab.research.google.com/github/Dimildizio/WorksOnMyMachine/blob/main/TitanicEDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [457]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

%matplotlib inline

In [226]:
train_data = pd.read_csv("https://raw.githubusercontent.com/Dimildizio/WorksOnMyMachine/main/data/train.csv", encoding='cp437')
test_data = pd.read_csv("https://raw.githubusercontent.com/Dimildizio/WorksOnMyMachine/main/data/test.csv", encoding='cp437')

In [227]:
print(f'train shape: {train_data.shape}\ntest shape: {test_data.shape}')

train shape: (891, 12)
test shape: (418, 11)


In [228]:
train_data.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [229]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [477]:
null_counts = pd.DataFrame()
null_counts['Train'] = train_data.isnull().sum()
null_counts['Test'] = test_data.isnull().sum()

Unnamed: 0,Train,Test
PassengerId,0,0.0
Survived,0,
Pclass,0,0.0
Name,0,0.0
Sex,0,0.0
Age,177,86.0
SibSp,0,0.0
Parch,0,0.0
Ticket,0,0.0
Fare,0,1.0


In [232]:
print(pd.concat([train_data.drop('Survived',axis=1)['Name'], test_data['Name']], axis=0).nunique() - (train_data.shape[0]+test_data.shape[0]))

-2


In [233]:
pd.concat([train_data[train_data['Name'].isin(test_data['Name'])], test_data[test_data['Name'].isin(train_data['Name'])]])

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
289,290,1.0,3,"Connolly, Miss. Kate",female,22.0,0,0,370373,7.75,,Q
696,697,0.0,3,"Kelly, Mr. James",male,44.0,0,0,363592,8.05,,S
0,892,,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
6,898,,3,"Connolly, Miss. Kate",female,30.0,0,0,330972,7.6292,,Q


In [234]:
train_data.groupby(['Pclass', 'Survived'])['Survived'].count()

Pclass  Survived
1       0            80
        1           136
2       0            97
        1            87
3       0           372
        1           119
Name: Survived, dtype: int64

In [235]:
train_data.groupby(['Parch', 'Survived'])['Survived'].count()

Parch  Survived
0      0           445
       1           233
1      0            53
       1            65
2      0            40
       1            40
3      0             2
       1             3
4      0             4
5      0             4
       1             1
6      0             1
Name: Survived, dtype: int64

In [236]:
train_data.groupby(['Sex', 'Survived'])['Survived'].count()

Sex     Survived
female  0            81
        1           233
male    0           468
        1           109
Name: Survived, dtype: int64

In [237]:
train_data.groupby(['Embarked', 'Survived'])['Survived'].count()

Embarked  Survived
C         0            75
          1            93
Q         0            47
          1            30
S         0           427
          1           217
Name: Survived, dtype: int64

In [238]:
train_data[train_data['Ticket'].duplicated()].sort_values(by='Ticket').head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
504,505,1,1,"Maioni, Miss. Roberta",female,16.0,0,0,110152,86.5,B79,S
759,760,1,1,"Rothes, the Countess. of (Lucy Noel Martha Dye...",female,33.0,0,0,110152,86.5,B77,S
558,559,1,1,"Taussig, Mrs. Emil (Tillie Mandelbaum)",female,39.0,1,1,110413,79.65,E67,S
585,586,1,1,"Taussig, Miss. Ruth",female,18.0,0,2,110413,79.65,E68,S
475,476,0,1,"Clifford, Mr. George Quincy",male,,0,0,110465,52.0,A14,S


In [239]:
test_data[test_data['Ticket'].duplicated()].sort_values(by='Ticket').head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
402,1294,1,"Gibson, Miss. Dorothy Winifred",female,22.0,0,1,112378,59.4,,C
75,967,1,"Keeping, Mr. Edwin",male,32.5,0,0,113503,211.5,C132,C
218,1110,1,"Widener, Mrs. George Dunton (Eleanor Elkins)",female,50.0,1,1,113503,211.5,C80,C
407,1299,1,"Widener, Mr. George Dunton",male,50.0,1,1,113503,211.5,C80,C
306,1198,1,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S


In [481]:
print('Test titles:', test_data['Name'].apply(lambda x: x.split(',')[1].split('.')[0].strip()).unique())
print('Train title:', train_data['Name'].apply(lambda x: x.split(',')[1].split('.')[0].strip()).unique())

Test titles: ['Mr' 'Mrs' 'Miss' 'Master' 'Ms' 'Col' 'Rev' 'Dr' 'Dona']
Train title: ['Mr' 'Mrs' 'Miss' 'Master' 'Don' 'Rev' 'Dr' 'Mme' 'Ms' 'Major' 'Lady'
 'Sir' 'Mlle' 'Col' 'Capt' 'the Countess' 'Jonkheer']


In [241]:
test_data[test_data['Name'].str.contains('|'.join(['Dona']))]

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
239,1131,1,"Douglas, Mrs. Walter Donald (Mahala Dutton)",female,48.0,1,0,PC 17761,106.425,C86,C
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9,C105,C


## Predict age NaNs

In [365]:
def encode(data, columns_to_encode):
  data = data.copy()
  label_encoder = LabelEncoder()
  for column in columns_to_encode:
    data[column] = label_encoder.fit_transform(data[column].astype(str))
  return data

In [463]:
def predict_age(data):

  df = data.copy()
  df = encode(df, ['Sex', 'Title', 'Deck', 'Embarked'])
  to_drop = ['Age']
  X = df.dropna(subset=['Age']).drop(to_drop, axis=1)
  y = df.dropna(subset=['Age'])['Age']
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

  model = RandomForestRegressor()
  model.fit(X_train, y_train)
  #print('error on age prediction:', mean_squared_error(y_test, model.predict(X_test)))

  X_pred = df.loc[df['Age'].isnull()].drop(to_drop, axis=1)
  predicted = model.predict(X_pred)
  df.loc[df['Age'].isnull(), 'Age'] = np.round(predicted).astype('int32')
  return df


## Modify columns

In [473]:
def change_dtypes(df):
  for column in df.columns:
      if column == 'PersonFare':
        df[column] = df[column].astype('float16')
      elif df[column].dtype == 'int64' or df[column].dtype == 'float64':
          df[column] = df[column].astype('int16')
      elif df[column].dtype == 'object':
        df[column] = df[column].astype('category')
  return df


def get_title(df):
  df['Title'] = df['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
  df['Title'] = df['Title'].replace(['Dr', 'Col', 'Sir', 'Major', 'Master'], 'Mr')
  df['Title'] = df['Title'].replace(['Capt', 'Don', 'Jonkheer'], 'Rev')
  df['Title'] = df['Title'].replace(['Ms', 'Lady', 'Mlle', 'Countess', 'Mme', 'Dona'], 'Rev_fem')
  return df


def get_family(df):
  df['Family_size'] = df['Parch']+df['SibSp'] + 1
  return df


def get_deck(df):
  df['Cabin'].fillna('Z', inplace=True)
  df['Deck'] = df['Cabin'].str.extract(r'([A-Za-z]+)')
  return df


def drop_useless(df):
  df = df.drop(['PassengerId', 'Name', 'Cabin', 'Ticket'], axis=1)
  return df


def get_age_buckets(df):
  #labels are 'Child', 'Young Adult', 'Adult', 'Senior'
  df['AgeBucket'] = pd.cut(df['Age'], bins=[0, 18, 30, 50, 80], labels=[1,2,3,4])
  #print('empty buckets:', df[df['AgeBucket'].isna()])
  df['AgeBucket'] = df['AgeBucket'].astype('int16')
  return df


def get_fare_per_person(df):
  df['PersonFare'] = df['Fare'] / df['Family_size']
  return df


def last_y(df):
    survived = df['Survived']
    df = df.drop('Survived', axis=1)
    df['Survived'] = survived
    return df

def fill_na(df):
  df['Fare'].fillna(df['Fare'].mean(), inplace=True)
  df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)
  return df


def apply_all(data, y=False):
  df = data.copy()
  for func in [fill_na, get_title, get_family, get_fare_per_person, get_deck,
               drop_useless, predict_age, get_age_buckets, change_dtypes]:
    df = func(df)
  if y:
    df = last_y(df)
  return df

In [474]:
df = apply_all(train_data, y=True)
test_df = apply_all(test_data)

In [475]:
age_stats = df.groupby('AgeBucket')['Age'].describe()
age_stats

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
AgeBucket,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,161.0,10.838509,6.290768,0.0,4.0,12.0,17.0,18.0
2,341.0,24.651026,3.406513,19.0,22.0,25.0,27.0,30.0
3,319.0,38.231975,5.951279,30.0,33.0,37.0,42.0,50.0
4,70.0,58.457143,6.316956,51.0,54.0,57.0,62.0,80.0


In [476]:
data_info = pd.concat([test_df.dtypes, test_df.nunique()], axis=1)
data_info.columns = ['Datatype', 'Unique_num']
data_info

Unnamed: 0,Datatype,Unique_num
Pclass,int16,3
Sex,int16,2
Age,int16,64
SibSp,int16,7
Parch,int16,8
Fare,int16,77
Embarked,int16,3
Title,int16,5
Family_size,int16,9
PersonFare,float16,181


In [479]:
df.describe()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,Family_size,PersonFare,Deck,AgeBucket,Survived
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,2.308642,0.647587,29.673401,0.523008,0.381594,31.785634,1.536476,0.976431,1.904602,19.921875,6.716049,2.334456,0.383838
std,0.836071,0.47799,13.876338,1.102743,0.806057,49.70373,0.791503,0.667466,1.613459,inf,2.460739,0.861363,0.486592
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
25%,2.0,0.0,21.0,0.0,0.0,7.0,1.0,1.0,1.0,7.25,8.0,2.0,0.0
50%,3.0,1.0,28.0,0.0,0.0,14.0,2.0,1.0,1.0,8.296875,8.0,2.0,0.0
75%,3.0,1.0,37.0,1.0,0.0,31.0,2.0,1.0,2.0,23.671875,8.0,3.0,1.0
max,3.0,1.0,80.0,8.0,6.0,512.0,2.0,4.0,11.0,512.5,8.0,4.0,1.0


## IQR and outliers

# Get results

In [416]:
ids = test_data['PassengerId']