# Titanic - Machine Learning Project (UT)
---

In [None]:
import os
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

---

## Plan

1) **Input Data** - importing the train and test data sets.

2) **Data Exploration** - exploring the train data set.

3) **Feature Engineering** - for merging and removing columns

4) **Data Processing** - preparing data for model fitting

---

### Input Data

In [None]:
# List the files available in the local 'inputs/' directory.
os.listdir('inputs/')

In [None]:
# Load the train and test CSV files into DataFrames.
train_df = pd.read_csv("./inputs/train.csv")
test_df = pd.read_csv("./inputs/test.csv")

In [None]:
# Keep references to both DataFrames in one list.
# NOTE(review): data_list is not used again in the visible cells - confirm it is needed.
data_list = [train_df, test_df]

In [None]:
# Preview the first rows of the train data.
train_df.head()

In [None]:
# Preview the first rows of the test data.
test_df.head()

In [None]:
# Column dtypes and non-null counts of the test data.
test_df.info()

---

### Data Exploration

In [None]:
# Report the dimensions (rows x columns) of both data sets.
train_rows, train_cols = train_df.shape
test_rows, test_cols = test_df.shape
print(f"Train data is {train_rows} rows, with {train_cols} columns")
print(f"Test data is {test_rows} rows, with {test_cols} columns")

In [None]:
# Column dtypes and non-null counts of the train data.
train_df.info()

In [None]:
# Column dtypes and non-null counts of the test data.
test_df.info()

In [None]:
# Summary statistics of the train data.
train_df.describe()

In [None]:
# Sub-frames of the train data: columns treated as numeric vs. categorical
# for the exploration below.
numeric_columns = train_df[['Age','SibSp','Parch','Fare']]
categorical_columns = train_df[['Survived','Pclass','Sex','Ticket','Cabin','Embarked']]

In [None]:
# Draw one histogram per numeric column, titled with the column name.
for column_name, column_values in numeric_columns.items():
    plt.hist(column_values)
    plt.title(column_name)
    plt.show()

In [None]:
# Aggregate the numeric columns per 'Survived' value (pivot_table's default aggregation).
train_df.pivot_table(
    index='Survived',
    values=['Age', 'SibSp', 'Parch', 'Fare'],
)

In [None]:
# Passenger counts per 'Survived' value and passenger class.
train_df.pivot_table(
    index='Survived',
    columns='Pclass',
    values='PassengerId',
    aggfunc='count',
)


In [None]:
# Passenger counts per 'Survived' value and sex.
train_df.pivot_table(
    index='Survived',
    columns='Sex',
    values='PassengerId',
    aggfunc='count',
)


In [None]:
# Passenger counts per 'Survived' value and port of embarkation.
train_df.pivot_table(
    index='Survived',
    columns='Embarked',
    values='PassengerId',
    aggfunc='count',
)


In [None]:
# Mean of 'Survived' per passenger class, highest first.
class_survival = train_df[['Pclass', 'Survived']].groupby(['Pclass'], as_index=False).mean()
class_survival.sort_values(by='Survived', ascending=False)

In [None]:
# Correlation heatmap of the numeric columns only. train_df also holds
# string columns (e.g. 'Sex', 'Embarked'); since pandas 2.0 DataFrame.corr()
# no longer drops them silently and raises instead, so select the numeric
# columns explicitly (works on older pandas versions as well).
sns.heatmap(train_df.select_dtypes(include='number').corr())

In [None]:
# Share of passengers per embarkation port, as a percentage of all rows
# (rows with a missing port stay in the denominator).
total_passengers = len(train_df)

# dropna() replaces slicing unique()[0:3]: the slice only worked while there
# were exactly three ports and the missing value happened to appear last.
for place in train_df['Embarked'].dropna().unique():
    place_count = len(train_df[train_df.Embarked == place])
    print(f"Number of people from {place} are {place_count*100/total_passengers}%")

### Feature Engineering
---

In [None]:
# Combine the parent/children (Parch) and sibling/spouse (SibSp) counts
# into a single FamilyOnBoard column, for both data sets.
train_df['FamilyOnBoard'] = train_df['SibSp'] + train_df['Parch']
test_df['FamilyOnBoard'] = test_df['SibSp'] + test_df['Parch']

In [None]:
# Scatter plot of FamilyOnBoard (x axis) against Survived (y axis).
plot = train_df.plot.scatter(x='FamilyOnBoard', y='Survived')

In [None]:
# Re-check the columns after adding FamilyOnBoard.
train_df.info()

In [None]:
def _first_char(value):
    """Return the first character of the string form of *value*."""
    return str(value)[0]


# cabin_adv holds the first character of each Cabin value; a missing cabin
# (NaN) is stringified first and therefore becomes 'n'.
train_df['cabin_adv'] = train_df['Cabin'].apply(_first_char)
test_df['cabin_adv'] = test_df['Cabin'].apply(_first_char)

In [None]:
# Passenger counts per 'Survived' value and cabin letter.
train_df.pivot_table(
    index='Survived',
    columns='cabin_adv',
    values='Name',
    aggfunc='count',
)

### Data Processing

---


In [None]:
# Parch and SibSp are now represented by FamilyOnBoard, so remove both
# columns from the train and test data.
merged_columns = ['Parch', 'SibSp']
train_df = train_df.drop(columns=merged_columns)
test_df = test_df.drop(columns=merged_columns)

In [None]:
def fill_empty_Median(dataFrame, column, groupColumns ):
    """Fill missing values of *column* with the median of the row's group.

    Groups are formed by *groupColumns*. A value whose group has no median
    (every value in the group is missing) stays missing.

    The column is overwritten in *dataFrame* itself, and the updated column
    is also returned.
    """
    group_medians = dataFrame.groupby(groupColumns)[column].transform('median')
    filled = dataFrame[column].fillna(group_medians)
    dataFrame[column] = filled
    return dataFrame[column]

In [None]:
# Distribution of Age before the missing values are filled.
# NOTE(review): `palette` is passed without a `hue` variable, so it probably
# has no visible effect - confirm against the seaborn histplot docs.
fig = plt.figure(figsize = (12,9))
sns.histplot(train_df["Age"], kde=True, palette='BuPu_r')
plt.title('Age hist Before filling')
plt.show()

In [None]:
# Fill missing ages in two passes, separately for each data set:
#   1. the median age of passengers sharing the same family size, sex,
#      class and fare;
#   2. the data set's overall median age for rows still missing afterwards.
age_group_columns = ['FamilyOnBoard', 'Sex', 'Pclass', 'Fare']
for frame in (train_df, test_df):
    frame['Age'] = fill_empty_Median(frame, "Age", age_group_columns)
    frame['Age'] = frame['Age'].fillna(frame['Age'].median())

In [None]:
# Check the non-null counts after filling Age.
train_df.info()

In [None]:
# Distribution of Age after the missing values were filled.
# NOTE(review): `palette` is passed without a `hue` variable, so it probably
# has no visible effect - confirm against the seaborn histplot docs.
fig = plt.figure(figsize = (12,9))
sns.histplot(train_df["Age"], kde=True, palette='BuPu_r')
plt.title('Age hist After filling')
plt.show()

In [None]:
# Remove the raw Cabin column from both data sets.
train_df = train_df.drop(columns='Cabin')
test_df = test_df.drop(columns='Cabin')

In [None]:
# Check the train columns after dropping Cabin.
train_df.info()

In [None]:
# Check the test columns after dropping Cabin.
test_df.info()

In [None]:
# Fill missing test fares with the median fare of the train data.
train_fare_median = train_df['Fare'].median()
test_df['Fare'] = test_df['Fare'].fillna(train_fare_median)

### Building Models

---

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [None]:
# Model inputs plus the 'Survived' target, which is split off into y / y_val below.
features = ['Pclass', "Sex", 'Age', "Fare", "Embarked", "FamilyOnBoard", 'cabin_adv', 'Survived']

# One-hot encode the non-numeric columns among the selected features.
train_df = pd.get_dummies(train_df[features])

# Split the encoded train set into train and validation parts.
train_df, train_df_val = train_test_split(train_df, random_state = 111, test_size = 0.20)

y = train_df["Survived"]
x = train_df.drop(columns=['Survived'])

y_val = train_df_val["Survived"]
x_val = train_df_val.drop(columns=['Survived'])

# The test data is encoded with the same features minus the target.
features.remove("Survived")

# Train and test are encoded independently, so a category value that occurs
# in only one of them would give the two matrices different columns and break
# prediction. Align the test matrix to the training columns: dummy columns
# the test set lacks are added as all-zero, columns unknown to the model are
# dropped, and the column order matches x.
test_x = pd.get_dummies(test_df[features]).reindex(columns=x.columns, fill_value=0)

In [None]:
# Columns of the encoded training part after the split.
train_df.info()

In [None]:
# Columns of the encoded validation part after the split.
train_df_val.info()

In [None]:
# Logistic regression scored with 5-fold cross-validation on the training part.
lr = LogisticRegression(max_iter=2000)
cv = cross_val_score(lr, x, y, cv=5)

separator = '-' * 40
print(separator)
for fold, score in enumerate(cv):
    print(f"Accuracy #{fold}: {score} ")

print(separator)
print(f"Mean value: {cv.mean()}")
x.info()

In [None]:
# k-nearest-neighbours (default settings) scored with 5-fold cross-validation.
knn = KNeighborsClassifier()
cv = cross_val_score(knn, x, y, cv=5)

separator = '-' * 40
print(separator)
for fold, score in enumerate(cv):
    print(f"Accuracy #{fold}: {score} ")

print(separator)
print(f"Mean value: {cv.mean()}")

In [None]:
# Random forest (100 trees, depth limited to 3) scored with 5-fold cross-validation.
rfc = RandomForestClassifier(n_estimators=100, max_depth=3, random_state=1)
cv = cross_val_score(rfc, x, y, cv=5)

separator = '-' * 40
print(separator)
for fold, score in enumerate(cv):
    print(f"Accuracy #{fold}: {score} ")

print(separator)
print(f"Mean value: {cv.mean()}")

In [None]:
# Support vector classifier (probability estimates enabled) scored with
# 5-fold cross-validation.
svc = SVC(probability=True)
cv = cross_val_score(svc, x, y, cv=5)

separator = '-' * 40
print(separator)
for fold, score in enumerate(cv):
    print(f"Accuracy #{fold}: {score} ")

print(separator)
print(f"Mean value: {cv.mean()}")

In [None]:
from xgboost import XGBClassifier

# Gradient-boosted trees (fixed seed) scored with 5-fold cross-validation.
xgb = XGBClassifier(random_state=1)
cv = cross_val_score(xgb, x, y, cv=5)

separator = '-' * 40
print(separator)
for fold, score in enumerate(cv):
    print(f"Accuracy #{fold}: {score} ")

print(separator)
print(f"Mean value: {cv.mean()}")

### Custom Neural Network

---


In [None]:
from keras.models import Sequential
from keras.layers import Dense

# Fully connected binary classifier: two ReLU layers, each as wide as the
# feature matrix, followed by a single sigmoid output unit.
n_features = x.columns.size
model = Sequential([
    Dense(n_features, activation='relu', input_shape=(n_features,)),
    Dense(n_features, activation='relu'),
    Dense(1, activation='sigmoid'),
])

# Binary cross-entropy loss, Adam optimizer, accuracy reported as metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# NOTE(review): presumably set so scikit-learn style tooling treats the Keras
# model as a classifier - confirm where this attribute is read.
model._estimator_type = "classifier"

In [None]:
# Train the network on the training part.
# x mixes numeric columns with one-hot columns, which pandas >= 2.0 creates
# as bool; such a frame converts to an object array that Keras cannot turn
# into a tensor. Casting to float32 gives a uniform numeric input and does
# not change the values.
model.fit(x.astype("float32"), y, epochs=50, batch_size=4, verbose=1)

In [None]:
# Inputs are cast to float32 because a frame mixing bool one-hot columns with
# numeric columns converts to an object array that Keras cannot use.

# Loss/accuracy on the data the model was trained on (an optimistic figure).
print(model.evaluate(x.astype("float32"), y))

# Loss/accuracy on the held-out validation split, which the model has not
# seen during training and therefore gives the more meaningful estimate.
print(model.evaluate(x_val.astype("float32"), y_val))