## Titanic - Machine Learning from Disaster

In [None]:
# Import data and visualization libraries

import pandas as pd
import numpy as np

import seaborn as sns 
import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
## Load our Train and Test Data
# Both CSV paths are relative to the notebook's working directory.

train = pd.read_csv('../data/train.csv')
test  = pd.read_csv('../data/test.csv')

In [None]:
print(train.shape, test.shape)

In [None]:
## Before starting with the EDA, establish a naive ("layman") baseline.
## The Kaggle evaluation is based on accuracy score, so a constant prediction for
## every passenger in the test set gives a floor that any real model has to beat.

## Assumption 0: all dead :(
layman0_submission = test[["PassengerId"]].assign(Survived=0)
layman0_submission.to_csv('../data/Laymam_all_dead.csv', index=False)

## Assumption 1: all survived :D
layman1_submission = test[["PassengerId"]].assign(Survived=1)
layman1_submission.to_csv('../data/Laymam_all_survived.csv', index=False)

+++ Results +++

Public Score Accuracy

- Layman approach all dead ----------- 0.62200
- Layman approach all survived ------- 0.37799

```

Layman Conclusion :
We can conclude that, in the scored test data, more passengers died than survived. Simply predicting that every passenger died (the layman all-dead hypothesis) already achieves an accuracy of approximately 62% on the test data, so any real model has to beat that baseline.
```

### Exploratory Data Analysis on training Data

In [None]:
## View sample of our training data
train.head(3)

In [None]:
## Get insights on training data
train.info()

In [None]:
## Get insights on test data
test.info()

In [None]:
## Check for Null values in training data 
train.isnull().sum()

In [None]:
## Check for Null values in testing data 
test.isnull().sum()

#### Plotting some Visualizations about the features on our training dataset -

In [None]:
## Function to plot bar charts
def bar_chart(feature, data=None):
    """
    Plot a stacked bar chart of survivor vs. non-survivor counts for one feature.

    feature: name of the column whose value counts are plotted
    data: optional DataFrame with a 'Survived' column (1 = survived, 0 = dead);
          defaults to the notebook-level `train` frame
    """
    if data is None:
        data = train
    survived = data[data['Survived']==1][feature].value_counts()
    dead = data[data['Survived']==0][feature].value_counts()
    # One row per outcome, one column per distinct value of the feature
    df = pd.DataFrame([survived,dead])
    df.index = ['Survived','Dead']
    ax = df.plot(kind='bar',stacked=True, figsize=(10,5))
    # Label the figure so consecutive charts for different features can be told apart
    ax.set_title(f"Passenger counts by {feature}")
    ax.set_xlabel("Outcome")
    ax.set_ylabel("Number of passengers")

In [None]:
bar_chart('Sex')
bar_chart('Pclass')
bar_chart('Embarked')

#### Feature engineering

In [None]:
# Keep train and test in one list so each feature-engineering step can be applied to both.
# The frames are not concatenated; each one is modified in place.
all_data = [train,test]

# Extract the title of each individual from their name: the first run of letters that
# follows a space and is immediately followed by a '.'.
# The pattern is a raw string so that '\.' is not treated as an (invalid) string escape.
for data in all_data :
    data['Status'] = data['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

train.Status.unique()

In [None]:
# Normalise the extracted titles with a single replace() pass per frame
title_fixes = {
    ## Titles of the VIP personnel on the ship are collapsed into one 'Rare' bucket
    **dict.fromkeys(['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr',
                     'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare'),
    ## Variant spellings are folded into the common title
    'Mlle': 'Miss',
    'Ms': 'Miss',
    'Mme': 'Mrs',
}

for dataset in all_data:
    dataset['Status'] = dataset['Status'].replace(title_fixes)

In [None]:
# Encode each title as an integer code; anything missing from the mapping becomes 0
status_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
for dataset in all_data:
    dataset['Status'] = dataset['Status'].map(status_mapping).fillna(0)

In [None]:
# FamilySize = Siblings (SibSp) + ParentChild (Parch) + 1 for the individual
for dataset in (train, test):
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1

In [None]:
## IsAlone flag: 1 when FamilySize == 1 (travelling without family), otherwise 0
for dataset in all_data:
    dataset['IsAlone'] = (dataset['FamilySize'] == 1).astype('int64')

In [None]:
## Adding Cabin features to both frames
# Numeric code for the first letter of the cabin string.
# NOTE(review): 'B' and 'C' share code 2 while every other letter has its own code -
# confirm this is intended.
cabin_codes = {'A':1,'B':2,'C':2,'D':3,'E':4,'F':5,'G':6,'T':7}

for dataset in (train, test):
    # First character of Cabin. A missing cabin becomes the string 'nan'; its 'n' is not
    # a key of cabin_codes, so it maps to NaN and ends up as category 0.
    dataset['Cabin_category'] = dataset['Cabin'].astype(str).str[0].map(cabin_codes).fillna(0)
    # Cabin grouping: 1 when a cabin is recorded, 0 when it is missing. notna() recognises
    # every missing-value representation, unlike an identity test against np.nan.
    dataset['HasCabin'] = dataset['Cabin'].notna().astype('int64')

#### Filling in missing data on Age, Fare and Embarked features -

In [None]:
## Impute Age, Fare and Embarked in both frames.
## Results are assigned back to the column: calling fillna(..., inplace=True) on a column
## selection is chained assignment and does not update the frame under pandas Copy-on-Write.
for dataset in (train, test):
    ## Age: median Age of the rows sharing the same Status (title code), computed per frame
    status_median_age = dataset.groupby("Status")["Age"].transform("median")
    dataset["Age"] = dataset["Age"].fillna(status_median_age)

    ## Fare: every missing value gets the median Fare of the group Pclass=3, Parch=0, SibSp=0.
    ## NOTE(review): the group is fixed rather than looked up per row - presumably it matches
    ## the profile of the rows with a missing Fare; confirm.
    group_median_fare = dataset.groupby(['Pclass', 'Parch', 'SibSp'])['Fare'].median()
    dataset['Fare'] = dataset['Fare'].fillna(group_median_fare.loc[(3, 0, 0)])

    ## Embarked: replace missing values with 'S', on the assumption that most people
    ## boarded from Southampton
    dataset['Embarked'] = dataset['Embarked'].fillna('S')

In [None]:
## Check for Null values in training data 
train.isnull().sum()

In [None]:
## Check for Null values in testing data 
test.isnull().sum()

In [None]:
## Final insights on training Dataset
train.info()

#### Training our model and making predictions

In [None]:
# Models from scikit-learn
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

# Model Evaluations
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report , accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Target vector
y_full = train["Survived"]

features = ["Pclass","Sex", "Age","Fare","Embarked","IsAlone", "FamilySize", "Status","Cabin_category","HasCabin"]

# One-hot encode the categorical (non-numeric) feature columns of the training data
X_full = pd.get_dummies(train[features])

# Encode the test data the same way, then force the exact column set and order of the
# training matrix. get_dummies only creates columns for categories present in the frame it
# is given, so without this a category missing from one frame would misalign the inputs.
X_test_full = pd.get_dummies(test[features]).reindex(columns=X_full.columns, fill_value=0)

In [None]:
## Splitting our dataset
# 70/30 hold-out split of the labelled training data with a fixed random_state.
# Note: X_test / y_test are a validation subset of train.csv, not the Kaggle test set
# (the encoded Kaggle test features are in X_test_full).
X_train, X_test, y_train, y_test = train_test_split(X_full, y_full, train_size=0.7, test_size=0.3,random_state=0)

In [None]:
# Dictionary of candidate classifiers keyed by display name,
# each constructed with its default hyperparameters

models = {
    "Logistic Regression": LogisticRegression(),
    "Naive Bayes": GaussianNB(),
    "Stochastic Gradient": SGDClassifier(),
    "KNeighbors Classifier": KNeighborsClassifier(),
    "DecisionTree Classifier": DecisionTreeClassifier(),
    "RandomForest Classifier": RandomForestClassifier(),
    "Support Vector Machine": SVC(),
}



# Create a function to fit and score models
def fit_and_score(models, X_train,X_test,y_train,y_test):
    """
    Fit every model on the training split and score it on the test split.

    models: dict mapping a display name to an estimator exposing fit() and score()
    X_train: training data (no labels)
    X_test: testing data (no labels)
    y_train: training labels
    y_test: test labels

    Returns a dict mapping each model name to the value of model.score(X_test, y_test).
    The estimators in `models` are fitted in place.
    """
    # Seed NumPy's global generator before any model is fitted
    np.random.seed(42)

    def _evaluate(estimator):
        # Fit first, then score on the held-out data
        estimator.fit(X_train, y_train)
        return estimator.score(X_test, y_test)

    return {label: _evaluate(estimator) for label, estimator in models.items()}

In [None]:
## Fit every candidate classifier on the training split and score it on the hold-out split

model_scores = fit_and_score(models, X_train, X_test, y_train, y_test)
model_scores

In [None]:
model_compare = pd.DataFrame(model_scores, index=["accuracy"])
model_compare.T.plot.bar();

#### Hyperparameter tuning for the Random Forest model, as it had the best accuracy score among the classifiers

In [None]:
# Create a hyperparameter grid for RandomForestClassifier
rf_grid = {"n_estimators": np.arange(10, 1000, 50),
           "max_depth": np.arange(1, 50, 2),
           "min_samples_split": np.arange(2, 50, 2),
           "min_samples_leaf": np.arange(1, 20, 2)}

In [None]:
# Tune RandomForestClassifier
# Setup random seed
np.random.seed(3)

# Setup random hyperparameter search for RandomForestClassifier.
# random_state is passed explicitly to both the search (candidate sampling) and the
# estimator (forest construction): with n_jobs=-1 the fits run in parallel workers, so the
# global NumPy seed alone is not a reliable way to make the run repeatable.
rs_rf = RandomizedSearchCV(RandomForestClassifier(random_state=0),
                           param_distributions=rf_grid,
                           cv=5,
                           n_iter=20,
                           verbose=True, n_jobs=-1,
                           random_state=3)

# Fit random hyperparameter search model for RandomForestClassifier()
rs_rf.fit(X_train, y_train)

In [None]:
# Find the best hyperparameters
rs_rf.best_params_

In [None]:
# Evaluate the randomized search RandomForestClassifier model
rs_rf.score(X_test, y_test)

#### Hyperparameter Tuning with GridSearchCV

In [None]:
# Narrower hyperparameter grid for our RandomForestClassifier model.
# 6 * 5 * 5 * 4 = 600 candidate combinations, each evaluated with 5-fold CV.
# Note: this rebinds rf_grid, replacing the grid used for the randomized search.
rf_grid = {"n_estimators": np.arange(100, 400, 50),
           "max_depth": np.arange(10,20, 2),
           "min_samples_split": np.arange(20,30, 2),
           "min_samples_leaf": np.arange(2, 10, 2),
          "random_state": [0] }

# Setup grid hyperparameter search for RandomForestClassifier
gs_rf = GridSearchCV(RandomForestClassifier(),
                          param_grid=rf_grid,
                          cv=5,
                          verbose=True,n_jobs=-1)

# Fit grid hyperparameter search model
gs_rf.fit(X_train, y_train);

In [None]:
# Check the best hyperparmaters
gs_rf.best_params_

In [None]:
# Evaluate the GridSearchCV search RandomForestClassifier model
gs_rf.score(X_test, y_test)

In [None]:
# Make predictions with the tuned model on the hold-out split.
# train_test_split produced X_test / y_test; no X_valid is defined in this notebook.
y_preds = gs_rf.predict(X_test)

In [None]:
# Import Seaborn
import seaborn as sns

# Increase font size
sns.set(font_scale=1.5) 
def plot_conf_mat(y_test, y_preds):
    """
    Plots a confusion matrix using Seaborn's heatmap().

    y_test: true labels
    y_preds: predicted labels, in the same order as y_test
    """
    fig, ax = plt.subplots(figsize=(3, 3))
    ax = sns.heatmap(confusion_matrix(y_test, y_preds),
                     annot=True, # Annotate the boxes
                     fmt="d",    # whole-number counts; the default ".2g" shows 150 as 1.5e+02
                     cbar=False,
                     ax=ax)
    ax.set_xlabel("Predicted label") # predictions go on the x-axis
    ax.set_ylabel("True label") # true labels go on the y-axis
    
plot_conf_mat(y_test, y_preds)  # y_test is the hold-out label vector from train_test_split

In [None]:
# Per-class precision / recall / F1 on the hold-out split (y_test from train_test_split)
print(classification_report(y_test, y_preds))

In [None]:
# Create a new classifier with the best parameters.
# NOTE(review): these values are hard-coded, presumably copied from an earlier
# gs_rf.best_params_ output - re-check them whenever the grid search is re-run.
# random_state=0 matches the value used in the grid search, so the cross-validated
# metrics and the final predictions are repeatable.
clf = RandomForestClassifier(max_depth=18,
                             min_samples_leaf=2,
                             min_samples_split=26,
                             n_estimators=100,
                             random_state=0)

In [None]:
# Cross-validated accuracy: one score per fold (5 folds) on the full training data
cv_acc = cross_val_score(clf, X_full, y_full, cv=5, scoring="accuracy")
cv_acc


In [None]:
cv_acc = np.mean(cv_acc)
cv_acc

In [None]:
# Cross-validated precision, averaged over the 5 folds
cv_precision = cross_val_score(clf, X_full, y_full, cv=5, scoring="precision").mean()
cv_precision

In [None]:
# Cross-validated recall, averaged over the 5 folds
cv_recall = cross_val_score(clf, X_full, y_full, cv=5, scoring="recall").mean()
cv_recall

In [None]:
# Cross-validated f1-score, averaged over the 5 folds
cv_f1 = cross_val_score(clf, X_full, y_full, cv=5, scoring="f1").mean()
cv_f1

In [None]:
# Visualize cross-validated metrics
cv_metrics = pd.DataFrame({"Accuracy": cv_acc,
                           "Precision": cv_precision,
                           "Recall": cv_recall,
                           "F1": cv_f1},
                          index=[0])

cv_metrics.T.plot.bar(title="Cross-validated classification metrics",
                      legend=False);

In [None]:
clf.fit(X_train, y_train)

#### Final RF model with Accuracy

In [None]:
# clf = RandomForestClassifier(n_estimators = 500, max_depth = 10, min_samples_split = 6 )

# Fit on the training split and measure accuracy on the hold-out split.
# The split cell defines X_test / y_test; X_valid / y_valid are never defined.
clf.fit(X_train, y_train)

rf_val_predictions = clf.predict(X_test)

# accuracy_score takes the true labels first, then the predictions
rf_accuracy = accuracy_score(y_test, rf_val_predictions)

rf_accuracy

#### Predicting and Exporting our final submission

In [None]:
# Refit the classifier on all labelled training rows, then predict the Kaggle test features
clf.fit(X_full, y_full)
predictions = clf.predict(X_test_full)

In [None]:
output = pd.DataFrame({'PassengerId': test.PassengerId, 'Survived': predictions})
output.to_csv('../data/Tuned_RF_Model.csv', index=False)