In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

# Import modules for future uses
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# Read files (in kaggle way)
import os
# Lazily walk the Kaggle input tree and print the full path of every file found,
# in the same order os.walk yields them.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the training split (train.csv) into a pandas DataFrame.
train_data = pd.read_csv(filepath_or_buffer="/kaggle/input/titanic/train.csv")
# Preview the first five rows (the default of head()) to see the columns and sample values.
train_data.head(n=5)
# print(train_data.shape) # 891 rows

In [None]:
# Load the testing split (test.csv) into a pandas DataFrame.
test_data = pd.read_csv(filepath_or_buffer="/kaggle/input/titanic/test.csv")
# Preview the first five rows (the default of head()) to see the columns and sample values.
test_data.head(n=5)
# print(test_data.shape) # 418 rows

In [None]:
# First look at the training data: summary statistics for every column.

# df.describe() on its own summarises the NUMERICAL columns only;
# include='all' adds the CATEGORICAL (object) columns to the same table.
train_data.describe(include='all')
# NOTE: NaN cells in this summary table mark statistics that do not apply to a column's
# type (e.g. the mean of Name or Ticket); they are not missing data.
# The isnull() counts in the next cell report real gaps only in Age, Cabin and Embarked.

In [None]:
# Audit of missing and duplicated data in both splits.

# Per-column count of null values (isna is the same check as isnull).
train_null_counts = train_data.isna().sum()
print(f'\nNull values in training data: \n{train_null_counts}')
# Found Null values in Age(177), Cabin (687) and Embarked (2)
test_null_counts = test_data.isna().sum()
print(f'\nNull values in testing data:  \n{test_null_counts}')
# Found Null values in Age(86), Fare(1), Cabin (327)

# Number of rows that are exact duplicates of an earlier row.
train_duplicate_rows = train_data.duplicated().sum()
test_duplicate_rows = test_data.duplicated().sum()
print(f'\nDuplicated Values in training data {train_duplicate_rows}')
print(f'Duplicated values in testing data {test_duplicate_rows}')
# Found no duplicated rows in either split

In [None]:
# Filling Data to make it complete
#
# Every step writes its result back into train_data / test_data and only relies on
# the raw columns, so the cell works when the notebook is run top to bottom on a
# fresh kernel.


def _mean_age_for_title(frame, title):
    """Return the rounded mean Age of the rows of `frame` whose Name contains `title`.

    `title` is matched as a literal substring (regex=False). With regex matching,
    the trailing '.' is a wildcard: 'Mr.' would also match 'Mrs.', and 'Dr.' would
    match any name containing 'Dr' followed by another character.
    Returns NaN when no matching row has a known Age.
    """
    has_title = frame["Name"].str.contains(title, regex=False, na=False)
    # np.round also accepts a plain float NaN, which has no .round() method.
    return np.round(frame.loc[has_title, "Age"].mean())


# 1. Embarked (training data): fill the gaps with the most common port of embarkation.
# value_counts() counts each port; idxmax() returns the port with the highest count.
mode = train_data["Embarked"].value_counts().idxmax()

# Assign the filled column back instead of calling fillna(..., inplace=True) on
# train_data["Embarked"]: that form is chained assignment on a column selection,
# which is deprecated in pandas 2.x and no longer updates the frame under copy-on-write.
train_data["Embarked"] = train_data["Embarked"].fillna(mode)

# check whether null values exist (for testing)
# print(train_data["Embarked"].isnull().sum()==0)

# 2. Fare (testing data): fill the gap with the average of the known fares.
average = test_data["Fare"].mean()
test_data["Fare"] = test_data["Fare"].fillna(average)

# checking in case
# print(test_data["Fare"].isnull().sum()==0)

# 3. Age (training data): fill a missing age with the mean age of passengers who
# carry the same title in their name.
# Only these five titles are handled; per the original author's note, the remaining
# titles (Rev., Mme., Major., ...) have no missing ages.
title_list = ['Mrs.', 'Miss.', 'Mr.', 'Master.', 'Dr.']

# 3.1 All means are computed before any age is filled, so filled values never feed
# back into a mean.
mean_list = [_mean_age_for_title(train_data, title) for title in title_list]
# Individual names kept for anything that still refers to them.
mean_age_mrs, mean_age_miss, mean_age_mr, mean_age_master, mean_age_dr = mean_list

# print(train_data["Age"].isnull().sum()) # 177

# 3.2 Write the title's mean age into the rows that have the title and no age.
for title, mean_age in zip(title_list, mean_list):
    needs_age = (
        train_data["Name"].str.contains(title, regex=False, na=False)
        & train_data["Age"].isna()
    )
    train_data.loc[needs_age, "Age"] = mean_age

# print(train_data["Age"].isnull().sum()) # 0

# 4. Cabin (training data): guess the deck of an unknown cabin from the fare paid.

# The comparison below works on deck letters, with 'X' standing for "unknown", so
# reduce Cabin to that form here instead of relying on another cell having done it.
# .str[0] keeps NaN as NaN, fillna turns it into 'X'; already-reduced values are unchanged.
train_data["Cabin"] = train_data["Cabin"].str[0].fillna("X")

# Mean fare of the known cabins on each deck, from the most to the least expensive.
# Deck A is left out, as in the original (the author found no correlation for it).
deck_order = ["B", "C", "D", "E", "F", "G"]
mean_fare_by_deck = {
    deck: train_data.loc[train_data["Cabin"] == deck, "Fare"].mean()
    for deck in deck_order
}
# Individual names kept for anything that still refers to them.
(mean_fare_cabin_b, mean_fare_cabin_c, mean_fare_cabin_d,
 mean_fare_cabin_e, mean_fare_cabin_f, mean_fare_cabin_g) = (
    mean_fare_by_deck[deck] for deck in deck_order
)

# Assign each unknown cabin to the deck whose mean fare is the highest one not above
# the passenger's fare: fare >= mean(B) -> B, mean(C) <= fare < mean(B) -> C, and so on.
# Unknown cabins with a fare below mean(G) stay 'X'.
unknown_cabin = train_data["Cabin"] == "X"
upper_bound = np.inf
for deck in deck_order:
    lower_bound = mean_fare_by_deck[deck]
    if np.isnan(lower_bound):
        # No known cabin on this deck: skip it and let the next deck's band reach
        # up to the previous bound.
        continue
    in_band = (
        unknown_cabin
        & (train_data["Fare"] >= lower_bound)
        & (train_data["Fare"] < upper_bound)
    )
    train_data.loc[in_band, "Cabin"] = deck
    upper_bound = lower_bound

# Check: should print an empty data frame, i.e. every unknown cabin with a fare of at
# least the deck G mean has been given a deck
# print(train_data[(train_data["Cabin"]=='X') & (train_data["Fare"]>=mean_fare_cabin_g)])



In [None]:
# How to Fill Cabin (supplementry): 

import seaborn as sns
import matplotlib.pyplot as plt

# a. Reduce every Cabin value to its deck letter (its first character) and label the
#    missing cabins 'X', so the unknown cabins form a group of their own.
#    .str[0] leaves NaN as NaN, which fillna then turns into 'X'. Transforming the column
#    itself (instead of building a new pd.Series) keeps every value on its own row
#    whatever the frame's index is, and re-running the cell changes nothing.

# Training and testing data
train_data["Cabin"] = train_data["Cabin"].str[0].fillna('X')
test_data["Cabin"] = test_data["Cabin"].str[0].fillna('X')

# b. Box plot of Fare per deck, to see where the unknown group (X) sits compared with
#    the known decks.
plt.figure(figsize=(12,5))
plt.title('Box Plot of Fare by Cabin Deck')
sns.boxplot(x='Cabin',y='Fare',data=train_data, palette='Set2')
plt.tight_layout()

# Result shows most from the unknown class (X) has a low fare
        
        

In [None]:
# Completeness check: count the nulls left in each column of the training data.

train_data.isna().sum()
# Expected: 0 for every column

In [None]:
# Feature Engineering: 
# * analyze features and extract impactful info from it
# * Even create new features from existing one

# 1. Analyze through graphs (Visualization) through sns and plt
import matplotlib.pyplot as plt
import seaborn as sns


# One figure with a single row of three axes: plt.subplots(rows, columns, figsize=(x, y))
fig, axx = plt.subplots(nrows=1, ncols=3, figsize=(20, 5))

# Column to count on each axis, paired with the title of that panel.
count_panels = [
    ("SibSp", "Number of Sibilings/ Spouses"),
    ("Parch", "Number of Parents/ Child"),
    ("Pclass", "Distribution of Classes"),
]

# sns.countplot draws one bar per distinct value of the column; .set(title=...) labels the panel.
for panel_ax, (column, panel_title) in zip(axx, count_panels):
    sns.countplot(data=train_data, x=column, ax=panel_ax).set(title=panel_title)

# Adjust the padding between the subplots so the labels do not overlap.
plt.tight_layout()

# Insights learned from the plots:
1. Most people are in 3rd class
2. Most people don't have parents/children 
3. Most people don't have siblings/spouses

# Conclusions made
1. can create a feature to determine whether the passenger is alone or not (with family)

In [None]:
# Create the Alone and FamilySize features for both the training and the testing data.
#
# Alone:      1 when the passenger has no siblings/spouses and no parents/children
#             aboard (SibSp + Parch == 0), otherwise 0.
# FamilySize: the passenger plus every relative aboard (1 + SibSp + Parch).
#
# np.where(condition, value if true, value if false) builds the flag in one vectorised
# step. The alternative is a small function applied row by row with
# df[['SibSp', 'Parch']].apply(..., axis=1); the original author reports checking with
# df.equals that both give the same column.
for passengers in (train_data, test_data):
    relatives_aboard = passengers['SibSp'] + passengers['Parch']
    passengers['Alone'] = np.where(relatives_aboard == 0, 1, 0)
    # New column from plain vector arithmetic.
    passengers['FamilySize'] = 1 + relatives_aboard

In [None]:
# More Visualization for feature engineering

# 2x3 grid of count plots relating survival to sex, class and family situation.
fig, axx = plt.subplots(2, 3, figsize=(20,10))

# One entry per panel: the axis to draw on, the countplot arguments and the panel title.
# The FamilySize panel has its own title; it used to repeat the Alone panel's
# 'Accompanied survivors'.
survival_panels = [
    (axx[0,0], dict(x='Survived'), 'Survivors'),
    (axx[0,1], dict(x='Survived', hue='Sex'), 'Survivors by Sex'),
    (axx[0,2], dict(x='Survived', hue='Pclass'), 'Survivors by Pclass'),
    (axx[1,0], dict(x='Survived', hue='Alone'), 'Accompanied survivors'),
    (axx[1,1], dict(x='FamilySize', hue='Survived'), 'Survivors by FamilySize'),
    (axx[1,2], dict(x='Pclass', hue='Alone'), 'Alone members by Pclass'),
]
for panel_ax, plot_kwargs, panel_title in survival_panels:
    sns.countplot(data=train_data, ax=panel_ax, **plot_kwargs).set(title=panel_title)

# Adjust the padding between the subplots so the labels do not overlap.
plt.tight_layout()

# Insights from the plots above:

## First plot:
1. More deceased than survived (50-60%)

## Second plot:
1. Male passengers more likely to die than survive
2. Female passengers more likely to survive than die

## Third plot:
1. Third passenger class way more likely to die than first and second

## Fourth plot:
1. Passengers who are alone are more likely to die

## Fifth plot:
1. Likelihood of dying is much higher when alone (FamilySize of 1)
2. Likelihood of surviving is much higher when not alone

## Sixth plot
1. passengers in the third class are more likely to be alone

In [None]:
# df.loc[row mask, column] selects rows and a column in a single step.
# Survived flags (0/1) of the female passengers in the training data.
women = train_data.loc[train_data["Sex"] == 'female', "Survived"]
# Survival rate: the sum of the 0/1 flags is the number of survivors,
# divided by the number of women.
rate_women = sum(women) / women.size

# Print the survival rate of women (a fraction between 0 and 1)
print("% of women who survived:", rate_women)

In [None]:
# df.loc[row mask, column] selects rows and a column in a single step.
# Survived flags (0/1) of the male passengers in the training data.
men = train_data.loc[train_data["Sex"] == 'male', "Survived"]
# Survival rate: the sum of the 0/1 flags is the number of survivors,
# divided by the number of men.
rate_men = sum(men) / men.size

# Print the survival rate of men (a fraction between 0 and 1)
print("% of men who survived:", rate_men)

# Random Forest Classifier Parameters:

https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

Random Forest parameters (learned from sklearn):

n_estimators: number of trees in a forest

criterion: Function to measure split {“gini”, “entropy”, “log_loss”}

max_depth: tree max depth

min_samples_split: min # of samples to split, usually 2

min_samples_leaf: min # of samples to be a leaf, usually 1

min_weight_fraction_leaf: min weight fraction of the sum of all weights to be a leaf node, usually 0.0

max_features: # of features to consider when looking for the best split; one of {“sqrt”, “log2”, None}, an int or a float, default=“sqrt”

max_leaf_nodes: usually int or default None aka unlimited

min_impurity_decrease: a node is split if the split induces a decrease of the impurity greater than or equal to this value

bootstrap: whether bootstrap samples are used when building trees (Boolean), default True

oob_score: whether to use out-of-bag (oob) samples to estimate the generalization score (Boolean, default=False)

n_jobs: # of jobs run in parallel

random_state: controls the randomness of the estimator: the bootstrapping of the samples (if bootstrap=True) and the sampling of the features considered at each split (when max_features < n_features); int, RandomState instance or None, default=None

verbose: controls the verbosity when fitting and predicting

warm_start: when set to True, reuse the solution of the previous call to fit and add more estimators to the ensemble, ELSE fit a whole new forest

class_weight: weighting of classes, usually in dict, list of dict or “balanced”, default=None

ccp_alpha: complexity parameter used for cost-complexity pruning (CCP), default=0.0

max_samples: if bootstrap set True, the # of samples to draw from X to train each base estimator

In [None]:
# # Implementing Random Forest Model
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import accuracy_score
# # assign y to the survived passengers in training data
# y = train_data["Survived"]

# # Build trees based on the features selected
# # features = ["Pclass", "Sex", "SibSp", "Parch", "Fare"]
# features = ["Sex", "Fare"]
# # Assign X/X_test to the numerical variables from the training/testing data
# # Use pd.get_dummies to convert categorical variable to a numerical one
# X = pd.get_dummies(train_data[features])

# X_test = pd.get_dummies(test_data[features])
# X_test['Fare'] = X_test['Fare'].fillna(0)


# # Implement random forest classifier with argument
# # https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
# # Original model: 0.785
# # model = RandomForestClassifier(n_estimators=1000, max_depth=10, random_state=1) 

# # Hyperparameter Tuning: 
# # Optimized Model 1: gini criterion, 1000 DEPTH, rest untouched: ~0.76-0.77
# # Optimized Model 2: gini criterion, 10000 DEPTH, rest untouched: ~0.77
# # Optimized Model 3: gini criterion, 10 DEPTH, n_estimators=10000, rest untouched: ~0.78
# # Optimized Model 4: gini criterion, 1000 DEPTH, n_estimators=10000, rest untouched: ~0.78
# # Optimized Model 5: gini criterion, n_estimators=1000000, rest untouched: ~0.78
# # Optimized Model 6: gini criterion, n_estimators=1000, rest untouched: ~0.76, class_weight adjusted in which survivor is weighted heavier
# # Optimized Model 7: gini criterion, n_estimators=1000, rest untouched: ~0.76, class_weight adjusted in balance
# # Optimized Model 8: features become only Sex and Fare, rest untouched: ~0.76


# # dict_weights = {1:2, 0: 1, 2:1, 3:1, 4:1}
# # dict_weights = {"Pclass":1, "Sex": 1, "SibSp": 1, "Parch": 1, "Fare": 1}

# optimized_model = RandomForestClassifier(n_estimators=10000, criterion= "gini", max_features=None, bootstrap=True)

# # TRY to fit the model
# optimized_model.fit(X, y)
# # use model.predict to save the model's predictions
# predictions = optimized_model.predict(X_test)

# # Assign output (418x2) to a dataframe
# output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
# # Assign output to a csv file
# output.to_csv('submission.csv', index=False)
# print("Your submission was successfully saved!")

# Decision Tree parameter (learned from sklearn):

criterion: Function to measure split {“gini”, “entropy”, “log_loss”}

splitter: strat to split each node, best for best split; random for best random split {“best”, “random”}

max_depth: tree max depth, usually None

min_samples_split: min # of samples required to split an internal node, usually 2

min_samples_leaf: min # of samples to be a leaf, usually 1

min_weight_fraction_leaf: min weight fraction of the sum of all weights to be a leaf node

max_features: # of features to consider when looking for the best split; an int, a float, “sqrt”, “log2” or None, default=None

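random_state: controls the randomness of the estimator; int, RandomState instance or None, default=None. The features are randomly permuted at each split, and when max_features < n_features the algorithm selects max_features of them at random before finding the best split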

max_leaf_nodes: usually int or default None aka unlimited

min_impurity_decrease: a node is split if the split induces a decrease of the impurity greater than or equal to this value

class_weight: weighting of classes, usually in dict, list of dict or “balanced”, default=None

ccp_alpha: complexity parameter used for cost-complexity pruning (CCP), default=0.0