# Titanic Data Analysis
<h3>
    Josh Comstock
    10/25/2020
</h3>

In [6]:
import pandas as pd
import numpy as np
from math import floor
from sklearn import svm

In [7]:
# Load the labelled training file and the unlabelled test file.
training_df = pd.read_csv('train.csv')
testing_df = pd.read_csv('test.csv')

# Hold out the last 15% of the labelled rows so the model can be scored locally.
split_at = floor(len(training_df) * .85)
train_df = training_df.iloc[:split_at].copy()
test_df = training_df.iloc[split_at:].copy()

FileNotFoundError: [Errno 2] No such file or directory: 'train.csv'

In [None]:
# (rows, columns) of the training subset.
train_df.shape

In [None]:
# Column names available in the training subset.
train_df.columns

In [None]:
# Preview the first rows of the training subset.
train_df.head()

In [None]:
# Treat a missing fare as 0 in both the training and the holdout subset.
for frame in (train_df, test_df):
    frame['Fare'] = frame['Fare'].fillna(0)

In [None]:
import math
def _log_fare(fare):
    """Return the natural log of a positive fare; any other value maps to 0."""
    return math.log(fare) if fare > 0 else 0


train_df['Fare_log'] = [_log_fare(fare) for fare in train_df['Fare']]
test_df['Fare_log'] = [_log_fare(fare) for fare in test_df['Fare']]

# Histogram of the log-transformed training fares.
train_df['Fare_log'].plot.hist(bins=50)

In [None]:
def _first_char_upper(value):
    """Upper-cased first character of ``str(value)`` (a missing cabin, i.e. NaN, becomes 'N')."""
    return str(value)[0].upper()


train_df['Cabin_Bins'] = list(map(_first_char_upper, train_df['Cabin']))
test_df['Cabin_Bins'] = list(map(_first_char_upper, test_df['Cabin']))

In [None]:
# Replace missing ages with a fixed value of 25 in both subsets.
for frame in (train_df, test_df):
    frame['Age'] = frame['Age'].fillna(25)

In [None]:
import statistics as stats
# Mean training age, computed after missing ages were filled with 25.
stats.mean(train_df['Age'])

In [None]:
# Distinct cabin-letter bins present in the training subset.
train_df['Cabin_Bins'].unique()

In [None]:
def null_checker(df):
    """Return the names of the columns of ``df`` that contain at least one missing value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    list
        Column labels, in column order, for which ``isna().any()`` is true.
        Empty when no column has a missing value (or ``df`` has no columns).
    """
    # Compute the per-column flag once, vectorised, instead of appending from a
    # side-effecting lambda inside DataFrame.apply: apply may evaluate the function
    # more than once per column on older pandas, which would duplicate names.
    has_null = df.isna().any()
    return [name for name, flagged in zip(df.columns, has_null) if flagged]

In [None]:
# Columns fed to the model; get_dummies one-hot encodes the string-valued ones.
candidate_columns = ['Pclass', 'SibSp', 'Parch', 'Sex', 'Embarked', 'Cabin_Bins', 'Fare_log', 'Fare', 'Age']

# x_candidates holds training features; y_candidates holds the holdout features (not labels).
x_candidates = pd.get_dummies(train_df[candidate_columns])
y_candidates = pd.get_dummies(test_df[candidate_columns])

In [None]:
# Inspect the one-hot encoded training features.
x_candidates

In [None]:
# Keep only the one-hot columns present in BOTH frames so the two matrices share a schema.
# Iterating the training columns (instead of intersecting two sets) keeps the column
# order stable between kernel runs; set ordering of strings is not reproducible.
oh_features = [col for col in x_candidates.columns if col in y_candidates.columns]
train_mtx = x_candidates[oh_features]
test_mtx = y_candidates[oh_features]

In [None]:
# Feature names shared by the training and holdout matrices.
oh_features

In [None]:
# List any feature columns that still contain missing values.
null_checker(train_mtx)

In [5]:
# Preview the first rows of the aligned training feature matrix.
train_mtx.head()

NameError: name 'train_mtx' is not defined

## Select best features with Chi2

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

X = train_mtx  # independent (feature) columns
# Target: survival label, passed as a 1-D Series (the shape scikit-learn documents for y).
y = train_df['Survived']
# Score every feature against the target with the chi-squared statistic (k=10 best are selected).
# chi2 requires non-negative feature values.
bestfeatures = SelectKBest(score_func=chi2, k=10)
fit = bestfeatures.fit(X, y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)

In [None]:
# Put feature names and their chi2 scores side by side for easier reading
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Specs','Score']  # name the two columns
print(featureScores.nlargest(50,'Score'))  # show up to the 50 highest-scoring features

### Additional feature importance checks - Heatmap

In [None]:
import seaborn as sns
from matplotlib import pyplot as plt

x = train_mtx.copy()  # feature columns (copied so train_mtx is left untouched)
x['Survived'] = train_df['Survived']

y = x['Survived']  # target column: survival label

# Pairwise correlations between every feature and the target.
corrmat = x.corr()
top_corr_features = corrmat.index
plt.figure(figsize=(20, 20))
# Plot the matrix computed above; recomputing x[top_corr_features].corr() gives the same values.
g = sns.heatmap(corrmat, annot=True, cmap="RdYlGn")

In [None]:
# Hand-picked subset of the encoded features used to train the SVM.
refined_columns = [
    'Age', 'Fare', 'Fare_log', 'Cabin_Bins_N', 'Sex_male', 'Sex_female',
    'Pclass', 'Embarked_C', 'Cabin_Bins_B', 'Cabin_Bins_D',
]
refined_train = train_mtx[refined_columns]
refined_test = test_mtx[refined_columns]

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Baseline model: standardise the features, then fit an SVC with gamma='auto' (default kernel).
clf = make_pipeline(StandardScaler(), svm.SVC(gamma='auto'))

In [None]:
# Fit the baseline pipeline on the refined training features and survival labels.
x, y = refined_train, train_df['Survived']
clf.fit(x, y)

In [None]:
# Predict survival for the holdout rows with the fitted baseline pipeline.
svm_predictions = clf.predict(refined_test)

In [None]:
# Number of predictions produced (one per holdout row).
len(svm_predictions)

In [None]:
# Raw predicted labels for the holdout rows.
svm_predictions

In [None]:
# The holdout slice keeps the row labels it had in training_df, so renumber it from 0.
# drop=True avoids adding a leftover 'index' column and keeps this cell safe to re-run.
test_df = test_df.reset_index(drop=True)
# Assign the array directly: this is positional, so no index alignment is involved.
test_df['svm_predictions'] = svm_predictions

In [None]:
# Class balance of the training subset (0 = died, 1 = survived).
train_df['Survived'].value_counts()

In [None]:
# Share of survivors in the training subset. Computed from the data so it always matches
# the current split (the previous hand-typed counts, 260/(408+260), covered 668 rows,
# which is not the size of an 85% split).
train_df['Survived'].mean()

In [None]:
# Class balance of the holdout subset (0 = died, 1 = survived).
test_df['Survived'].value_counts()

In [None]:
# Share of survivors in the holdout subset. Computed from the data so it always matches
# the current split (the previous hand-typed counts, 82/(82+141), covered 223 rows,
# which is not the size of a 15% holdout).
test_df['Survived'].mean()

In [None]:
# Actual outcome next to the SVM prediction for each holdout passenger.
test_df[['PassengerId', 'Survived', 'svm_predictions']]

In [None]:
from sklearn.metrics import confusion_matrix
# Compare actual holdout outcomes with the baseline predictions.
y_true, y_pred = test_df['Survived'], test_df['svm_predictions']
cf_matrix = confusion_matrix(y_true=y_true, y_pred=y_pred)

In [None]:
# Annotate each cell of the 2x2 confusion matrix with its name, raw count and share of all samples.
# The name order matches the row-major layout of the matrix: [[TN, FP], [FN, TP]].
group_names = ['True Neg','False Pos','False Neg','True Pos']
group_counts = ["{0:0.0f}".format(value) for value in
                cf_matrix.flatten()]
group_percentages = ["{0:.2%}".format(value) for value in
                     cf_matrix.flatten()/np.sum(cf_matrix)]
labels = [f"{v1}\n{v2}\n{v3}" for v1, v2, v3 in
          zip(group_names,group_counts,group_percentages)]
labels = np.asarray(labels).reshape(2,2)
cm_ax = sns.heatmap(cf_matrix, annot=labels, fmt='', cmap='Blues')
# Rows are the true classes and columns the predicted ones; label them so the figure stands alone.
cm_ax.set(xlabel='Predicted label', ylabel='True label');

<div>
<p>
    A false negative is a case predicted as negative that is actually positive <br/>
    0 means died <br/>
    1 means survived <br/>
    False positive means we think they survived but they died <br/>
    <b>
        False negative means we think they died but they survived
    </b>
</p>
</div>

In [None]:
len(test_df[(test_df['Survived']==1) & (test_df['svm_predictions']==0)])
# Count of false negatives: passengers predicted to die (0) who actually survived (1).
# The original run recorded 22 here — TODO confirm against the current split.

### Analyzing our confusion matrix

In [None]:
# Holdout rows that are false negatives: actually survived (1) but predicted to die (0).
test_df[(test_df['Survived']==1) & (test_df['svm_predictions']==0)]

In [None]:
# Based on the subset above, we can see that most of these false negatives are male
# The model is biased too heavily towards predicting that men will die

# We will change the weights on the males and then run this again with a polynomial kernel

### Hyperparameter Tuning

In [None]:
# refined_train['Sex_female'] = [x*2 for x in refined_train['Sex_female']]

In [None]:
# Inputs for hyperparameter tuning: refined training features and survival labels.
x, y = refined_train, train_df['Survived']

In [None]:
from sklearn.model_selection import GridSearchCV 

# defining parameter range
# `gamma` is a coefficient of the 'rbf', 'poly' and 'sigmoid' kernels only; with a linear
# kernel every gamma value yields the same model, so listing gammas here only multiplied
# the number of (identical) fits. The linear grid therefore searches over C alone.
param_grid = {'C': [100, 1000, 1250, 1500, 2000, 2500],
    'kernel': ['linear']}

# param_grid = {'C': [0.05, 0.1, .25, .5, .75, 1, 5, 10, 100, 1000], 
#     'gamma': [5000, 2500, 1750, 1500, 1250, 1000, 100, 50, 25, 10, 5, 1, .5, 0.1, 0.01, 0.025, 0.001, 0.005, 0.0001], 
#     'kernel': ['rbf', 'sigmoid']} 

# refit=True retrains the best candidate on all of x, y so `grid` can predict directly.
grid = GridSearchCV(svm.SVC(), param_grid, refit = True, verbose = 3)

# fitting the model for grid search
grid.fit(x, y)

In [None]:
# print best parameter after tuning 
# Winning hyperparameter combination, followed by the estimator refit with it.
best_params, best_model = grid.best_params_, grid.best_estimator_
print(best_params)
print(best_model)

In [None]:
# {'C': 1000, 'gamma': 0.0001, 'kernel': 'rbf'}
# SVC(C=1000, gamma=0.0001)

In [None]:
from sklearn.metrics import classification_report
# Score the refit grid-search model on the holdout rows.
grid_predictions = grid.predict(refined_test)

# Per-class precision / recall / F1 against the actual outcomes.
holdout_report = classification_report(test_df['Survived'], grid_predictions)
print(holdout_report)

In [None]:
# Refit a standardised RBF SVC with hand-entered hyperparameters and predict the holdout rows.
# NOTE(review): gamma=.001 differs from the gamma=0.0001 written down in the recorded
# grid-search result, and the grid search was run on svm.SVC() without StandardScaler —
# confirm these values are the intended ones.
clf = make_pipeline(StandardScaler(), svm.SVC(gamma=.001, C=1000, kernel='rbf'))
clf.fit(x, y)
svm_predictions = clf.predict(refined_test)

In [None]:
# Renumber the holdout rows from 0 without keeping the old labels as a column; a plain
# reset_index() here would add yet another leftover column each time the cell is run.
test_df = test_df.reset_index(drop=True)
# Assign the array directly: this is positional, so no index alignment is involved.
test_df['svm_predictions'] = svm_predictions

In [None]:
# Compare actual holdout outcomes with the predictions of the refit model.
y_true, y_pred = test_df['Survived'], test_df['svm_predictions']
cf_matrix = confusion_matrix(y_true=y_true, y_pred=y_pred)

In [None]:
# Annotate each cell of the 2x2 confusion matrix with its name, raw count and share of all samples.
# The name order matches the row-major layout of the matrix: [[TN, FP], [FN, TP]].
group_names = ['True Neg','False Pos','False Neg','True Pos']
group_counts = ["{0:0.0f}".format(value) for value in
                cf_matrix.flatten()]
group_percentages = ["{0:.2%}".format(value) for value in
                     cf_matrix.flatten()/np.sum(cf_matrix)]
labels = [f"{v1}\n{v2}\n{v3}" for v1, v2, v3 in
          zip(group_names,group_counts,group_percentages)]
labels = np.asarray(labels).reshape(2,2)
cm_ax = sns.heatmap(cf_matrix, annot=labels, fmt='', cmap='Blues')
# Rows are the true classes and columns the predicted ones; label them so the figure stands alone.
cm_ax.set(xlabel='Predicted label', ylabel='True label');

## Run the model with actual train and test data

In [None]:
# Reload the full files and repeat the feature engineering on both frames.
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

for frame in (train_df, test_df):
    # Missing fares count as 0; the log is only taken for positive fares.
    frame['Fare'] = frame['Fare'].fillna(0)
    frame['Fare_log'] = [math.log(fare) if fare > 0 else 0 for fare in frame['Fare']]
    # Missing ages are replaced with a fixed value of 25.
    frame['Age'] = frame['Age'].fillna(25)
    # First character of the cabin string, upper-cased (a missing cabin, NaN, becomes 'N').
    frame['Cabin_Bins'] = [str(cabin)[0].upper() for cabin in frame['Cabin']]

# One-hot encode the model columns of both frames.
# x_candidates holds training features; y_candidates holds the test features (not labels).
x_candidates = pd.get_dummies(train_df[['Pclass', 'SibSp', 'Parch', 'Sex', 'Embarked', 'Cabin_Bins', 'Fare_log', 'Fare', 'Age']])
y_candidates = pd.get_dummies(test_df[['Pclass', 'SibSp', 'Parch', 'Sex', 'Embarked', 'Cabin_Bins', 'Fare_log', 'Fare', 'Age']])

# Keep only the columns present in BOTH frames. Iterating the training columns (instead of
# intersecting two sets) keeps the column order stable between kernel runs.
oh_features = [col for col in x_candidates.columns if col in y_candidates.columns]
train_mtx = x_candidates[oh_features]
test_mtx = y_candidates[oh_features]

# Reduced feature set used for the final model.
final_columns = ['Fare', 'Fare_log', 'Cabin_Bins_N', 'Sex_male', 'Sex_female',  'Pclass']
refined_train = train_mtx[final_columns]
refined_test = test_mtx[final_columns]

# Train on every labelled row, then predict the unlabelled test set.
x = refined_train
y = train_df['Survived']

# NOTE(review): gamma=1 differs from the gamma=.001 used for the holdout evaluation
# earlier in the notebook — confirm this value is intended.
clf = make_pipeline(StandardScaler(), svm.SVC(gamma=1, C=1000, kernel='rbf'))
clf.fit(x, y)
svm_predictions = clf.predict(refined_test)

# Store the predictions as the 'Survived' column. drop=True keeps reset_index from adding a
# spurious 'index' column, and assigning the array is positional (no index alignment),
# so the separate 'svm_predictions' column and the in-place rename are not needed.
test_df = test_df.reset_index(drop=True).assign(Survived=svm_predictions)
# test_df[['PassengerId', 'Survived']].to_csv('svm_submissions.csv', index=False)