## Description: This program predicts whether a passenger will survive on the Titanic.

In [None]:
# Import Libraries

import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix

In [None]:
# Load the data
# seaborn's load_dataset returns the 'titanic' example dataset as a pandas DataFrame

titanic = sns.load_dataset('titanic')

In [None]:
# Print the first 5 rows of the data (head() returns 5 rows by default)

titanic.head()

In [None]:
# Count the number of rows and columns in the dataset
# shape is a (number_of_rows, number_of_columns) tuple

titanic.shape

In [None]:
# Get some descriptive statistics
# By default describe() summarizes only the numeric columns (count, mean, std, min, quartiles, max)

titanic.describe()

In [None]:
# Get a count of the number of survivors
# value_counts() returns the number of rows for each distinct value of 'survived'
# (presumably 0 = did not survive, 1 = survived -- confirm against the dataset docs)

titanic['survived'].value_counts()

In [None]:
# Visualize the count of survivors
# The column is named through x= with data=: newer seaborn releases accept only
# `data` positionally, so a bare Series passed first is no longer read as the
# x variable.

sns.countplot(x='survived', data=titanic);

In [None]:
# Visualize the count of survivors for columns 'who', 'sex', 'pclass', 'sibsp', 'parch', 'embarked'

cols = ['who', 'sex', 'pclass', 'sibsp', 'parch', 'embarked']

n_rows = 2
n_cols = 3

# The subplot grid and figure size of each graph

fig, axs = plt.subplots(n_rows, n_cols, figsize = (n_cols * 3.2, n_rows * 3.2))

# axs is an (n_rows, n_cols) array of Axes; axs.flat walks it row by row, so
# pairing it with cols puts column k on subplot (k // n_cols, k % n_cols).
for col, ax in zip(cols, axs.flat):
    # Variables are given by name with data=: newer seaborn releases accept
    # only `data` as a positional argument.
    sns.countplot(x=col, hue='survived', data=titanic, ax=ax)
    ax.set_title(col)
    ax.legend(title='Survived', loc = 'upper right')
plt.tight_layout()

In [None]:
# Look at survival rate by sex
# The rate is the mean of the 'survived' column within each 'sex' group

titanic.groupby('sex')[['survived']].mean()

In [None]:
# Look at survival rate by sex and class
# pivot_table aggregates with the mean by default, so each cell is the mean of
# 'survived' for one (sex, class) combination

titanic.pivot_table('survived', index = 'sex', columns = 'class')

In [None]:
# Look at survival rate by sex and class visually
# Same pivot table as above, drawn as one line per class with sex on the x axis

titanic.pivot_table('survived', index = 'sex', columns = 'class').plot(kind='line');

In [None]:
# Plot the survival rate of each class
# barplot aggregates y with the mean by default, so each bar is the mean of
# 'survived' for that class

sns.barplot(x='class', y= 'survived', data= titanic);

In [None]:
# Look at survival rate by sex, age and class

# Two age groups: (0, 18] and (18, 80] (pd.cut intervals are right-closed by default)
age = pd.cut(titanic['age'], bins=[0, 18, 80])

# Mean of 'survived' for every (sex, age group) row and class column
titanic.pivot_table(values='survived', index=['sex', age], columns='class')

In [None]:
# Plot the prices paid of each class

# One point per passenger: fare on the x axis, class on the y axis
plt.scatter(titanic['fare'], titanic['class'], label='Passanger Paid', color='purple')
plt.title('Price of Each Class')
plt.xlabel('Price/Class')
plt.ylabel('Class')
plt.legend()
plt.show()

In [None]:
# Count the empty values in each column
# isna() flags missing cells as True; sum() then counts them per column

titanic.isna().sum()

In [None]:
# Look at all of the values in each column & get a count

for column_name in titanic.columns:
    # Trailing blank line separates one column's counts from the next
    print(titanic[column_name].value_counts(), end='\n\n')

In [None]:
# Drop the columns
# NOTE(review): these look redundant with the columns that are kept
# (e.g. 'alive' vs 'survived', 'class' vs 'pclass') -- confirm before relying on it

unused_columns = ['deck', 'embark_town', 'alive', 'class', 'who', 'alone', 'adult_male']
titanic.drop(columns=unused_columns, inplace=True)

# Remove the rows with missing values in the columns listed here
required_columns = ['embarked', 'age']
titanic.dropna(subset=required_columns, inplace=True)

In [None]:
# Count the NEW number of rows and columns in the dataset
# (after dropping the unused columns and the rows missing 'embarked' or 'age')

titanic.shape

In [None]:
# Look at the data types of the remaining columns

titanic.dtypes

In [None]:
# Print the unique values in the 'sex' and 'embarked' columns

for column_name in ('sex', 'embarked'):
    print(titanic[column_name].unique())

In [None]:
# Encode the sex column
# Columns are selected by name rather than by position, so the encoding does not
# depend on the column order left by the earlier drop. Assigning to
# titanic[<name>] replaces the column, so it takes the integer dtype of the
# encoded values (positional .iloc assignment can keep the old object dtype on
# recent pandas versions).

labelEncoder = LabelEncoder()
titanic['sex'] = labelEncoder.fit_transform(titanic['sex'])

# Encode the embarked column
# fit_transform re-fits the encoder, so the mapping from 'sex' is discarded here

titanic['embarked'] = labelEncoder.fit_transform(titanic['embarked'])

In [None]:
# Print the unique values in the 'sex' and 'embarked' columns again
# (run after the encoding cell, these show the integer labels)

for column_name in ('sex', 'embarked'):
    print(titanic[column_name].unique())

In [None]:
# Look at the data types again now that 'sex' and 'embarked' have been encoded
titanic.dtypes

In [None]:
# Split the data into the independent variables 'X' (features) and the
# dependent variable 'y' (the target to predict)

target = 'survived'

y = titanic[target]
X = titanic.drop(target, axis=1)  # every column except the target

In [None]:
# Scale the data

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.fit_transform(X_test)

In [None]:
# Split the dataset into train and test data

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state= 0)

In [None]:
# Create a function with many machine learning models

def models(X_train,y_train):
    """Fit seven classifiers on the training data and print each one's training accuracy.

    Parameters
    ----------
    X_train : training features, passed unchanged to every estimator's fit/score.
    y_train : training labels, passed unchanged to every estimator's fit/score.

    Returns
    -------
    tuple
        The fitted estimators in this fixed order: logistic regression,
        k-nearest neighbours, linear SVC, RBF SVC, Gaussian naive Bayes,
        decision tree, random forest. The position matches the [n] prefix
        of the printed lines.
    """
    # (message printed before the score, estimator) -- list order defines both
    # the print order and the order of the returned tuple.
    candidates = [
        ('[0]Logistic Regression Training Accuracy: ',
         LogisticRegression(random_state=0)),
        ('[1]KNeighbors Training Accuracy: ',
         KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)),
        ('[2]SVC Linear Training Accuracy: ',
         SVC(kernel='linear', random_state=0)),
        ('[3]SVC RBF Training Accuracy: ',
         SVC(kernel='rbf', random_state=0)),
        ('[4]Gauss NB Training Accuracy: ',
         GaussianNB()),
        ('[5]Decision Tree Training Accuracy: ',
         DecisionTreeClassifier(criterion='entropy', random_state=0)),
        ('[6]Random Forest Training Accuracy: ',
         RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)),
    ]

    # Train every estimator first ...
    for _, estimator in candidates:
        estimator.fit(X_train, y_train)

    # ... then report accuracy on the same data each one was trained on.
    for message, estimator in candidates:
        print(message, estimator.score(X_train, y_train))

    return tuple(estimator for _, estimator in candidates)
    

In [None]:
#Get and train all of the models
# models() returns a tuple of fitted estimators; index 0-6 matches the [n]
# prefix printed with each training accuracy

model = models(X_train,y_train)

In [None]:
#Show the confusion matrix and accuracy for all of the models on the test data

for i, fitted_model in enumerate(model):
    # Predict once per model and reuse the matrix for both the printout and
    # the accuracy.
    cm = confusion_matrix(y_test, fitted_model.predict(X_test))

    #Extract TN, FP, FN, TP (ravel() flattens the 2x2 matrix row by row)
    TN, FP, FN, TP = cm.ravel()

    # Accuracy = correctly classified samples / all samples
    test_score = (TN + TP)/(TN+TP+FN+FP)

    print(cm)
    print('Model[{}] Testing Accuracy = "{}"'.format(i,test_score))
    print()

In [None]:
#Get feature importance

forest = model[6]  # the random forest is the last estimator returned by models()

# Take the feature names from X (the frame the models were trained from) rather
# than from a hard-coded positional slice of titanic, so names and importances
# stay aligned if the columns change.
importances = pd.DataFrame({'feature': X.columns, 'importance' : np.round(forest.feature_importances_, 3)})

# Most important feature first, indexed by feature name
importances = importances.sort_values('importance', ascending= False).set_index('feature')
importances

In [None]:
#Visualize the importances
# One bar per feature (the frame is indexed by feature name)

importances.plot.bar();

In [None]:
#Print the prediction of the random forest classifier
# (index 6 of the tuple returned by models())

pred = model[6].predict(X_test)
print(pred, end='\n\n')

#Print the actual values

print(y_test)


In [None]:
#My survival
# One hypothetical passenger, one value per feature column of X, in X's order
# (presumably pclass, sex, age, sibsp, parch, fare, embarked -- check X.columns).
# NOTE(review): 'sex' was label-encoded from two categories, so the 4 in the
# second position looks out of range -- confirm the intended value.
X_1 = np.array([2, 4, 89,10, 10, 0, 0])

#Scaling my survival
# Reuse the scaler `sc` fitted earlier in the notebook so this sample is scaled
# with the same per-feature statistics as the data the models saw. Fitting a new
# scaler on this single sample would standardize its seven values against each
# other. Wrapping the row in a DataFrame with X's column names gives the scaler
# a single row of seven named features.
my_survival = sc.transform(pd.DataFrame(X_1.reshape(1, -1), columns=X.columns))
print(my_survival)

#Print prediction of my survival using Random Forest
# Index 6 is the random forest in the tuple returned by models(); index 5 is the
# decision tree.

pred = model[6].predict(my_survival)
print(pred)