# Building the models
## Using the full Titanic dataset

In [2]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import f1_score

In [3]:
# Read the cleaned Titanic CSV of each of the three dataset variants.
# All variants live under ../data/Variant <n>/titanic_cleaned.csv.
data_dir = os.path.join('..', 'data')
data1, data2, data3 = (
    pd.read_csv(os.path.join(data_dir, f'Variant {variant}', 'titanic_cleaned.csv'))
    for variant in (1, 2, 3)
)

# Collected in variant order so later cells can iterate over all of them.
data = [data1, data2, data3]

In [4]:
# Accumulators filled by the training loop: one accuracy and one F1 value
# per (dataset, model) combination, appended in the same order.
scores, f1scores = [], []

In [5]:
# Train five classifiers on every dataset variant and record accuracy and F1
# on a held-out 20% validation split.
#
# Results are appended to `scores` / `f1scores` dataset by dataset, and within
# each dataset in the fixed model order:
#   Random Forest, SVM, Naive Bayes, Decision Tree, Logistic Regression.
# The labelling cell further down relies on exactly this ordering.

# One seed for the split and for the randomized estimators so that the
# reported numbers are reproducible across runs.
SEED = 42

# Start from empty lists so re-running this cell does not append a second
# copy of the results to the ones already collected.
scores = []
f1scores = []

for dataset in data:
    x = dataset.drop("Survived", axis=1)
    y = dataset["Survived"]
    xTrain, xVal, yTrain, yVal = train_test_split(x, y, test_size=0.2, random_state=SEED)

    # Random forests and decision trees are randomized; seed them explicitly.
    random_forest = RandomForestClassifier(random_state=SEED)
    svm_clf = svm.SVC(kernel='linear')
    naive_bayes = GaussianNB()
    decision_tree = DecisionTreeClassifier(random_state=SEED)
    # max_iter raised to 500, as in the original configuration.
    logistic_reg = LogisticRegression(max_iter=500)

    # Keep this order in sync with the model names used when labelling rows.
    for model in (random_forest, svm_clf, naive_bayes, decision_tree, logistic_reg):
        preds = model.fit(xTrain, yTrain).predict(xVal)
        scores.append(accuracy_score(yVal, preds))
        f1scores.append(f1_score(yVal, preds))

    print("done")

done
done
done


In [6]:
# Display the raw accuracy values collected by the training loop
# (five consecutive entries per dataset, one per model).
scores

[0.7786259541984732,
 0.7557251908396947,
 0.7442748091603053,
 0.7404580152671756,
 0.7786259541984732,
 0.8206106870229007,
 0.8282442748091603,
 0.816793893129771,
 0.7938931297709924,
 0.8396946564885496,
 0.7938931297709924,
 0.7977099236641222,
 0.7404580152671756,
 0.7709923664122137,
 0.8206106870229007]

In [7]:
# Turn the flat `scores` / `f1scores` lists into labelled rows of the form
# [dataset number, model name, accuracy, F1 score].
#
# The training loop appends results dataset by dataset and, within a dataset,
# in the model order below, so position i belongs to
#   dataset  i // len(MODEL_NAMES) + 1   and   model  MODEL_NAMES[i % len(MODEL_NAMES)].
# Deriving both labels arithmetically replaces the hard-coded index ranges,
# one of which contained the always-true typo `1 < 15` instead of `i < 15`.
MODEL_NAMES = (
    "Random Forest",
    "SVM",
    "Naive Bayes",
    "Decision Tree",
    "Logistic Regression",
)

l = []
for i, accuracy in enumerate(scores):
    dataset_idx, model_idx = divmod(i, len(MODEL_NAMES))
    # Dataset numbers are 1-based in the results table.
    l.append([dataset_idx + 1, MODEL_NAMES[model_idx], accuracy, f1scores[i]])
    
    

In [8]:
# Assemble the labelled rows into a results table, one row per
# (dataset, model) combination.
result_columns = ['Dataset', 'Model', 'Accuracy', 'F1 Score']
df = pd.DataFrame(data=l, columns=result_columns)

In [9]:
# Display the assembled results table (dataset, model, accuracy, F1 score).
df

Unnamed: 0,Dataset,Model,Accuracy,F1 Score
0,1,Random Forest,0.778626,0.718447
1,1,SVM,0.755725,0.700935
2,1,Naive Bayes,0.744275,0.676329
3,1,Decision Tree,0.740458,0.663366
4,1,Logistic Regression,0.778626,0.718447
5,2,Random Forest,0.820611,0.770732
6,2,SVM,0.828244,0.780488
7,2,Naive Bayes,0.816794,0.769231
8,2,Decision Tree,0.793893,0.740385
9,2,Logistic Regression,0.839695,0.79


In [10]:
# Persist the results table as CSV; the row index is not written to the file.
results_path = os.path.join('..', 'data', 'Scores', 'fullDatasetResults.csv')
df.to_csv(results_path, index=False)