In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D


df = pd.read_csv("titanic_data.csv")
#reader

In [None]:
from sklearn.preprocessing import OneHotEncoder
Sex = df['Sex']
encoder = OneHotEncoder(sparse_output=False)  # Ensure sparse is set to False
Sex_encoded = encoder.fit_transform(Sex.values.reshape(-1, 1))
Sex_encoded_df = pd.DataFrame(Sex_encoded, columns=encoder.get_feature_names_out(['Sex']))
# Drop the original 'Sex' column and concatenate the new columns
df = df.drop(columns='Sex')
df = pd.concat([df, Sex_encoded_df], axis=1)

In [None]:
df = df.dropna()

In [None]:

X = df.drop(columns=['PassengerId','Survived', 'Name', 'Ticket', 'Cabin', 'Embarked'])
y = df['Survived']

In [None]:

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit

strat_shuff_split = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)

for train_index, test_index in strat_shuff_split.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index] 
    

In [None]:
from sklearn.metrics import roc_curve
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
#Modifed from stepbystep project

#function to fit and print model
def fit_and_print(myPipeline, myXTrain, myYTrain, myXTest, myYTest):
  myPipeline.fit(myXTrain, myYTrain)
  test_preds = myPipeline.predict(myXTest)

  print("Accuracy: " + str(accuracy_score(test_preds, myYTest)))
  print("Precision Score: " + str(precision_score(test_preds, myYTest)))
  print("F1 Score: " + str(f1_score(test_preds, myYTest)))
  print("Recall Score: " + str(recall_score(test_preds, myYTest)))


In [None]:
#An imputer was not used here due to the data set not having any null values
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
#For later use
from sklearn.model_selection import GridSearchCV


#Preparing, using, and print out graphs and metrics of model
log_clf = LogisticRegression(random_state=42, solver='lbfgs')

preprocess = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', log_clf)
])

print("Logistic Regression:")

fit_and_print(preprocess, X_train, y_train, X_test, y_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier

rnd_clf = RandomForestClassifier(n_estimators=100, random_state=42)

#Preparing, using, and print out graphs and metrics of model

print("Random Forest Classifier:")
fit_and_print(rnd_clf, X_train, y_train, X_test, y_test)

prob = rnd_clf.predict_proba(X_test)[:,1]


In [None]:
print("Support Vector Classifier:")

#Preparing, using, and print out graphs and metrics of model

svc_clf = SVC(gamma='scale', random_state=42)

fit_and_print(svc_clf, X_train, y_train, X_test, y_test)

prob = svc_clf.decision_function(X_test)

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

#Preparing, using, and print out graphs and metrics of model

ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1), n_estimators=50, learning_rate=0.5, random_state=42
)

fit_and_print(ada_clf, X_train, y_train, X_test, y_test)

prob = ada_clf.decision_function(X_test)

In [56]:
#Modified from stepbystep
from sklearn.model_selection import GridSearchCV

#Preparing, using, and print out graphs and metrics of model
def rand():
    parameter = []
    for n in range(10,1000):
        if n % 100 == 0:
            parameter.append(n)
    return parameter

grid_search_ada = {
    'n_estimators' : rand(),
    'learning_rate' : [0.2, 0.4, 0.6]

}

grid_search_ada_var = GridSearchCV(AdaBoostClassifier(algorithm='SAMME'), grid_search_ada, cv=5, n_jobs=1)

fit_and_print(grid_search_ada_var, X_train, y_train, X_test, y_test)

prob = grid_search_ada_var.decision_function(X_test)


Accuracy: 0.8108108108108109
Precision Score: 0.88
F1 Score: 0.8627450980392157
Recall Score: 0.8461538461538461


In [43]:
from sklearn.ensemble import GradientBoostingClassifier

print("Gradient Boosting Algorithm:")

#Preparing, using, and print out graphs and metrics of model

# Initialize the grid search
gb_clf = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)

fit_and_print(gb_clf, X_train, y_train, X_test, y_test)

prob = svc_clf.decision_function(X_test)

Gradient Boosting Algorithm:
Accuracy: 0.7837837837837838
Precision Score: 0.8
F1 Score: 0.8333333333333334
Recall Score: 0.8695652173913043
