In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D


df = pd.read_csv("titanic_data.csv")
#reader

In [None]:
from sklearn.preprocessing import OneHotEncoder
Sex = df['Sex']
encoder = OneHotEncoder(sparse_output=False)  # Ensure sparse is set to False
Sex_encoded = encoder.fit_transform(Sex.values.reshape(-1, 1))
Sex_encoded_df = pd.DataFrame(Sex_encoded, columns=encoder.get_feature_names_out(['Sex']))
# Drop the original 'Sex' column and concatenate the new columns
df = df.drop(columns='Sex')
df = pd.concat([df, Sex_encoded_df], axis=1)
#done to encode sex as float value

In [None]:
df = df.dropna()
#gets rid of missing/Na values

In [None]:

X = df.drop(columns=['PassengerId','Survived', 'Name', 'Ticket', 'Cabin', 'Embarked'])
#gets rid of any values that aren't pertinent
y = df['Survived']

In [None]:

#train the data
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit

strat_shuff_split = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)

for train_index, test_index in strat_shuff_split.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index] 
    

In [None]:
from sklearn.metrics import roc_curve
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
#Modifed from stepbystep project(referenced https://github.com/GabeALopez/)

#function to fit and print model
def fit_and_print(myPipeline, myXTrain, myYTrain, myXTest, myYTest):
  myPipeline.fit(myXTrain, myYTrain)
  test_preds = myPipeline.predict(myXTest)

  print("Accuracy: " + str(accuracy_score(test_preds, myYTest)))
  print("Precision Score: " + str(precision_score(test_preds, myYTest)))
  print("F1 Score: " + str(f1_score(test_preds, myYTest)))
  print("Recall Score: " + str(recall_score(test_preds, myYTest)))


In [None]:
#Preparing the Logistic Regression model
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
#For later use
from sklearn.model_selection import GridSearchCV


#Preparing, using, and print out graphs and metrics of model
log_clf = LogisticRegression(random_state=42, solver='lbfgs')

preprocess = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', log_clf)
])

print("Logistic Regression:")

fit_and_print(preprocess, X_train, y_train, X_test, y_test)
#print the results

In [45]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier

rnd_clf = RandomForestClassifier(n_estimators=100, random_state=42)

#Preparing, using, and print out graphs and metrics of model

print("Random Forest Classifier:")
fit_and_print(rnd_clf, X_train, y_train, X_test, y_test)

prob = rnd_clf.predict_proba(X_test)[:,1]


Random Forest Classifier:
Accuracy: 0.6486486486486487
Precision Score: 0.76
F1 Score: 0.7450980392156863
Recall Score: 0.7307692307692307


In [49]:
#Modified from stepbystep(referenced https://github.com/GabeALopez/)
from sklearn.model_selection import GridSearchCV

my_list = [n for n in range(10,100) if n % 10 == 0]
#use of list comprehension to create a range for n_estimators


grid_search_for = {
    'n_estimators' : my_list,

}

grid_search_for_var = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=grid_search_for,
    cv=5,  # Number of cross-validation folds
    n_jobs=-1,  # Use all available cores
    verbose=1  # Verbosity level
)

fit_and_print(grid_search_for_var, X_train, y_train, X_test, y_test)

#grid search for random forest


Fitting 5 folds for each of 9 candidates, totalling 45 fits
Accuracy: 0.6756756756756757
Precision Score: 0.76
F1 Score: 0.76
Recall Score: 0.76


In [51]:
print("Support Vector Classifier:")

#Preparing, using, and print out graphs and metrics of model

svc_clf = SVC(gamma='scale', random_state=42)

fit_and_print(svc_clf, X_train, y_train, X_test, y_test)

prob = svc_clf.decision_function(X_test)

Support Vector Classifier:
Accuracy: 0.6756756756756757
Precision Score: 1.0
F1 Score: 0.8064516129032258
Recall Score: 0.6756756756756757


In [50]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

#Preparing, using, and print out graphs and metrics of model

ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1), n_estimators=50, learning_rate=0.5, algorithm='SAMME', random_state=42
)

fit_and_print(ada_clf, X_train, y_train, X_test, y_test)

prob = ada_clf.decision_function(X_test)

Accuracy: 0.8108108108108109
Precision Score: 0.88
F1 Score: 0.8627450980392157
Recall Score: 0.8461538461538461


In [32]:
#Modified from stepbystep(referenced https://github.com/GabeALopez/)
from sklearn.model_selection import GridSearchCV

my_list = [n for n in range(10,100) if n % 10 == 0]

grid_search_ada = {
    'n_estimators' : my_list,
    'learning_rate' : [0.1,0.2,0.3, 0.4,0.5, 0.6,0.7,0.8,0.9,1]
}

grid_search_ada_var = GridSearchCV(AdaBoostClassifier(algorithm='SAMME'), grid_search_ada, cv=5, n_jobs=1)

fit_and_print(grid_search_ada_var, X_train, y_train, X_test, y_test)

prob = grid_search_ada_var.decision_function(X_test)

#grid search for the ada boost estimator

Accuracy: 0.8108108108108109
Precision Score: 0.88
F1 Score: 0.8627450980392157
Recall Score: 0.8461538461538461


In [35]:

from sklearn.ensemble import BaggingClassifier

print("Bagging Classifier:")

#Preparing, using, and print out graphs and metrics of model

b_clf = BaggingClassifier(
    estimator=DecisionTreeClassifier(max_depth=1), n_estimators=50, random_state=42
)
fit_and_print(b_clf, X_train, y_train, X_test, y_test)


Bagging Classifier:
Accuracy: 0.7027027027027027
Precision Score: 0.64
F1 Score: 0.7441860465116279
Recall Score: 0.8888888888888888


In [38]:

from sklearn.ensemble import VotingClassifier

print("Vote Classifier:")

#Preparing, using, and print out graphs and metrics of model
clf1 = DecisionTreeClassifier(max_depth=1)
clf2 = DecisionTreeClassifier(max_depth=3)


v_clf = VotingClassifier(estimators=[
    ('dt1', clf1),
    ('dt2', clf2),
    # Add more classifiers if you want
], voting='soft')  # or voting='hard'

fit_and_print(v_clf, X_train, y_train, X_test, y_test)


Vote Classifier:
Accuracy: 0.7297297297297297
Precision Score: 0.68
F1 Score: 0.7727272727272727
Recall Score: 0.8947368421052632


In [None]:
from sklearn.ensemble import GradientBoostingClassifier

print("Gradient Boosting Algorithm:")

#Preparing, using, and print out graphs and metrics of model

gb_clf = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)

fit_and_print(gb_clf, X_train, y_train, X_test, y_test)

prob = svc_clf.decision_function(X_test)

In [40]:
import xgboost as xgb

print("Extreme Gradient Boosting Algorithm:")

#Preparing, using, and print out graphs and metrics of model

egb_clf = xgb.XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42
)

fit_and_print(egb_clf, X_train, y_train, X_test, y_test)


ExGradient Boosting Algorithm:
Accuracy: 0.7297297297297297
Precision Score: 0.8
F1 Score: 0.8
Recall Score: 0.8


In [44]:
#Modified from stepbystep(referenced https://github.com/GabeALopez/)
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

my_list = [n for n in range(10,1000) if n % 100 == 0]

grid_search_params = {
    'n_estimators' : my_list,
    'learning_rate' : [0.2, 0.4, 0.6]

}

xgb_clf = XGBClassifier(
    max_depth=3,
    random_state=42
)

grid_search_xgb_var = GridSearchCV(
    estimator=xgb_clf,
    param_grid=grid_search_params,
    scoring='accuracy',  # or any other scoring metric you prefer
    cv=5,  # Number of cross-validation folds
    n_jobs=-1  # Use all available cores
)

fit_and_print(grid_search_xgb_var, X_train, y_train, X_test, y_test)

Accuracy: 0.6756756756756757
Precision Score: 0.8
F1 Score: 0.7692307692307693
Recall Score: 0.7407407407407407


In [53]:
#Modified from stepbystep(referenced https://github.com/GabeALopez/)
from sklearn.model_selection import GridSearchCV

my_list = [n for n in range(10,100) if n % 10 == 0]


grid_search_ada = {
    'n_estimators' : my_list,
    'learning_rate' : [0.2, 0.4, 0.6]

}

grid_search_gf_var = GridSearchCV(
    estimator=GradientBoostingClassifier(max_depth=3, random_state=42),
    param_grid=grid_search_ada, cv=5, scoring = 'accuracy'
)

fit_and_print(grid_search_gf_var, X_train, y_train, X_test, y_test)

prob = grid_search_gf_var.decision_function(X_test)


Accuracy: 0.7297297297297297
Precision Score: 0.8
F1 Score: 0.8
Recall Score: 0.8
