In [177]:
#Import libraries
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

%matplotlib inline

In [178]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [179]:
import pandas as pd

#Read input data
train_data = pd.read_csv('/content/drive/MyDrive/ML/titanic-ml-project/data/processed/train.csv')
test_data = pd.read_csv('/content/drive/MyDrive/ML/titanic-ml-project/data/processed/test.csv')


In [180]:
# Define features and target variable
X_train = train_data.drop('Survived', axis=1)
Y_train = train_data['Survived']

X_test = test_data.drop('Survived', axis=1)
Y_test = test_data['Survived']

## HistGradientBoostingClassifier

In [189]:
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split


# Define the hyperparameter search space
param_grid = {
    'learning_rate': [0.01, 0.1, 0.5, 1],
    'max_iter': [100, 500, 1000],
    'max_depth': [3, 5, 7],
    'min_samples_leaf': [1, 5, 10],
    'max_bins': [16, 32, 64],
    'early_stopping': [True, False]
}

# Define the model and the cross-validation strategy
model = HistGradientBoostingClassifier(random_state=42)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Perform randomized search with cross-validation
random_search = RandomizedSearchCV(model, param_grid, cv=cv, n_iter=10, random_state=42)
random_search.fit(X_train, Y_train)

# Print the best parameters and the best score
print("Best parameters:", random_search.best_params_)
print("Best score:", random_search.best_score_)

# Evaluate the best model on the test set
best_model = random_search.best_estimator_
print("Test set score:", best_model.score(X_test, Y_test))


Best parameters: {'min_samples_leaf': 5, 'max_iter': 1000, 'max_depth': 3, 'max_bins': 32, 'learning_rate': 1, 'early_stopping': True}
Best score: 0.8327349193396524
Test set score: 0.8899521531100478


In [190]:
from sklearn.ensemble import HistGradientBoostingClassifier
import pandas as pd
from sklearn.metrics import confusion_matrix, classification_report

# Initialize and fit the model with the best parameters
model = HistGradientBoostingClassifier(
    min_samples_leaf=5,
    max_iter=1000,
    max_depth=3,
    max_bins=32,
    learning_rate=1,
    early_stopping=True
)
model.fit(X_train, Y_train)

# Make predictions
Y_pred = model.predict(X_test)

# Create a DataFrame to compare actual and predicted values
diff_df = pd.DataFrame({'Actual': Y_test, 'Predicted': Y_pred})
print(diff_df)

# Compute the confusion matrix
conf_matrix = confusion_matrix(Y_test, Y_pred)
print(conf_matrix)

# Compute the classification report
target_names = ['Didnt Survived', 'Survived']
print(classification_report(Y_test, Y_pred, target_names=target_names))


     Actual  Predicted
0         0          0
1         1          0
2         0          0
3         0          0
4         1          1
..      ...        ...
413       0          0
414       1          1
415       0          0
416       0          0
417       0          0

[418 rows x 2 columns]
[[237  29]
 [ 27 125]]
                precision    recall  f1-score   support

Didnt Survived       0.90      0.89      0.89       266
      Survived       0.81      0.82      0.82       152

      accuracy                           0.87       418
     macro avg       0.85      0.86      0.86       418
  weighted avg       0.87      0.87      0.87       418



 **Observations:**<br>
 **True Negatives (TN) = 237**<br>
The model correctly predicted 237 people as "Didnt Survived."<br>
 **False Positives (FP) = 29**<br>
The model incorrectly predicted 29 people as Survived when they actually didn't (Type I error).<br>
 **False Negatives (FN) = 27**<br>
The model incorrectly predicted 27 people as "Didnt Survived" when they actually did (Type II error).<br>
 **True Positives (TP) = 125**<br>
The model correctly predicted 125 people as Survived.<br>

## Decision Tree Classifier

In [185]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from scipy.stats import randint

# Define the parameter distribution for hyperparameter tuning
param_dist = {
    'max_depth': randint(1, 21),  # Randomly sample depths from 1 to 20
    'min_samples_split': [2, 5, 10],  # Example values for min_samples_split
    'min_samples_leaf': [1, 2, 4],  # Example values for min_samples_leaf
    'max_features': ['auto', 'sqrt', 'log2']  # Example values for max_features
}

# Initialize the Decision Tree Classifier
dt = DecisionTreeClassifier(criterion='entropy', random_state=42)

# Use RandomizedSearchCV for hyperparameter tuning with cross-validation
random_search = RandomizedSearchCV(estimator=dt, param_distributions=param_dist, n_iter=100, cv=5, scoring='accuracy', random_state=42)
random_search.fit(X_train, Y_train)

# Best parameters
print("Best parameters:", random_search.best_params_)

# Make predictions on the test set using the best estimator
Y_dt_opt_pred = random_search.best_estimator_.predict(X_test)

# Define target names for the classification report
target_names = ['Did not Survive', 'Survived']

# Generate and print the classification report
print(classification_report(Y_test, Y_dt_opt_pred, target_names=target_names))

# Evaluate the model's accuracy
accuracy = random_search.best_estimator_.score(X_test, Y_test)
print("Test accuracy:", accuracy)


Best parameters: {'max_depth': 13, 'max_features': 'log2', 'min_samples_leaf': 4, 'min_samples_split': 10}
                 precision    recall  f1-score   support

Did not Survive       0.89      0.95      0.92       266
       Survived       0.90      0.80      0.85       152

       accuracy                           0.89       418
      macro avg       0.90      0.87      0.88       418
   weighted avg       0.90      0.89      0.89       418

Test accuracy: 0.8947368421052632


175 fits failed out of a total of 500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
175 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.12/dist-packages/sklearn/base.py", line 1382, in wrapper
    estimator._validate_params()
  File "/usr/local/lib/python3.12/dist-packages/sklearn/base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.12/dist-packages/sklearn/utils/_param_validation.py", line 98, in validate_parameter_constraints
    raise InvalidParameterError(
sk

In [186]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

# Load dataset
data = load_iris()
X, y = data.data, data.target

# Initialize the Decision Tree Classifier with optimized parameters
dt = DecisionTreeClassifier(max_depth=13, min_samples_split=10, min_samples_leaf=4, max_features='log2', random_state=42)

# Train the model
dt.fit(X_train, Y_train)

# Make predictions on the test set
Y_dt_opt_pred = dt.predict(X_test)

target_names = ['Didnt Survived', 'Survived']
print(classification_report(Y_test, Y_dt_opt_pred, target_names=target_names))

# Evaluate the model
accuracy = dt.score(X_test, Y_test)
print("Test accuracy:", accuracy)

                precision    recall  f1-score   support

Didnt Survived       0.93      0.89      0.91       266
      Survived       0.83      0.89      0.86       152

      accuracy                           0.89       418
     macro avg       0.88      0.89      0.89       418
  weighted avg       0.90      0.89      0.89       418

Test accuracy: 0.8923444976076556


## Random Forests

## XGBoost