Tuning the hyperparameters for a logistic regression model

In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
from sklearn.model_selection import RandomizedSearchCV

train = pd.read_csv("C:\\Users\\joshn\\Documents\\Coding\\Spaceship Titanic\\Stacking\\Process\\CombinedTrain.csv")
test = pd.read_csv("C:\\Users\\joshn\\Documents\\Coding\\Spaceship Titanic\\Stacking\\Process\\CombinedTest.csv")

# Separate ID before training
train_passenger_ids = train['PassengerId']
test_passenger_ids = test['PassengerId']

# Prepare data
X_train = train.drop(['Transported', 'PassengerId'], axis=1)
y_train = train['Transported'].astype('int')  # Convert True/False to 1/0
X_test = test.drop('PassengerId', axis=1)

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
import pandas as pd

# Define the Logistic Regression model
lr_model = LogisticRegression(random_state=1)

# Define the hyperparameter grid
param_grid_lr = {
    'penalty': ['l1', 'l2', 'elasticnet', 'none'],
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'max_iter': [100, 200, 300, 400, 500],
    'l1_ratio': [None, 0.25, 0.5, 0.75]  # Only used if the penalty is 'elasticnet'
}

# RandomizedSearchCV for hyperparameter tuning
random_search_lr = RandomizedSearchCV(lr_model, param_grid_lr, n_iter=200, cv=5, scoring='accuracy', verbose=1, random_state=1)
random_search_lr.fit(X_train, y_train)

# Best parameters
best_params_lr = random_search_lr.best_params_
print("Best Parameters CV: ", best_params_lr)

# Use best parameters to make model
model_lr = LogisticRegression(**best_params_lr, random_state=1)
model_lr.fit(X_train, y_train)

# Make predictions
y_test_pred_lr = model_lr.predict(X_test)

# Prepare submission
test_predictions_lr = pd.DataFrame({
    'PassengerId': test_passenger_ids,
    'Transported': y_test_pred_lr
})
test_predictions_lr['Transported'] = test_predictions_lr['Transported'].astype(bool)

# Export to CSV
test_predictions_lr.to_csv("C:\\Users\\joshn\\Documents\\Coding\\Spaceship Titanic\\Stacking\\Process\\Submission - Logistic Regression.csv", index=False)


Fitting 5 folds for each of 200 candidates, totalling 1000 fits
Best Parameters CV:  {'solver': 'saga', 'penalty': 'l1', 'max_iter': 300, 'l1_ratio': 0.75, 'C': 0.1}


Retraining with the best parameters in case I need to test the model again

In [3]:
#QUICK FIRE RETRAIN USING BEST PARAMETERS

import pandas as pd
from sklearn.linear_model import LogisticRegression

train = pd.read_csv("C:\\Users\\joshn\\Documents\\Coding\\Spaceship Titanic\\Stacking\\Process\\CombinedTrain.csv")
test = pd.read_csv("C:\\Users\\joshn\\Documents\\Coding\\Spaceship Titanic\\Stacking\\Process\\CombinedTest.csv")

# Separate ID before training
train_passenger_ids = train['PassengerId']
test_passenger_ids = test['PassengerId']

# Prepare data
X_train = train.drop(['Transported', 'PassengerId'], axis=1)
y_train = train['Transported'].astype('int')  # Convert True/False to 1/0
X_test = test.drop('PassengerId', axis=1)

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
import pandas as pd

# Use best parameters to make model
best_params_lr = {'solver': 'saga', 'penalty': 'l1', 'max_iter': 300, 'l1_ratio': 0.75, 'C': 0.1}
model_lr = LogisticRegression(**best_params_lr, random_state=1)
model_lr.fit(X_train, y_train)

# Make predictions
y_test_pred_lr = model_lr.predict(X_test)

# Prepare submission
test_predictions_lr = pd.DataFrame({
    'PassengerId': test_passenger_ids,
    'Transported': y_test_pred_lr
})
test_predictions_lr['Transported'] = test_predictions_lr['Transported'].astype(bool)

# Export to CSV
test_predictions_lr.to_csv("C:\\Users\\joshn\\Documents\\Coding\\Spaceship Titanic\\Stacking\\Test\\Submission - Logistic Regression.csv", index=False)
