# Logistic Regression: Banking Marketing Campaign

This notebook analyzes a banking marketing campaign dataset to predict whether customers will subscribe to a term deposit. We'll build and optimize a logistic regression model to classify customer responses.

In [None]:
# Handle imports upfront
import pickle
import random
import warnings
from pathlib import Path

import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.exceptions import ConvergenceWarning

# Ignore scikit-learn's ConvergenceWarning for the rest of the notebook
# NOTE(review): presumably needed because the grid search below tries small
# max_iter values that may stop before the solver converges - confirm
warnings.filterwarnings("ignore", category=ConvergenceWarning)

## 1. Data acquisition

We'll start by loading the dataset from the provided URL and saving a local copy for future use.

### 1.1. Load the dataset

In [None]:
# Read the campaign data directly from its remote location
# The file is semicolon-delimited, so the separator is passed explicitly
data_url = (
    'https://raw.githubusercontent.com/4GeeksAcademy/'
    'logistic-regression-project-tutorial/main/'
    'bank-marketing-campaign-data.csv'
)
data_df = pd.read_csv(filepath_or_buffer=data_url, sep=';')

### 1.2. Save local copy

In [None]:
# Create the raw data directory (and any missing parents) if it is not there yet
raw_data_dir = Path('../data/raw')
raw_data_dir.mkdir(parents=True, exist_ok=True)

# Write the untouched dataset to disk in parquet format
data_df.to_parquet(path='../data/raw/bank-marketing-campaign-data.parquet')

### 1.3. Inspect

In [None]:
# Show the first rows transposed, so each feature is a row and each record a column
data_df.head().T

In [None]:
# Print a summary of the dataframe: column names, non-null counts and datatypes
data_df.info()

## 2. Data preparation

Before training our model, we need to prepare the data by splitting it into training and testing sets, and encoding categorical variables for use with scikit-learn.

### 2.1. Train-test split

In [None]:
# Split the data into training (75%) and testing (25%) sets
# This ensures we have unseen data to evaluate our final model
# - random_state fixes the shuffle so the split is reproducible between runs
# - stratify keeps the yes/no proportions of the label the same in both sets
training_df, testing_df = train_test_split(
    data_df,
    test_size=0.25,
    random_state=315,
    stratify=data_df['y'],
)

### 2.2. Feature encoding

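Scikit-learn estimators require numerical input, so we need to convert the categorical variables (strings) to numbers. We use ordinal encoding, which replaces each category with an integer code. Note that for nominal features such as `job` the resulting order is arbitrary, so the codes should not be read as magnitudes.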

In [None]:
# Categorical features with 'object' datatypes (string) which need encoding
categorical_features = ['y','job','education','marital','default','housing','loan','contact','poutcome','day_of_week','month']

# Instantiate an encoder
# Categories that only show up in the testing set are mapped to -1 instead of raising
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

# Encode the categorical features in the training and testing datasets
# Work on copies so the column assignments below write to independent dataframes
training_df = training_df.copy()
testing_df = testing_df.copy()

# Fit on the training set only, then apply the same mapping to the testing set
training_df[categorical_features] = encoder.fit_transform(training_df[categorical_features])
testing_df[categorical_features] = encoder.transform(testing_df[categorical_features])

# Inspect the result - there should be only float or int datatypes left
training_df.info()


## 3. Model training

We'll establish baseline performance using simple models, then build and optimize a logistic regression classifier.

In [None]:
# Start with no entries; each model below adds its accuracy under its own name
results = dict()

### 3.1. Random model performance

A random classifier serves as our weakest baseline - any useful model should significantly outperform random guessing.

In [None]:
# Generate random predictions for the testing set
# This serves as a baseline to compare our model performance against
# A dedicated, seeded generator keeps the baseline reproducible between runs
rng = random.Random(315)
random_predictions = [rng.choice([0, 1]) for _ in range(len(testing_df))]

# Calculate accuracy of random model
# accuracy_score returns a fraction; scale to percent to match the printed '%'
accuracy = accuracy_score(testing_df['y'], random_predictions) * 100

# Store the accuracy in the results dictionary
results['Random'] = accuracy

print(f'Accuracy of random model: {accuracy:.2f}%')

### 3.2. Constant 'no' model performance

Since this is a classification problem with imbalanced classes, we should check how well a model that always predicts the majority class would perform.

In [None]:
# Calculate accuracy if we always predict 'no' (the majority class)
# NOTE(review): assumes 'no' is encoded as 0 in the label column, which is what
# an ordinal encoder produces when it numbers the sorted categories - confirm
constant_predictions = [0] * len(testing_df)

# accuracy_score returns a fraction; scale to percent to match the printed '%'
accuracy = accuracy_score(testing_df['y'], constant_predictions) * 100

# Store the accuracy in the results dictionary
results['Constant No'] = accuracy

print(f'Accuracy of constant "no" model: {accuracy:.2f}%')

### 3.3. Logistic regression model performance

Now we'll train a basic logistic regression model with default parameters to see how much improvement we get over the baseline models.

In [None]:
# Train a basic logistic regression model with default parameters
# Every column except the label 'y' is used as a feature
basic_model = LogisticRegression()
basic_model.fit(training_df.drop(columns='y'), training_df['y'])

# Make predictions on the testing set
basic_predictions = basic_model.predict(testing_df.drop(columns='y'))

# Calculate accuracy of the test set predictions
# accuracy_score returns a fraction; scale to percent to match the printed '%'
accuracy = accuracy_score(testing_df['y'], basic_predictions) * 100

# Store the accuracy in the results dictionary
results['Regression'] = accuracy

print(f"Testing accuracy: {accuracy:.2f}%")

### 3.4. Optimized logistic regression model performance

To get the best performance, we'll use grid search with cross-validation to find the optimal hyperparameters for our logistic regression model.

In [None]:
# Define hyperparameters to search over
# These parameters can significantly affect model performance
hyperparameters = {
    'solver': ['newton-cg', 'lbfgs', 'liblinear'],  # Different optimization algorithms
    'fit_intercept': [True, False],                 # Whether to include intercept term
    'max_iter': [50, 100, 200, 400, 800]            # Maximum iterations for convergence
}

# Use grid search with cross-validation to find best hyperparameters
# This systematically tests all combinations to find the optimal settings
# Only the training set is used, so the testing set stays unseen
grid = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=hyperparameters,
    scoring='accuracy',
    cv=5,
)
grid.fit(training_df.drop(columns='y'), training_df['y'])

# Save the best model and parameter combination
winning_parameters = grid.best_params_
winning_model = grid.best_estimator_

print(f'Best hyperparameters: {winning_parameters}')

In [None]:
# Score the best model on the testing set
optimized_predictions = winning_model.predict(testing_df.drop(columns='y'))

# accuracy_score returns a fraction; scale to percent to match the printed '%'
accuracy = accuracy_score(testing_df['y'], optimized_predictions) * 100

# Store the accuracy in the results dictionary
results['Optimized Regression'] = accuracy

print(f'Testing accuracy of optimized model: {accuracy:.2f}%')

### 3.5. Results

In [None]:
# Create a bar plot to compare model performance
# One bar per entry in the results dictionary, in insertion order
plt.figure(figsize=(8, 4))
plt.bar(list(results.keys()), list(results.values()))
plt.title('Model accuracy comparison')
plt.xlabel('Model')
plt.ylabel('Accuracy (%)')

# Accuracies are stored in percent, so pin the axis to the full 0-100 range
plt.ylim(0, 100)
plt.tight_layout()
plt.show()

## 4. Final model

With the best hyperparameters identified, we'll train our final model and evaluate its performance on the test set. The confusion matrix will help us understand how well the model performs for each class.

### 4.1. Model re-training

In [None]:
# Retrain model with winning hyperparameters on complete training set
# Rebinding winning_model means the model evaluated and saved below is this one
winning_model = LogisticRegression(**winning_parameters)
winning_model.fit(training_df.drop(columns='y'), training_df['y'])

# Calculate test set accuracy
# The predictions are kept because the confusion matrix below is built from them
predictions = winning_model.predict(testing_df.drop(columns='y'))

# accuracy_score returns a fraction; scale to percent to match the printed '%'
accuracy = accuracy_score(testing_df['y'], predictions) * 100

print(f'Final model test set accuracy: {accuracy:.2f}%')

### 4.2. Model evaluation

In [None]:
# Compute the confusion matrix for the test set, normalized over the true labels
cm = confusion_matrix(
    y_true=testing_df['y'],
    y_pred=predictions,
    normalize='true',
)

# Draw the matrix with readable class names and a blue colormap
disp = ConfusionMatrixDisplay(cm, display_labels=['No', 'Yes'])
disp.plot(cmap='Blues')
plt.title('Confusion Matrix - Test Set (Normalized)')
plt.show()

### 4.3. Save assets

#### 4.3.1. Data

In [None]:
# Create the processed data directory (and any missing parents) if needed
processed_dir = Path('../data/processed')
processed_dir.mkdir(parents=True, exist_ok=True)

# Bundle the training and testing datasets under descriptive keys
datasets = dict(training=training_df, testing=testing_df)

# Pickle the bundle to disk for future use
with open('../data/processed/datasets.pkl', mode='wb') as datasets_file:
    pickle.dump(datasets, datasets_file)

#### 4.3.2. Models

In [None]:
# Create the directory that will hold the model file if it is missing
models_dir = Path('../models/model.pkl').parent
models_dir.mkdir(parents=True, exist_ok=True)

# Pickle the final model to disk
with open('../models/model.pkl', mode='wb') as output_file:
    pickle.dump(winning_model, output_file)