# Logistic Regression: Banking Marketing Campaign

## 1. Data loading

In [None]:
# Handle imports upfront
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from scipy import stats
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

### 1.1. Load

In [None]:
# Read the campaign data directly from the course repository; fields are split on ';'
data_url=(
    'https://raw.githubusercontent.com/4GeeksAcademy/'
    'logistic-regression-project-tutorial/main/bank-marketing-campaign-data.csv'
)
data_df=pd.read_csv(filepath_or_buffer=data_url, sep=';')

### 1.2. Inspect

In [None]:
# Show the first rows with one feature per row so that no column gets truncated
data_df.head().T

In [None]:
# Print the dtype and non-null count of every column
data_df.info()

### 1.3. Train-test split

In [None]:
# Hold out a quarter of the rows for testing; the fixed seed keeps the split repeatable
training_df, testing_df=train_test_split(data_df, test_size=0.25, random_state=315)

## 2. EDA

### 2.1. Baseline model performance

In [None]:
# How many yes/no labels do we have in the training split? This shows the
# class balance that the baseline accuracies below depend on
training_df['y'].value_counts()

In [None]:
# A model that always answers 'no' is right exactly as often as 'no' occurs
no_count=training_df['y'].value_counts()['no']
accuracy=(no_count/len(training_df['y']))*100
print(f'Accuracy of constant "no" model: {accuracy:.1f}%')

In [None]:
# Get accuracy of random 50:50 guess. A dedicated, seeded generator makes the
# reported figure repeatable between runs without touching the global random state
labels=['yes', 'no']
rng=random.Random(315)
choices=rng.choices(labels, k=len(training_df['y']))
accuracy=accuracy_score(training_df['y'], choices)
print(f'Accuracy of random guesses: {accuracy*100:.1f}%')

### 2.2. Data composition & cleaning

In [None]:
# String-valued columns (including the label 'y') to draw level counts for
categorical_features=[
    'y',
    'job',
    'education',
    'marital',
    'default',
    'housing',
    'loan',
    'contact',
    'poutcome',
    'day_of_week',
    'month',
]

In [None]:
# One bar chart per categorical feature showing how many customers fall in each level
fig, axs=plt.subplots(4,3, figsize=(12,10))
axs=axs.flatten()

fig.suptitle('Customer feature level counts')

for i, feature in enumerate(categorical_features):

    # Count the customers at each level of this feature
    level_counts=training_df[feature].value_counts()

    axs[i].set_title(feature)
    axs[i].bar(range(len(level_counts)), level_counts, tick_label=level_counts.index, color='black')
    axs[i].tick_params(axis='x', labelrotation=45)
    axs[i].set_ylabel('Customers')

# Hide the grid cells that are left over when there are fewer features than axes
for unused_ax in axs[len(categorical_features):]:
    unused_ax.set_visible(False)

fig.tight_layout()

# plt.show() rather than fig.show(): the latter warns on non-GUI (notebook) backends
plt.show()

In [None]:
# Numeric columns to draw distributions for
numerical_features=[
    'age',
    'duration',
    'campaign',
    'pdays',
    'previous',
    'emp.var.rate',
    'cons.price.idx',
    'cons.conf.idx',
    'euribor3m',
    'nr.employed',
]

In [None]:
# One histogram per numerical feature
fig, axs=plt.subplots(4,3, figsize=(12,10))
axs=axs.flatten()

fig.suptitle('Customer feature distributions')

for i, feature in enumerate(numerical_features):

    axs[i].set_title(feature)
    axs[i].hist(training_df[feature], color='black')
    axs[i].tick_params(axis='x', labelrotation=45)
    axs[i].set_ylabel('Customers')

# Hide the grid cells that are left over when there are fewer features than axes
for unused_ax in axs[len(numerical_features):]:
    unused_ax.set_visible(False)

fig.tight_layout()

# plt.show() rather than fig.show(): the latter warns on non-GUI (notebook) backends
plt.show()

### 2.3. Feature interactions & selection

In [None]:
# Categorical columns to break down by campaign response
categorical_features=[
    'job',
    'marital',
    'default',
    'housing',
    'loan',
    'contact',
    'poutcome',
    'day_of_week',
    'month',
]

In [None]:
# One grouped bar chart per categorical feature: customers per level, split by response
fig, axs=plt.subplots(3,3, figsize=(12,10))
axs=axs.flatten()

fig.suptitle('Customer responses by feature level')

for i, feature in enumerate(categorical_features):

    # Count customers for every (level, response) pair; naming the count column
    # directly avoids renaming the default column 0 afterwards
    groups_df=training_df.groupby([feature, 'y']).size().reset_index(name='Customers')

    axs[i].set_title(f'{feature}')
    sns.barplot(groups_df, x=feature, y='Customers', hue='y', ax=axs[i])
    axs[i].tick_params(axis='x', labelrotation=45)
    axs[i].set_ylabel('Customers')

fig.tight_layout()

# plt.show() rather than fig.show(): the latter warns on non-GUI (notebook) backends
plt.show()

In [None]:
# Numeric columns to compare between the two response classes
numerical_features=[
    'age',
    'duration',
    'campaign',
    'pdays',
    'previous',
    'emp.var.rate',
    'cons.price.idx',
    'cons.conf.idx',
    'euribor3m',
    'nr.employed',
]

In [None]:
# One box plot per numerical feature, comparing its distribution between responses
fig, axs=plt.subplots(4,3, figsize=(12,10))
axs=axs.flatten()

fig.suptitle('Customer feature distributions by response')

for i, feature in enumerate(numerical_features):

    # Zeros cannot be placed on a log axis, so they are left out of this feature's plot
    plot_df=training_df[training_df[feature] != 0]

    # Heavy-tailed features (excess kurtosis above 20) are drawn on a log scale
    log_scale=bool(stats.kurtosis(plot_df[feature].dropna()) > 20)

    # Plot the filtered frame: it is the data the log-scale decision was made on
    sns.boxplot(plot_df, x='y', y=feature, log_scale=log_scale, ax=axs[i])
    axs[i].tick_params(axis='x', labelrotation=45)
    axs[i].set_xlabel('')
    axs[i].set_ylabel(feature)

# Hide the grid cells that are left over when there are fewer features than axes
for unused_ax in axs[len(numerical_features):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# Drop the de-selected features from both splits so that their columns stay identical
feature_drops=['pdays']

for split_df in (training_df, testing_df):
    split_df.drop(columns=feature_drops, inplace=True)

### 2.4. Feature encoding & scaling

In [None]:
# Categorical columns that get one-hot encoded
categorical_features=['job','marital','default','housing','loan','contact','poutcome']

# Learn the levels from the training split only, then apply them to both splits;
# drop='first' leaves out one column per feature
encoder=OneHotEncoder(drop='first', sparse_output=False)
encoded_training_features=encoder.fit_transform(training_df[categorical_features])
encoded_testing_features=encoder.transform(testing_df[categorical_features])

# Wrap the arrays in dataframes that carry the generated column names
encoded_feature_names=encoder.get_feature_names_out()
encoded_training_features_df=pd.DataFrame(encoded_training_features, columns=encoded_feature_names)
encoded_testing_features_df=pd.DataFrame(encoded_testing_features, columns=encoded_feature_names)

encoded_training_features_df.head().T

In [None]:
# Numeric columns that get standardized
numerical_features=['age','duration','campaign','previous','emp.var.rate','cons.price.idx','cons.conf.idx','euribor3m','nr.employed']

# Fit the scaler on the training split only and reuse it for the testing split
standard_scaler=StandardScaler()
scaled_training_features=standard_scaler.fit_transform(training_df[numerical_features])
scaled_testing_features=standard_scaler.transform(testing_df[numerical_features])

# Put the column names back on the scaled arrays
scaled_training_features_df=pd.DataFrame(scaled_training_features, columns=numerical_features)
scaled_testing_features_df=pd.DataFrame(scaled_testing_features, columns=numerical_features)

scaled_training_features_df.describe()

In [None]:
def encode_time_features(data_df: pd.DataFrame) -> pd.DataFrame:
    '''Encodes the day_of_week and month features as cyclical sin/cos pairs.

    Args:
        data_df: dataframe with a 'day_of_week' column holding the labels
            'mon'..'fri' and a 'month' column holding the labels 'jan'..'dec'.
            The input dataframe is not modified.

    Returns:
        A copy of data_df with the columns 'day_sin', 'day_cos', 'month_sin'
        and 'month_cos' added, the original 'day_of_week' and 'month' columns
        removed and the index reset to 0..n-1.

    Raises:
        ValueError: if either column holds a label that has no numeric position.
    '''

    # Numeric position of each label within its cycle (named so that the
    # builtin dict is not shadowed)
    day_numbers={'mon': 1, 'tue': 2, 'wed': 3, 'thu': 4, 'fri': 5}
    month_numbers={
        'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6,
        'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12
    }

    time_df=data_df.copy()

    # First convert the features to numeric
    for column, lookup in (('day_of_week', day_numbers), ('month', month_numbers)):
        numbers=time_df[column].map(lookup)

        # Missing values pass through as NaN, but an unknown label is an error
        unknown=time_df.loc[numbers.isna() & time_df[column].notna(), column].unique()

        if len(unknown) > 0:
            raise ValueError(f'Unrecognized {column} value(s): {list(unknown)}')

        time_df[column]=numbers.astype(float)

    # Now encode the day and month with sin/cos components; the day uses a
    # period of 7 although only mon-fri have a position
    day_angle=2 * np.pi * time_df['day_of_week']/7.0
    time_df['day_sin']=np.sin(day_angle)
    time_df['day_cos']=np.cos(day_angle)

    month_angle=2 * np.pi * time_df['month']/12.0
    time_df['month_sin']=np.sin(month_angle)
    time_df['month_cos']=np.cos(month_angle)

    # Drop the original string features
    time_df.drop(['month', 'day_of_week'], axis=1, inplace=True)

    return time_df.reset_index(drop=True)

# Apply the cyclical encoding to the day and month columns of each split
time_columns=['day_of_week', 'month']
training_time_features=encode_time_features(training_df[time_columns])
testing_time_features=encode_time_features(testing_df[time_columns])

training_time_features.info()

In [None]:
# Finally, turn the labels into integers; the encoder learns the classes from
# the training split and the same mapping is reused for the testing split
label_encoder=LabelEncoder()
training_labels=label_encoder.fit_transform(training_df['y'])
testing_labels=label_encoder.transform(testing_df['y'])

In [None]:
# Join the encoded, scaled and time feature blocks side by side for each split
training_parts=[encoded_training_features_df, scaled_training_features_df, training_time_features]
testing_parts=[encoded_testing_features_df, scaled_testing_features_df, testing_time_features]

training_features=pd.concat(training_parts, axis='columns')
testing_features=pd.concat(testing_parts, axis='columns')

## 3. Model training

In [None]:
# Fit a logistic regression with default settings as the reference model
model=LogisticRegression().fit(training_features, training_labels)

# Score it on the held-out split
test_predictions=model.predict(testing_features)
test_accuracy=100*accuracy_score(testing_labels, test_predictions)
print(f'Test set accuracy: {test_accuracy:.2f}%')


## 4. Model optimization

### 4.1. Hyperparameter optimization

In [None]:
# The solver has to support the requested penalty, so the search space is a list
# of grids that each pair a solver with penalties it can fit. With the default
# lbfgs solver the 'l1' and 'elasticnet' candidates fail to fit, and 'elasticnet'
# additionally needs an l1_ratio.
regularization_strengths=[0.125, 0.25, 0.5, 1, 2, 4, 8]

shared_settings={
    'fit_intercept': [True, False],
    'max_iter': [50, 100, 200]
}

hyperparameters=[
    {'solver': ['lbfgs'], 'penalty': ['l2'], 'C': regularization_strengths, **shared_settings},

    # C has no effect without a penalty, so it is not searched here
    {'solver': ['lbfgs'], 'penalty': [None], **shared_settings},

    {'solver': ['saga'], 'penalty': ['l1'], 'C': regularization_strengths, **shared_settings},

    # A single even l1/l2 mix keeps the number of slower saga fits manageable
    {'solver': ['saga'], 'penalty': ['elasticnet'], 'l1_ratio': [0.5], 'C': regularization_strengths, **shared_settings},
]

# 5-fold cross-validated search on the training split, scored by accuracy
model=LogisticRegression()
grid=GridSearchCV(model, hyperparameters, scoring='accuracy', cv=5)
grid.fit(training_features, training_labels)
winning_parameters=grid.best_params_
print(f'Best hyperparameters: {winning_parameters}')

### 4.2. Final model evaluation

In [None]:
# Refit on the full training split using the settings chosen by the grid search
model=LogisticRegression(**winning_parameters).fit(training_features, training_labels)

# Score the tuned model on the held-out split
test_predictions=model.predict(testing_features)
test_accuracy=100*accuracy_score(testing_labels, test_predictions)
print(f'Test set accuracy: {test_accuracy:.2f}%')