# Logistic Regression: Banking Marketing Campaign

## 1. Data loading

In [1]:
# Handle imports upfront
import pandas as pd

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.linear_model import LogisticRegression

### 1.1. Load

In [2]:
data_url='https://raw.githubusercontent.com/4GeeksAcademy/logistic-regression-project-tutorial/main/bank-marketing-campaign-data.csv'
data_df=pd.read_csv(data_url, sep=';')

### 1.2. Inspect

In [3]:
# Your code here....
# Your code here....
print(data_df.head())

print(data_df.describe())

print(data_df.info())

   age        job  marital    education  default housing loan    contact  \
0   56  housemaid  married     basic.4y       no      no   no  telephone   
1   57   services  married  high.school  unknown      no   no  telephone   
2   37   services  married  high.school       no     yes   no  telephone   
3   40     admin.  married     basic.6y       no      no   no  telephone   
4   56   services  married  high.school       no      no  yes  telephone   

  month day_of_week  ...  campaign  pdays  previous     poutcome emp.var.rate  \
0   may         mon  ...         1    999         0  nonexistent          1.1   
1   may         mon  ...         1    999         0  nonexistent          1.1   
2   may         mon  ...         1    999         0  nonexistent          1.1   
3   may         mon  ...         1    999         0  nonexistent          1.1   
4   may         mon  ...         1    999         0  nonexistent          1.1   

   cons.price.idx  cons.conf.idx  euribor3m  nr.employed

### 1.3. Train-test split

In [4]:
# First separate the features from the labels
labels=data_df['y']
features=data_df.drop('y', axis=1)

# Do the test-train split
training_features, testing_features, training_labels, testing_labels=train_test_split(
    features,
    labels,
    test_size=0.25, 
    random_state=315
)

### 1.4. Encoding

In [5]:
# Names of columns we want to encode
encoded_columns=['job','marital','default','housing','loan','contact']

# Do the encoding
training_features=pd.get_dummies(training_features, columns=encoded_columns, dtype=int, drop_first=True)
testing_features=pd.get_dummies(testing_features, columns=encoded_columns, dtype=int, drop_first=True)

training_features.info()

<class 'pandas.core.frame.DataFrame'>
Index: 30891 entries, 33905 to 29283
Data columns (total 35 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   age                30891 non-null  int64  
 1   education          30891 non-null  object 
 2   month              30891 non-null  object 
 3   day_of_week        30891 non-null  object 
 4   duration           30891 non-null  int64  
 5   campaign           30891 non-null  int64  
 6   pdays              30891 non-null  int64  
 7   previous           30891 non-null  int64  
 8   poutcome           30891 non-null  object 
 9   emp.var.rate       30891 non-null  float64
 10  cons.price.idx     30891 non-null  float64
 11  cons.conf.idx      30891 non-null  float64
 12  euribor3m          30891 non-null  float64
 13  nr.employed        30891 non-null  float64
 14  job_blue-collar    30891 non-null  int64  
 15  job_entrepreneur   30891 non-null  int64  
 16  job_housemaid      3089

In [6]:
print(training_features.dtypes)

age                    int64
education             object
month                 object
day_of_week           object
duration               int64
campaign               int64
pdays                  int64
previous               int64
poutcome              object
emp.var.rate         float64
cons.price.idx       float64
cons.conf.idx        float64
euribor3m            float64
nr.employed          float64
job_blue-collar        int64
job_entrepreneur       int64
job_housemaid          int64
job_management         int64
job_retired            int64
job_self-employed      int64
job_services           int64
job_student            int64
job_technician         int64
job_unemployed         int64
job_unknown            int64
marital_married        int64
marital_single         int64
marital_unknown        int64
default_unknown        int64
default_yes            int64
housing_unknown        int64
housing_yes            int64
loan_unknown           int64
loan_yes               int64
contact_teleph

In [7]:
training_features.head()

Unnamed: 0,age,education,month,day_of_week,duration,campaign,pdays,previous,poutcome,emp.var.rate,...,marital_married,marital_single,marital_unknown,default_unknown,default_yes,housing_unknown,housing_yes,loan_unknown,loan_yes,contact_telephone
33905,31,high.school,may,wed,14,1,999,0,nonexistent,-1.8,...,1,0,0,0,0,0,1,0,0,0
35981,28,university.degree,may,mon,315,1,999,0,nonexistent,-1.8,...,0,1,0,0,0,0,0,0,1,0
36492,58,basic.4y,jun,wed,86,1,999,0,nonexistent,-2.9,...,1,0,0,0,0,0,1,0,0,0
26747,49,high.school,nov,thu,73,1,999,0,nonexistent,-0.1,...,0,0,1,0,0,0,1,0,0,0
27508,35,university.degree,nov,fri,360,2,999,0,nonexistent,-0.1,...,0,1,0,0,0,0,0,0,0,0


In [8]:
# Names of columns to drop
column_drops=['poutcome', 'pdays', 'previous']

# Do the drops
training_features.drop(column_drops, axis=1, inplace=True)

# Do the same thing to the testing data
testing_features.drop(column_drops, axis=1, inplace=True)

# Take a look
training_features.info()

<class 'pandas.core.frame.DataFrame'>
Index: 30891 entries, 33905 to 29283
Data columns (total 32 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   age                30891 non-null  int64  
 1   education          30891 non-null  object 
 2   month              30891 non-null  object 
 3   day_of_week        30891 non-null  object 
 4   duration           30891 non-null  int64  
 5   campaign           30891 non-null  int64  
 6   emp.var.rate       30891 non-null  float64
 7   cons.price.idx     30891 non-null  float64
 8   cons.conf.idx      30891 non-null  float64
 9   euribor3m          30891 non-null  float64
 10  nr.employed        30891 non-null  float64
 11  job_blue-collar    30891 non-null  int64  
 12  job_entrepreneur   30891 non-null  int64  
 13  job_housemaid      30891 non-null  int64  
 14  job_management     30891 non-null  int64  
 15  job_retired        30891 non-null  int64  
 16  job_self-employed  3089

## 2. EDA

### 2.1. Baseline model performance

In [9]:
# Define a reusable helper function for cross-validation here. We are going to
# be doing a lot of cross-validation, this allows us to reuse this code
# without having to copy-paste it over and over.

def cross_val(model, features: pd.DataFrame, labels: pd.Series) -> list[float]:
    '''Reusable helper function to run cross-validation on a model. Takes model,
    Pandas data frame of features and Pandas data series of labels. Returns 
    list of cross-validation fold accuracy scores as percents.'''

    # Define the cross-validation strategy
    cross_validation=StratifiedKFold(n_splits=7, shuffle=True, random_state=315)

    # Run the cross-validation, collecting the scores
    scores=cross_val_score(
        model,
        features,
        labels,
        cv=cross_validation,
        n_jobs=-1,
        scoring='accuracy'
    )

    # Print mean and standard deviation of the scores
    print(f'Cross-validation accuracy: {(scores.mean() * 100):.2f} +/- {(scores.std() * 100):.2f}%')

    # Return the scores
    return scores

In [12]:
import numpy as np

# Don't worry about downcasting FutureWarning
pd.set_option('future.no_silent_downcasting', True)

# Define a helper function here so we can encode the time
# features the same way on the training and testing data
# without copy-pasting the same code
def encode_time_features(data_df: pd.DataFrame) -> pd.DataFrame:
    '''Takes a Pandas dataframe and uses cyclical sin/cos to encode
    month and day features. Returns updated dataframe.'''

    # First convert the features to numeric
    dict={'mon' : 1, 'tue' : 2, 'wed': 3, 'thu' : 4, 'fri': 5}
    data_df=data_df.replace(dict)

    dict={'jan' : 1, 'feb' : 2, 'mar': 3, 'apr' : 4, 'may': 5, 'jun': 6, 'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12}
    data_df=data_df.replace(dict)

    # And fix the dtypes
    data_df['day_of_week']=data_df['day_of_week'].astype(int)
    data_df['month']=data_df['month'].astype(int)

    # Now encode the day and month with sin/cos components
    data_df['day_sin'] = np.sin(2 * np.pi * data_df['day_of_week']/7.0)
    data_df['day_cos'] = np.cos(2 * np.pi * data_df['day_of_week']/7.0)

    data_df['month_sin'] = np.sin(2 * np.pi * data_df['month']/12.0)
    data_df['month_cos'] = np.cos(2 * np.pi * data_df['month']/12.0)

    # Drop the original string features
    data_df.drop(['month', 'day_of_week'], axis=1, inplace=True)

    return data_df

training_features=encode_time_features(training_features)
testing_features=encode_time_features(testing_features)

KeyError: 'day_of_week'

### 2.2. Missing and/or extreme values

In [14]:
# Your code here...
data_df.drop_duplicates(inplace=True)
data_df.shape

(41176, 21)

### 2.3. Feature selection

In [15]:
# Your code here...
columns = ['job', 'marital','education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome', 'y']

for column in columns:
    print (f'Value count for column - {column}')
    print(data_df[column].value_counts())
    print('\n\n')

Value count for column - job
job
admin.           10419
blue-collar       9253
technician        6739
services          3967
management        2924
retired           1718
entrepreneur      1456
self-employed     1421
housemaid         1060
unemployed        1014
student            875
unknown            330
Name: count, dtype: int64



Value count for column - marital
marital
married     24921
single      11564
divorced     4611
unknown        80
Name: count, dtype: int64



Value count for column - education
education
university.degree      12164
high.school             9512
basic.9y                6045
professional.course     5240
basic.4y                4176
basic.6y                2291
unknown                 1730
illiterate                18
Name: count, dtype: int64



Value count for column - default
default
no         32577
unknown     8596
yes            3
Name: count, dtype: int64



Value count for column - housing
housing
yes        21571
no         18615
unknown      990
N

In [16]:
from sklearn.preprocessing import MinMaxScaler

# Initialize the scaler
scaler = MinMaxScaler()

# Fit the scaler to the selected columns
scaler.fit(data_df[['campaign','previous','emp.var.rate','cons.price.idx','cons.conf.idx','euribor3m','nr.employed']])

# Transform the selected columns and assign the result back to those columns in the DataFrame
data_df[['campaign','previous','emp.var.rate','cons.price.idx','cons.conf.idx','euribor3m','nr.employed']] = scaler.transform(data_df[['campaign','previous','emp.var.rate','cons.price.idx','cons.conf.idx','euribor3m','nr.employed']])


In [17]:
data_df = data_df.replace('unknown', np.nan)
data_df

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,0.000000,999,0.000000,nonexistent,0.937500,0.698753,0.60251,0.957379,0.859735,no
1,57,services,married,high.school,,no,no,telephone,may,mon,...,0.000000,999,0.000000,nonexistent,0.937500,0.698753,0.60251,0.957379,0.859735,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,0.000000,999,0.000000,nonexistent,0.937500,0.698753,0.60251,0.957379,0.859735,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,0.000000,999,0.000000,nonexistent,0.937500,0.698753,0.60251,0.957379,0.859735,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,0.000000,999,0.000000,nonexistent,0.937500,0.698753,0.60251,0.957379,0.859735,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,73,retired,married,professional.course,no,yes,no,cellular,nov,fri,...,0.000000,999,0.000000,nonexistent,0.479167,1.000000,0.00000,0.089322,0.000000,yes
41184,46,blue-collar,married,professional.course,no,no,no,cellular,nov,fri,...,0.000000,999,0.000000,nonexistent,0.479167,1.000000,0.00000,0.089322,0.000000,no
41185,56,retired,married,university.degree,no,yes,no,cellular,nov,fri,...,0.018182,999,0.000000,nonexistent,0.479167,1.000000,0.00000,0.089322,0.000000,no
41186,44,technician,married,professional.course,no,no,no,cellular,nov,fri,...,0.000000,999,0.000000,nonexistent,0.479167,1.000000,0.00000,0.089322,0.000000,yes


In [18]:
for var in data_df.columns[data_df.dtypes == 'object']:
    data_df[var] = data_df[var].fillna(data_df[var].mode()[0])

In [19]:
for var in data_df.columns[data_df.dtypes == 'int64']:
    data_df[var] = data_df[var].fillna(data_df[var].mean())

In [20]:
data_df = pd.get_dummies(data_df, columns=['y','job','marital','education','default','housing','loan','contact','month','day_of_week','poutcome'], drop_first=True)
data_df.describe()

Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed
count,41176.0,41176.0,41176.0,41176.0,41176.0,41176.0,41176.0,41176.0,41176.0,41176.0
mean,40.0238,258.315815,0.028507,962.46481,0.024716,0.7254,0.535744,0.430843,0.677237,0.76913
std,10.42068,259.305321,0.050369,186.937102,0.070709,0.327267,0.22558,0.193634,0.393207,0.273162
min,17.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,32.0,102.0,0.0,999.0,0.0,0.333333,0.340608,0.338912,0.160961,0.512287
50%,38.0,180.0,0.018182,999.0,0.0,0.9375,0.603274,0.376569,0.957379,0.859735
75%,47.0,319.0,0.036364,999.0,0.0,1.0,0.698753,0.60251,0.980957,1.0
max,98.0,4918.0,1.0,999.0,1.0,1.0,1.0,1.0,1.0,1.0


In [22]:
# Dictionary to translate employment string to numerical value
dict={'unknown': 0, 'illiterate': 1, 'basic.4y': 2, 'basic.6y': 3, 'basic.9y': 4, 'high.school': 5, 'professional.course': 6, 'university.degree': 7}

# Encode the training and testing data
training_features=training_features.replace(dict)
testing_features=testing_features.replace(dict)

# Fix the dtypes
training_features['education']=training_features['education'].astype(int)
testing_features['education']=testing_features['education'].astype(int)

### 2.4. Feature scaling

In [23]:
from sklearn.preprocessing import StandardScaler

# Scale the features
standard_scaler=StandardScaler().fit(training_features)
training_features=standard_scaler.transform(training_features)
testing_features=standard_scaler.transform(testing_features)

print(f'Training features are: {type(training_features)}')
print(f'Training features shape: {training_features.shape}')

ValueError: The feature names should match those that were passed during fit.
Feature names seen at fit time, yet now missing:
- default_yes


### 2.5. Label encoding

In [None]:
# Last, encode the labels
label_encoder=LabelEncoder().fit(training_labels)
training_labels=label_encoder.transform(training_labels)
testing_labels=label_encoder.transform(testing_labels)

print(f'Training labels: {training_labels}')

## 3. Model training

In [None]:
# Your code here...

## 4. Model optimization

### 4.1. Model tuning

In [None]:
# Your code here...

### 4.2. Cross-validation of optimized model

In [None]:
# Your code here...

### 4.3. Final model evaluation

In [None]:
# Your code here...