# Logistic Regression: Banking Marketing Campaign

## 1. Data loading

In [46]:
# Handle imports upfront
import pandas as pd

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.linear_model import LogisticRegression

### 1.1. Load

In [47]:
data_url='https://raw.githubusercontent.com/4GeeksAcademy/logistic-regression-project-tutorial/main/bank-marketing-campaign-data.csv'
data_df=pd.read_csv(data_url, sep=';')

### 1.2. Inspect

In [48]:
# Your code here....
# Your code here....
print(data_df.head())

print(data_df.describe())

print(data_df.info())

   age        job  marital    education  default housing loan    contact  \
0   56  housemaid  married     basic.4y       no      no   no  telephone   
1   57   services  married  high.school  unknown      no   no  telephone   
2   37   services  married  high.school       no     yes   no  telephone   
3   40     admin.  married     basic.6y       no      no   no  telephone   
4   56   services  married  high.school       no      no  yes  telephone   

  month day_of_week  ...  campaign  pdays  previous     poutcome emp.var.rate  \
0   may         mon  ...         1    999         0  nonexistent          1.1   
1   may         mon  ...         1    999         0  nonexistent          1.1   
2   may         mon  ...         1    999         0  nonexistent          1.1   
3   may         mon  ...         1    999         0  nonexistent          1.1   
4   may         mon  ...         1    999         0  nonexistent          1.1   

   cons.price.idx  cons.conf.idx  euribor3m  nr.employed

### 1.3. Train-test split

In [49]:
# First separate the features from the labels
labels=data_df['y']
features=data_df.drop('y', axis=1)

# Do the test-train split
training_features, testing_features, training_labels, testing_labels=train_test_split(
    features,
    labels,
    test_size=0.25, 
    random_state=315
)

### 1.4. Encoding

In [50]:
# Names of columns we want to encode
encoded_columns=['job']

# Do the encoding
training_features=pd.get_dummies(training_features, columns=encoded_columns, dtype=int, drop_first=True)
testing_features=pd.get_dummies(testing_features, columns=encoded_columns, dtype=int, drop_first=True)

training_features.info()

<class 'pandas.core.frame.DataFrame'>
Index: 30891 entries, 33905 to 29283
Data columns (total 30 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   age                30891 non-null  int64  
 1   marital            30891 non-null  object 
 2   education          30891 non-null  object 
 3   default            30891 non-null  object 
 4   housing            30891 non-null  object 
 5   loan               30891 non-null  object 
 6   contact            30891 non-null  object 
 7   month              30891 non-null  object 
 8   day_of_week        30891 non-null  object 
 9   duration           30891 non-null  int64  
 10  campaign           30891 non-null  int64  
 11  pdays              30891 non-null  int64  
 12  previous           30891 non-null  int64  
 13  poutcome           30891 non-null  object 
 14  emp.var.rate       30891 non-null  float64
 15  cons.price.idx     30891 non-null  float64
 16  cons.conf.idx      3089

In [51]:
print(training_features.dtypes)

age                    int64
marital               object
education             object
default               object
housing               object
loan                  object
contact               object
month                 object
day_of_week           object
duration               int64
campaign               int64
pdays                  int64
previous               int64
poutcome              object
emp.var.rate         float64
cons.price.idx       float64
cons.conf.idx        float64
euribor3m            float64
nr.employed          float64
job_blue-collar        int64
job_entrepreneur       int64
job_housemaid          int64
job_management         int64
job_retired            int64
job_self-employed      int64
job_services           int64
job_student            int64
job_technician         int64
job_unemployed         int64
job_unknown            int64
dtype: object


In [52]:
from sklearn.preprocessing import LabelEncoder

def feature_composition(df: pd.DataFrame, features: list) -> None:
    '''Takes a dataframe and a list of features. Prints out
    the unique levels of that feature with their count and 
    percent.'''

    for i, column_name in enumerate(features):
        value_counts=df[column_name].value_counts().T.to_dict()

        print(f'\nFeature: {column_name}')

        for key, value in value_counts.items():
            percent_value=(value/len(data_df)) * 100
            print(f' {key}: {value} ({percent_value:.1f}%)')

label_encoder = LabelEncoder()

training_features['marital'] = label_encoder.fit_transform(training_features['marital'])

training_features['education'] = label_encoder.fit_transform(training_features['education'])

training_features['contact'] = label_encoder.fit_transform(training_features['contact'])

feature_composition(training_features, ['default','housing','loan'])


Feature: default
 no: 24400 (59.2%)
 unknown: 6488 (15.8%)
 yes: 3 (0.0%)

Feature: housing
 yes: 16072 (39.0%)
 no: 14075 (34.2%)
 unknown: 744 (1.8%)

Feature: loan
 no: 25468 (61.8%)
 yes: 4679 (11.4%)
 unknown: 744 (1.8%)


In [53]:
# Names of columns to drop
column_drops=['poutcome', 'pdays', 'previous', 'default', 'housing', 'loan']

# Do the drops
training_features.drop(column_drops, axis=1, inplace=True)

# Do the same thing to the testing data
testing_features.drop(column_drops, axis=1, inplace=True)

# Take a look
training_features.info()

<class 'pandas.core.frame.DataFrame'>
Index: 30891 entries, 33905 to 29283
Data columns (total 24 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   age                30891 non-null  int64  
 1   marital            30891 non-null  int64  
 2   education          30891 non-null  int64  
 3   contact            30891 non-null  int64  
 4   month              30891 non-null  object 
 5   day_of_week        30891 non-null  object 
 6   duration           30891 non-null  int64  
 7   campaign           30891 non-null  int64  
 8   emp.var.rate       30891 non-null  float64
 9   cons.price.idx     30891 non-null  float64
 10  cons.conf.idx      30891 non-null  float64
 11  euribor3m          30891 non-null  float64
 12  nr.employed        30891 non-null  float64
 13  job_blue-collar    30891 non-null  int64  
 14  job_entrepreneur   30891 non-null  int64  
 15  job_housemaid      30891 non-null  int64  
 16  job_management     3089

## 2. EDA

### 2.1. Baseline model performance

In [54]:
# Define a reusable helper function for cross-validation here. We are going to
# be doing a lot of cross-validation, this allows us to reuse this code
# without having to copy-paste it over and over.

def cross_val(model, features: pd.DataFrame, labels: pd.Series) -> list[float]:
    '''Reusable helper function to run cross-validation on a model. Takes model,
    Pandas data frame of features and Pandas data series of labels. Returns 
    list of cross-validation fold accuracy scores as percents.'''

    # Define the cross-validation strategy
    cross_validation=StratifiedKFold(n_splits=7, shuffle=True, random_state=315)

    # Run the cross-validation, collecting the scores
    scores=cross_val_score(
        model,
        features,
        labels,
        cv=cross_validation,
        n_jobs=-1,
        scoring='accuracy'
    )

    # Print mean and standard deviation of the scores
    print(f'Cross-validation accuracy: {(scores.mean() * 100):.2f} +/- {(scores.std() * 100):.2f}%')

    # Return the scores
    return scores

In [55]:
feature_composition(training_features, ['day_of_week'])

feature_composition(training_features, ['month'])


Feature: day_of_week
 thu: 6479 (15.7%)
 mon: 6350 (15.4%)
 wed: 6089 (14.8%)
 tue: 6079 (14.8%)
 fri: 5894 (14.3%)

Feature: month
 may: 10246 (24.9%)
 jul: 5334 (13.0%)
 aug: 4668 (11.3%)
 jun: 4047 (9.8%)
 nov: 3113 (7.6%)
 apr: 1965 (4.8%)
 oct: 541 (1.3%)
 sep: 434 (1.1%)
 mar: 402 (1.0%)
 dec: 141 (0.3%)


In [56]:
import numpy as np

# Don't worry about downcasting FutureWarning
pd.set_option('future.no_silent_downcasting', True)

# Define a helper function here so we can encode the time
# features the same way on the training and testing data
# without copy-pasting the same code
def encode_time_features(data_df: pd.DataFrame) -> pd.DataFrame:
    '''Takes a Pandas dataframe and uses cyclical sin/cos to encode
    month and day features. Returns updated dataframe.'''

    # First convert the features to numeric
    dict={'mon' : 1, 'tue' : 2, 'wed': 3, 'thu' : 4, 'fri': 5}
    data_df=data_df.replace(dict)

    dict={'jan' : 1, 'feb' : 2, 'mar': 3, 'apr' : 4, 'may': 5, 'jun': 6, 'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12}
    data_df=data_df.replace(dict)

    # And fix the dtypes
    data_df['day_of_week']=data_df['day_of_week'].astype(int)
    data_df['month']=data_df['month'].astype(int)

    # Now encode the day and month with sin/cos components
    data_df['day_sin'] = np.sin(2 * np.pi * data_df['day_of_week']/7.0)
    data_df['day_cos'] = np.cos(2 * np.pi * data_df['day_of_week']/7.0)

    data_df['month_sin'] = np.sin(2 * np.pi * data_df['month']/12.0)
    data_df['month_cos'] = np.cos(2 * np.pi * data_df['month']/12.0)

    # Drop the original string features
    data_df.drop(['month', 'day_of_week'], axis=1, inplace=True)

    return data_df

training_features=encode_time_features(training_features)
testing_features=encode_time_features(testing_features)

In [57]:
# Instantiate a random forest classifier model
model=LogisticRegression(random_state=315)

# Run the cross-validation
scores=cross_val(model, training_features, training_labels)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Cross-validation accuracy: 90.61 +/- 0.37%


### 2.2. Missing and/or extreme values

In [58]:
# Your code here...
data_df.drop_duplicates(inplace=True)
data_df.shape

(41176, 21)

### 2.3. Feature selection

In [59]:
# Your code here...
columns = ['job', 'marital','education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome', 'y']

for column in columns:
    print (f'Value count for column - {column}')
    print(data_df[column].value_counts())
    print('\n\n')

Value count for column - job
job
admin.           10419
blue-collar       9253
technician        6739
services          3967
management        2924
retired           1718
entrepreneur      1456
self-employed     1421
housemaid         1060
unemployed        1014
student            875
unknown            330
Name: count, dtype: int64



Value count for column - marital
marital
married     24921
single      11564
divorced     4611
unknown        80
Name: count, dtype: int64



Value count for column - education
education
university.degree      12164
high.school             9512
basic.9y                6045
professional.course     5240
basic.4y                4176
basic.6y                2291
unknown                 1730
illiterate                18
Name: count, dtype: int64



Value count for column - default
default
no         32577
unknown     8596
yes            3
Name: count, dtype: int64



Value count for column - housing
housing
yes        21571
no         18615
unknown      990
N

In [60]:
data_df = data_df.replace('unknown', np.nan)
data_df

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,73,retired,married,professional.course,no,yes,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes
41184,46,blue-collar,married,professional.course,no,no,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
41185,56,retired,married,university.degree,no,yes,no,cellular,nov,fri,...,2,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
41186,44,technician,married,professional.course,no,no,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes


In [61]:
for var in data_df.columns[data_df.dtypes == 'object']:
    data_df[var] = data_df[var].fillna(data_df[var].mode()[0])

In [62]:
for var in data_df.columns[data_df.dtypes == 'int64']:
    data_df[var] = data_df[var].fillna(data_df[var].mean())

In [63]:
data_df = pd.get_dummies(data_df, columns=['y','job','marital','education','default','housing','loan','contact','month','day_of_week','poutcome'], drop_first=True)
data_df.describe()

Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed
count,41176.0,41176.0,41176.0,41176.0,41176.0,41176.0,41176.0,41176.0,41176.0,41176.0
mean,40.0238,258.315815,2.567879,962.46481,0.173013,0.081922,93.57572,-40.502863,3.621293,5167.03487
std,10.42068,259.305321,2.770318,186.937102,0.494964,1.570883,0.578839,4.62786,1.734437,72.251364
min,17.0,0.0,1.0,0.0,0.0,-3.4,92.201,-50.8,0.634,4963.6
25%,32.0,102.0,1.0,999.0,0.0,-1.8,93.075,-42.7,1.344,5099.1
50%,38.0,180.0,2.0,999.0,0.0,1.1,93.749,-41.8,4.857,5191.0
75%,47.0,319.0,3.0,999.0,0.0,1.4,93.994,-36.4,4.961,5228.1
max,98.0,4918.0,56.0,999.0,7.0,1.4,94.767,-26.9,5.045,5228.1


### 2.4. Feature scaling

In [64]:
# Scale the features
standard_scaler=StandardScaler().fit(training_features)
training_features=standard_scaler.transform(training_features)
testing_features=standard_scaler.transform(testing_features)

print(f'Training features are: {type(training_features)}')
print(f'Training features shape: {training_features.shape}')

ValueError: could not convert string to float: 'married'

### 2.5. Label encoding

In [None]:
# Last, encode the labels
label_encoder=LabelEncoder().fit(training_labels)
training_labels=label_encoder.transform(training_labels)
testing_labels=label_encoder.transform(testing_labels)

print(f'Training labels: {training_labels}')

## 3. Model training

In [None]:
# Your code here...

## 4. Model optimization

### 4.1. Model tuning

In [None]:
# Your code here...

### 4.2. Cross-validation of optimized model

In [None]:
# Your code here...

### 4.3. Final model evaluation

In [None]:
# Your code here...