# Logistic Regression: Banking Marketing Campaign

## 1. Data loading

In [1]:
# Handle imports upfront
import pandas as pd

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.linear_model import LogisticRegression

### 1.1. Load

In [2]:
data_url='https://raw.githubusercontent.com/4GeeksAcademy/logistic-regression-project-tutorial/main/bank-marketing-campaign-data.csv'
data_df=pd.read_csv(data_url, sep=';')

### 1.2. Inspect

In [3]:
# Your code here....
# Your code here....
print(data_df.head())

print(data_df.describe())

print(data_df.info())

   age        job  marital    education  default housing loan    contact  \
0   56  housemaid  married     basic.4y       no      no   no  telephone   
1   57   services  married  high.school  unknown      no   no  telephone   
2   37   services  married  high.school       no     yes   no  telephone   
3   40     admin.  married     basic.6y       no      no   no  telephone   
4   56   services  married  high.school       no      no  yes  telephone   

  month day_of_week  ...  campaign  pdays  previous     poutcome emp.var.rate  \
0   may         mon  ...         1    999         0  nonexistent          1.1   
1   may         mon  ...         1    999         0  nonexistent          1.1   
2   may         mon  ...         1    999         0  nonexistent          1.1   
3   may         mon  ...         1    999         0  nonexistent          1.1   
4   may         mon  ...         1    999         0  nonexistent          1.1   

   cons.price.idx  cons.conf.idx  euribor3m  nr.employed

### 1.3. Train-test split

In [4]:
# First separate the features from the labels
labels=data_df['y']
features=data_df.drop('y', axis=1)

# Do the test-train split
training_features, testing_features, training_labels, testing_labels=train_test_split(
    features,
    labels,
    test_size=0.25, 
    random_state=315
)

### 1.4. Encoding

In [5]:
# Names of columns we want to encode
encoded_columns=['job']

# Do the encoding
training_features=pd.get_dummies(training_features, columns=encoded_columns, dtype=int, drop_first=True)
testing_features=pd.get_dummies(testing_features, columns=encoded_columns, dtype=int, drop_first=True)

training_features.info()

<class 'pandas.core.frame.DataFrame'>
Index: 30891 entries, 33905 to 29283
Data columns (total 30 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   age                30891 non-null  int64  
 1   marital            30891 non-null  object 
 2   education          30891 non-null  object 
 3   default            30891 non-null  object 
 4   housing            30891 non-null  object 
 5   loan               30891 non-null  object 
 6   contact            30891 non-null  object 
 7   month              30891 non-null  object 
 8   day_of_week        30891 non-null  object 
 9   duration           30891 non-null  int64  
 10  campaign           30891 non-null  int64  
 11  pdays              30891 non-null  int64  
 12  previous           30891 non-null  int64  
 13  poutcome           30891 non-null  object 
 14  emp.var.rate       30891 non-null  float64
 15  cons.price.idx     30891 non-null  float64
 16  cons.conf.idx      3089

In [6]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

training_features['marital'] = label_encoder.fit_transform(training_features['marital'])

training_features['education'] = label_encoder.fit_transform(training_features['education'])

training_features['feature_name'] = label_encoder.fit_transform(training_features['feature_name'])


KeyError: 'feature_name'

## 2. EDA

### 2.1. Baseline model performance

In [None]:
# Define a reusable helper function for cross-validation here. We are going to
# be doing a lot of cross-validation, this allows us to reuse this code
# without having to copy-paste it over and over.

def cross_val(model, features: pd.DataFrame, labels: pd.Series) -> list[float]:
    '''Reusable helper function to run cross-validation on a model. Takes model,
    Pandas data frame of features and Pandas data series of labels. Returns 
    list of cross-validation fold accuracy scores as percents.'''

    # Define the cross-validation strategy
    cross_validation=StratifiedKFold(n_splits=7, shuffle=True, random_state=315)

    # Run the cross-validation, collecting the scores
    scores=cross_val_score(
        model,
        features,
        labels,
        cv=cross_validation,
        n_jobs=-1,
        scoring='accuracy'
    )

    # Print mean and standard deviation of the scores
    print(f'Cross-validation accuracy: {(scores.mean() * 100):.2f} +/- {(scores.std() * 100):.2f}%')

    # Return the scores
    return scores

In [None]:
# Instantiate a random forest classifier model
model=LogisticRegression(random_state=315)

# Run the cross-validation
scores=cross_val(model, training_features, training_labels)

### 2.2. Missing and/or extreme values

In [None]:
# Your code here...

### 2.3. Feature selection

In [None]:
# Your code here...

### 2.4. Feature scaling

In [None]:
# Scale the features
standard_scaler=StandardScaler().fit(training_features)
training_features=standard_scaler.transform(training_features)
testing_features=standard_scaler.transform(testing_features)

print(f'Training features are: {type(training_features)}')
print(f'Training features shape: {training_features.shape}')

### 2.5. Label encoding

In [None]:
# Last, encode the labels
label_encoder=LabelEncoder().fit(training_labels)
training_labels=label_encoder.transform(training_labels)
testing_labels=label_encoder.transform(testing_labels)

print(f'Training labels: {training_labels}')

## 3. Model training

In [None]:
# Your code here...

## 4. Model optimization

### 4.1. Model tuning

In [None]:
# Your code here...

### 4.2. Cross-validation of optimized model

In [None]:
# Your code here...

### 4.3. Final model evaluation

In [None]:
# Your code here...