# Explore here

Imports

In [31]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import *
from imblearn.metrics import specificity_score
from warnings import simplefilter
from pickle import dump



We need to perform an EDA before modeling. In this exercise I will cover only the basics, since the algorithm is the main point.

Collect the data 

In [32]:
# Location of the raw bank-marketing CSV; fields are separated by ";" rather than ","
DATA_URL = "https://raw.githubusercontent.com/4GeeksAcademy/logistic-regression-project-tutorial/main/bank-marketing-campaign-data.csv"
all_data = pd.read_csv(DATA_URL, sep=";")

# Distinct labels found in the target column "y"
all_data["y"].unique()


array(['no', 'yes'], dtype=object)

Now drop any duplicate rows from the dataset, in case there are any. Also check for null values and inspect the general information about the dataset.

In [33]:
# Remove exact duplicate rows and renumber the index from 0 in a single call
all_data = all_data.drop_duplicates(ignore_index=True)

# Print column dtypes and non-null counts; the (rows, columns) tuple is the cell's displayed value
all_data.info()
all_data.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41176 entries, 0 to 41175
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             41176 non-null  int64  
 1   job             41176 non-null  object 
 2   marital         41176 non-null  object 
 3   education       41176 non-null  object 
 4   default         41176 non-null  object 
 5   housing         41176 non-null  object 
 6   loan            41176 non-null  object 
 7   contact         41176 non-null  object 
 8   month           41176 non-null  object 
 9   day_of_week     41176 non-null  object 
 10  duration        41176 non-null  int64  
 11  campaign        41176 non-null  int64  
 12  pdays           41176 non-null  int64  
 13  previous        41176 non-null  int64  
 14  poutcome        41176 non-null  object 
 15  emp.var.rate    41176 non-null  float64
 16  cons.price.idx  41176 non-null  float64
 17  cons.conf.idx   41176 non-null 

(41176, 21)

There aren't any null values, but there were some duplicates (12 in this case), which have now been removed.

The output of the `.info()` method also confirms that there aren't any null values in this dataset.

We also need to factorize all the categorical variables and create a new dataset.

In [34]:
# Encode every categorical column as integers; each encoded copy is stored in a "<column>_fact" column

# Free-form categories: integer codes assigned by pd.factorize
for col in ("job", "marital", "education", "contact"):
    all_data[f"{col}_fact"] = pd.factorize(all_data[col])[0]

# housing, default and loan share the same three answers: yes, no or unknown
def_house_loan_map = {"no": 0, "yes": 1, "unknown": 2}
for col in ("housing", "default", "loan"):
    all_data[f"{col}_fact"] = all_data[col].map(def_house_loan_map)

# Months and weekdays are mapped to their calendar number. Only the values present in the dataset
# are listed (checked with all_data["month"].unique() and all_data["day_of_week"].unique())
months_map = {'mar': 3, 'apr': 4, 'may': 5, 'jun': 6, 'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12}
days_map = {'mon': 1, 'tue': 2, 'wed': 3, 'thu': 4, 'fri': 5}
all_data["month_fact"] = all_data["month"].map(months_map)
all_data["day_of_week_fact"] = all_data["day_of_week"].map(days_map)

# Outcome of the previous campaign
pout_map = {"failure": 0, "success": 1, "nonexistent": 2}
all_data["poutcome_fact"] = all_data["poutcome"].map(pout_map)

# Target "y": same yes/no coding as above, but there is no "unknown" answer
y_map = {"no": 0, "yes": 1}
all_data["y_fact"] = all_data["y"].map(y_map)

# Columns of the modelling dataset: encoded categoricals first, then the original numeric columns, target last
encoded_columns = ["job_fact", "marital_fact", "education_fact", "default_fact", "housing_fact", "loan_fact",
                   "contact_fact", "month_fact", "day_of_week_fact", "poutcome_fact"]
numeric_columns = ["age", "duration", "campaign", "pdays", "previous", "emp.var.rate", "cons.price.idx",
                   "cons.conf.idx", "euribor3m", "nr.employed"]
num_variables = encoded_columns + numeric_columns + ["y_fact"]

# New dataframe holding only those columns, in that order
fact_data = all_data.reindex(columns=num_variables)

fact_data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41176 entries, 0 to 41175
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   job_fact          41176 non-null  int64  
 1   marital_fact      41176 non-null  int64  
 2   education_fact    41176 non-null  int64  
 3   default_fact      41176 non-null  int64  
 4   housing_fact      41176 non-null  int64  
 5   loan_fact         41176 non-null  int64  
 6   contact_fact      41176 non-null  int64  
 7   month_fact        41176 non-null  int64  
 8   day_of_week_fact  41176 non-null  int64  
 9   poutcome_fact     41176 non-null  int64  
 10  age               41176 non-null  int64  
 11  duration          41176 non-null  int64  
 12  campaign          41176 non-null  int64  
 13  pdays             41176 non-null  int64  
 14  previous          41176 non-null  int64  
 15  emp.var.rate      41176 non-null  float64
 16  cons.price.idx    41176 non-null  float6

Divide the new dataset into train and test, then scale all the data based on the train variables.

In [35]:
# Separate the target from the predictors
y = fact_data["y_fact"]                 # Dependent variable (target)
X = fact_data.drop(columns=["y_fact"])  # Independent variables

# Hold out 20% of the rows for testing. random_state is the seed of the shuffle applied before
# splitting, so using the same number reproduces exactly the same split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training split only, so nothing about the test split is used to scale the data
scaler = MinMaxScaler().fit(X_train)

# Rescale both splits with the fitted scaler, keeping the original row index and column names
X_train_scaled, X_test_scaled = (
    pd.DataFrame(scaler.transform(part), index=part.index, columns=part.columns)
    for part in (X_train, X_test)
)

Feature selection with SelectKBest using a chi2 statistical method

In [36]:
# Selector that keeps the k = 5 features ranked best by the chi2 score function,
# fitted with the scaled training predictors and the training target
selection_model = SelectKBest(score_func=chi2, k=5).fit(X_train_scaled, y_train)

# Boolean mask over the columns: True for every selected feature
ix = selection_model.get_support()
selected_columns = X_train_scaled.columns.values[ix]

# Reduced train and test datasets containing only the selected features
# (no index is passed, so both frames get a fresh default index)
X_train_features = pd.DataFrame(selection_model.transform(X_train_scaled), columns=selected_columns)
X_test_features = pd.DataFrame(selection_model.transform(X_test_scaled), columns=selected_columns)


Save the new data to proceed with the exercise

In [37]:
# Attach the target to each feature frame by position (as a plain list, so no index alignment
# takes place) and write each split to its CSV file without the index column
for features, target, csv_path in (
    (X_train_features, y_train, "../data/processed/clean_train.csv"),
    (X_test_features, y_test, "../data/processed/clean_test.csv"),
):
    features["y_fact"] = list(target)
    features.to_csv(csv_path, index=False)

Now we continue with the Logistic Regression model. For that we need to read the data that we saved earlier and then divide it again into X and y; those are going to be our independent and dependent data.

In [38]:
# Load the processed train and test splits back from disk
train_data = pd.read_csv("../data/processed/clean_train.csv")
test_data = pd.read_csv("../data/processed/clean_test.csv")

# Train split: target column vs. every other column
y_train = train_data["y_fact"]                 # Dependent
X_train = train_data.drop(columns=["y_fact"])  # Independent

# Test split: same separation
y_test = test_data["y_fact"]                   # Dependent
X_test = test_data.drop(columns=["y_fact"])    # Independent

After dividing the data, we train the model with it and then predict with it.

Create a function to check the metrics of the algorithm

In [39]:
def _split_scores(y_true, y_pred, y_score=None):
    """Return [accuracy, F1, AUC, precision, recall, specificity] for one data split."""
    # ROC AUC is designed to rank continuous scores (e.g. predict_proba[:, 1]);
    # when no scores are supplied it falls back to the hard class predictions
    auc_input = y_pred if y_score is None else y_score
    return [
        accuracy_score(y_true, y_pred),
        f1_score(y_true, y_pred),
        roc_auc_score(y_true, auc_input),
        precision_score(y_true, y_pred),
        recall_score(y_true, y_pred),
        specificity_score(y_true, y_pred),
    ]


def get_metrics(y_train, y_test, y_pred_train, y_pred_test, y_score_train=None, y_score_test=None):
    """Build a table comparing classification metrics on the train and test splits.

    Parameters
    ----------
    y_train, y_test : true labels of the train and test splits.
    y_pred_train, y_pred_test : predicted class labels for each split.
    y_score_train, y_score_test : optional scores for each split (for example the
        positive-class probability). When given, they are used only for the AUC
        column; when omitted, AUC is computed from the predicted labels, exactly
        as before these parameters existed.

    Returns
    -------
    pandas.DataFrame
        Rows 'Train', 'Test' and 'Differs' (train minus test); columns
        'Accuracy', 'F1', 'AUC', 'Precision', 'Recall', 'Specificity'.
    """
    train_scores = _split_scores(y_train, y_pred_train, y_score_train)
    test_scores = _split_scores(y_test, y_pred_test, y_score_test)

    # Train-minus-test gap per metric: a large positive gap hints at overfitting
    diff_scores = [train - test for train, test in zip(train_scores, test_scores)]

    # Create a DataFrame with the results
    metrics_df = pd.DataFrame(
        [train_scores, test_scores, diff_scores],
        columns=['Accuracy', 'F1', 'AUC', 'Precision', 'Recall', 'Specificity'],
        index=['Train', 'Test', 'Differs'],
    )

    return metrics_df

In [40]:
# Ignore every warning from here on (used to hide the convergence warnings)
simplefilter("ignore")

# Baseline: logistic regression with its default hyperparameters, trained on the train split
model = LogisticRegression().fit(X_train, y_train)

# Class predictions of the trained model for both splits
y_pred_train = model.predict(X_train)  # Train variables
y_pred_test = model.predict(X_test)    # Test variables

# Train / test / difference metrics table, displayed as the cell output
get_metrics(y_train, y_test, y_pred_train, y_pred_test)

Unnamed: 0,Accuracy,F1,AUC,Precision,Recall,Specificity
Train,0.9051,0.418959,0.643633,0.658294,0.307252,0.980015
Test,0.898373,0.405962,0.63681,0.652968,0.294542,0.979078
Differs,0.006727,0.012997,0.006824,0.005326,0.01271,0.000937


Now to optimize the model we will use GridSearch

In [41]:
# Regularisation strengths to try (in scikit-learn, a smaller C means stronger regularisation)
c_values = [0.001, 0.01, 0.1, 1, 10, 100, 1000]

# We define the parameters that we want to adjust.
# Only solver/penalty pairs that LogisticRegression accepts are listed: a full cross product
# contains many invalid pairs whose fits fail and are scored as NaN, and those failures go
# unnoticed while warnings are silenced.
hyperparams = [
    # l2 is supported by every solver
    {"C": c_values, "penalty": ["l2"], "solver": ["newton-cg", "lbfgs", "liblinear", "sag", "saga"]},
    # l1 is only supported by liblinear and saga
    {"C": c_values, "penalty": ["l1"], "solver": ["liblinear", "saga"]},
    # elasticnet is only supported by saga and needs an explicit l1_ratio to be fitted at all
    {"C": c_values, "penalty": ["elasticnet"], "solver": ["saga"], "l1_ratio": [0.5]},
    # No penalty: C has no effect, so it is not varied. None replaces the string "none",
    # which was deprecated in scikit-learn 1.2 and removed in later releases
    {"penalty": [None], "solver": ["newton-cg", "lbfgs", "sag", "saga"]},
]

# We initialize the GridSearch (10-fold cross-validation, candidates ranked by accuracy)
# NOTE(review): the target is imbalanced, so accuracy may not be the most informative score - confirm
grid = GridSearchCV(model, hyperparams, scoring = "accuracy", cv = 10)

#Fit the GridSearch with train data
grid.fit(X_train, y_train)

#Print the best parameters
print(grid.best_params_)

{'C': 1000, 'penalty': 'l2', 'solver': 'lbfgs'}


In [42]:
#Create the logistic regression model again, this time with the hyperparameters found by the grid search.
#They are read from grid.best_params_ instead of being copied by hand, so the model always matches
#the search result (for the run recorded in this notebook: C=1000, penalty='l2', solver='lbfgs')
model = LogisticRegression(**grid.best_params_)

#Train the logistic regression model
model.fit(X_train, y_train)

#Predict using the trained model and use the metrics function to retrieve valuable information
#Test variables
y_pred_test = model.predict(X_test)

#Train variables
y_pred_train = model.predict(X_train)

get_metrics(y_train, y_test, y_pred_train, y_pred_test)

Unnamed: 0,Accuracy,F1,AUC,Precision,Recall,Specificity
Train,0.90595,0.447378,0.659254,0.647059,0.341876,0.976633
Test,0.899223,0.430727,0.649783,0.644764,0.323378,0.976187
Differs,0.006727,0.016651,0.009472,0.002295,0.018498,0.000446


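The model accuracy improved slightly after tuning: test accuracy went from 0.8984 to 0.8992, and test recall from 0.2945 to 0.3234.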

Now we use pickle to dump our model to the models folder

In [43]:
# Serialize the trained model with pickle. The context manager guarantees the file is flushed
# and closed even if dump() raises.
# NOTE(review): the file name advertises C=0.1 / liblinear, while the model fitted in this notebook
# uses C=1000 / lbfgs. The path is left unchanged so that anything loading it keeps working -
# consider renaming it to match the actual hyperparameters.
with open("../models/logistic_regression_C-0.1_penalty-l2_solver-liblinear_42.sav", "wb") as model_file:
    dump(model, model_file)