# Explore here

Imports

In [216]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import numpy as np
from pickle import dump



We need to perform an EDA before modeling. In this exercise I will cover only the basics, since the algorithm is the main point.

Collect the data 

In [217]:
# Source CSV of the bank marketing campaign; its fields are separated by semicolons.
DATA_URL = "https://raw.githubusercontent.com/4GeeksAcademy/logistic-regression-project-tutorial/main/bank-marketing-campaign-data.csv"
all_data = pd.read_csv(DATA_URL, sep = ";")

# Only the last expression of a cell is rendered, so this preview is computed but not shown;
# the (rows, columns) tuple below is the cell output.
all_data.head()
all_data.shape

(41188, 21)

Now delete any duplicate rows in this dataset, if there are any. Also, check for null values and get general information about the columns.

In [218]:
# Remove exact duplicate rows and renumber the index from 0 in a single call.
all_data = all_data.drop_duplicates(ignore_index = True)

# info() prints each column's dtype and non-null count; the (rows, columns) tuple is the cell output.
all_data.info()
all_data.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41176 entries, 0 to 41175
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             41176 non-null  int64  
 1   job             41176 non-null  object 
 2   marital         41176 non-null  object 
 3   education       41176 non-null  object 
 4   default         41176 non-null  object 
 5   housing         41176 non-null  object 
 6   loan            41176 non-null  object 
 7   contact         41176 non-null  object 
 8   month           41176 non-null  object 
 9   day_of_week     41176 non-null  object 
 10  duration        41176 non-null  int64  
 11  campaign        41176 non-null  int64  
 12  pdays           41176 non-null  int64  
 13  previous        41176 non-null  int64  
 14  poutcome        41176 non-null  object 
 15  emp.var.rate    41176 non-null  float64
 16  cons.price.idx  41176 non-null  float64
 17  cons.conf.idx   41176 non-null 

(41176, 21)

There aren't any null values, but there were 12 duplicate rows (41188 rows before dropping them, 41176 after).

The `.info()` output confirms this: every column listed has 41176 non-null entries.

We need to also factorize all the categorical variables

In [219]:
# Integer-encode every categorical column (and the target "y") with pd.factorize.
# The codes are stored in new "<column>_fact" columns of the same dataframe;
# the original text columns are left in place.
categorical_columns = [
    "job", "marital", "education", "default", "housing", "loan",
    "contact", "month", "day_of_week", "poutcome", "y",
]

for column in categorical_columns:
    # factorize returns (codes, uniques); only the integer codes are kept.
    all_data[f"{column}_fact"] = pd.factorize(all_data[column])[0]

Scale all the data and divide the data into train and test

In [220]:
# Predictor columns: the factorized categoricals followed by the numeric columns.
feature_variables = ["job_fact", "marital_fact", "education_fact", "default_fact", "housing_fact", "loan_fact", "contact_fact", "month_fact", "day_of_week_fact", "poutcome_fact",
                     "age", "duration", "campaign", "pdays", "previous", "emp.var.rate", "cons.price.idx", "cons.conf.idx", "euribor3m", "nr.employed"]
target_variable = "y_fact"

# Same list as before (all modelling columns, target last), kept under its original name.
num_variables = feature_variables + [target_variable]

# The target is a 0/1 class label, so it is not passed through the scaler
# (min-max scaling a 0/1 column would only turn it into 0.0/1.0).
y = all_data[target_variable]  # Dependent variable (target)

# Split BEFORE scaling so that no statistic of the test rows influences the preprocessing.
# random_state seeds the shuffle, so the same rows land in each split on every run.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(all_data[feature_variables], y, test_size = 0.2, random_state = 42)

# Fit the scaler on the training features only, then apply the same transformation to the test features.
# Test values can therefore fall slightly outside [0, 1] when they exceed the range seen in training.
scaler = MinMaxScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train_raw), index = X_train_raw.index, columns = feature_variables)
X_test = pd.DataFrame(scaler.transform(X_test_raw), index = X_test_raw.index, columns = feature_variables)

# Scaled features for every row, back in the original row order, plus the target for inspection.
X = pd.concat([X_train, X_test]).sort_index()  # Independent variables
fact_data_scal = X.join(y)

fact_data_scal.head()

Unnamed: 0,job_fact,marital_fact,education_fact,default_fact,housing_fact,loan_fact,contact_fact,month_fact,day_of_week_fact,poutcome_fact,...,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y_fact
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.05307,0.0,1.0,0.0,0.9375,0.698753,0.60251,0.957379,0.859735,0.0
1,0.090909,0.0,0.142857,0.5,0.0,0.0,0.0,0.0,0.0,0.0,...,0.030297,0.0,1.0,0.0,0.9375,0.698753,0.60251,0.957379,0.859735,0.0
2,0.090909,0.0,0.142857,0.0,0.5,0.0,0.0,0.0,0.0,0.0,...,0.045954,0.0,1.0,0.0,0.9375,0.698753,0.60251,0.957379,0.859735,0.0
3,0.181818,0.0,0.285714,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.030704,0.0,1.0,0.0,0.9375,0.698753,0.60251,0.957379,0.859735,0.0
4,0.090909,0.0,0.142857,0.0,0.0,0.5,0.0,0.0,0.0,0.0,...,0.062424,0.0,1.0,0.0,0.9375,0.698753,0.60251,0.957379,0.859735,0.0


Feature selection with SelectKBest using a chi2 statistical method

In [221]:
# Feature selection: score every feature against the target with the chi-squared test
# and keep the k = 5 best ones. The selector is fitted on the training split only.
selection_model = SelectKBest(score_func = chi2, k = 5).fit(X_train, y_train)

# Boolean mask over the input columns, True for each feature that was kept.
ix = selection_model.get_support()

# Rebuild dataframes that hold only the kept features, labelled with their original column names.
X_train_features = pd.DataFrame(selection_model.transform(X_train), columns = X_train.columns[ix])
X_test_features = pd.DataFrame(selection_model.transform(X_test), columns = X_test.columns[ix])


Save the new data to proceed with the exercise

In [222]:
# Attach the target to each selected-feature frame. The labels are assigned by position
# (as a plain array), not aligned on the index of y_train / y_test.
X_train_features["y_fact"] = y_train.to_numpy()
X_test_features["y_fact"] = y_test.to_numpy()

# Write both splits to the processed-data folder without the index column.
X_train_features.to_csv("../data/processed/clean_train.csv", index = False)
X_test_features.to_csv("../data/processed/clean_test.csv", index = False)

Now we continue with the Logistic Regression model. For that, we read the data that we saved earlier and divide it again into X and y, which are our independent and dependent variables.

In [223]:
# Reload the processed train and test files.
train_data = pd.read_csv("../data/processed/clean_train.csv")
test_data = pd.read_csv("../data/processed/clean_test.csv")

# Separate the predictors (independent variables) from the target (dependent variable) in each split.
# Note: these assignments replace any X_train / X_test / y_train / y_test defined earlier in the notebook.
X_train, y_train = train_data.drop(columns = "y_fact"), train_data["y_fact"]
X_test, y_test = test_data.drop(columns = "y_fact"), test_data["y_fact"]

After dividing the data, we train the model with it and then predict with it.

In [224]:
# Baseline: logistic regression with scikit-learn's default hyperparameters, fitted on the training split.
model = LogisticRegression().fit(X_train, y_train)

# Predict the test split with the fitted model.
y_pred = model.predict(X_test)

# Accuracy = fraction of test rows whose predicted class matches the true class.
accuracy_score(y_true = y_test, y_pred = y_pred)

0.8886595434677028

Now, to optimize the model, we will use GridSearchCV. I first tried RandomizedSearchCV, but it raised a lot of unexpected errors that I could not resolve.

In [225]:
# Candidate values for C, the inverse regularization strength (smaller C = stronger regularization).
c_values = [0.001, 0.01, 0.1, 1, 10, 100, 1000]

# Not every solver supports every penalty, so the grid is written as a list of sub-grids
# that only contain valid pairs instead of a full cross product (whose invalid pairs fail to fit):
#   - "l1" is supported by liblinear and saga only
#   - "l2" is supported by all five solvers
#   - "elasticnet" is left out: only saga supports it and it also needs an l1_ratio,
#     which this search does not tune
# One pair of sub-grids is emitted per C value so candidates are tried in the same order
# as the full cross product would try them (C first, then penalty, then solver).
hyperparams = [
    sub_grid
    for c in c_values
    for sub_grid in (
        {"C": [c], "penalty": ["l1"], "solver": ["liblinear", "saga"]},
        {"C": [c], "penalty": ["l2"], "solver": ["newton-cg", "lbfgs", "liblinear", "sag", "saga"]},
    )
]

# Exhaustive search over the valid combinations, scored by accuracy with 10-fold cross-validation
grid = GridSearchCV(model, hyperparams, scoring = "accuracy", cv = 10)

# Fit the search on the training data only; the test split stays untouched for the final evaluation
grid.fit(X_train, y_train)

# Print the best parameters (the recorded run reported {'C': 0.1, 'penalty': 'l2', 'solver': 'newton-cg'})
print(grid.best_params_)

{'C': 0.1, 'penalty': 'l2', 'solver': 'newton-cg'}


In [226]:
# Hyperparameters reported as best by the grid search output above.
best_hyperparams = {"C": 0.1, "penalty": "l2", "solver": "newton-cg"}

# Rebuild the logistic regression with those hyperparameters and fit it on the training split.
model = LogisticRegression(**best_hyperparams)
model.fit(X_train, y_train)

# Predict the test split and measure accuracy, for comparison with the baseline model.
y_pred = model.predict(X_test)

accuracy_score(y_true = y_test, y_pred = y_pred)

0.8901165614375911

The model accuracy goes from 0.8886595434677028 to 0.8901165614375911 (about 0.15 percentage points higher), so we successfully optimized it.

Now we use pickle to dump our model to the models folder

In [227]:
# Serialize the tuned model to the models folder. The context manager guarantees the file
# handle is flushed and closed, even if pickling raises.
# NOTE(review): the file name says "solver-liblinear", but the model is built with
# solver='newton-cg'. The path is kept unchanged in case other code loads it — confirm and rename.
with open("../models/logistic_regression_C-0.1_penalty-l2_solver-liblinear_42.sav", "wb") as model_file:
    dump(model, model_file)