In [None]:
# Goal is to create a logistic regression model, optimizing, then saving the model.
# Find features that attribute the most to customers signing up for long-term deposits
import pandas as pd
# Load data and separate
total_data = pd.read_csv("bank-marketing-campaign-data.csv", sep=";")
total_data.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [None]:
# Remove any duplicate data
total_data = total_data.drop_duplicates().reset_index(drop = True)
total_data.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [None]:
from sklearn.preprocessing import MinMaxScaler

# Turn any non-numerical data to numerical
total_data["job_num"] = pd.factorize(total_data["job"])[0]
total_data["marital_num"] = pd.factorize(total_data["marital"])[0]
total_data["education_num"] = pd.factorize(total_data["education"])[0]
total_data["default_num"] = pd.factorize(total_data["default"])[0]
total_data["housing_num"] = pd.factorize(total_data["housing"])[0]
total_data["loan_num"] = pd.factorize(total_data["loan"])[0]
total_data["contact_num"] = pd.factorize(total_data["contact"])[0]
total_data["month_num"] = pd.factorize(total_data["month"])[0]
total_data["day_of_week_num"] = pd.factorize(total_data["day_of_week"])[0]
total_data["poutcome_num"] = pd.factorize(total_data["poutcome"])[0]
total_data["y_num"] = pd.factorize(total_data["y"])[0]

# Column labels
num_variables = ["job_num", "marital_num", "education_num", "default_num", "housing_num", "loan_num", "contact_num", "month_num", "day_of_week_num", "poutcome_num",
                 "age", "duration", "campaign", "pdays", "previous", "emp.var.rate", "cons.price.idx", "cons.conf.idx", "euribor3m", "nr.employed", "y_num"]

# Scale the numerical data to values between 0-1.
scaler = MinMaxScaler()
scal_features = scaler.fit_transform(total_data[num_variables])

# Create dataframe with new data
total_data_scaled = pd.DataFrame(scal_features, index = total_data.index, columns = num_variables)
total_data_scaled.head()

Unnamed: 0,job_num,marital_num,education_num,default_num,housing_num,loan_num,contact_num,month_num,day_of_week_num,poutcome_num,...,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y_num
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.05307,0.0,1.0,0.0,0.9375,0.698753,0.60251,0.957379,0.859735,0.0
1,0.090909,0.0,0.142857,0.5,0.0,0.0,0.0,0.0,0.0,0.0,...,0.030297,0.0,1.0,0.0,0.9375,0.698753,0.60251,0.957379,0.859735,0.0
2,0.090909,0.0,0.142857,0.0,0.5,0.0,0.0,0.0,0.0,0.0,...,0.045954,0.0,1.0,0.0,0.9375,0.698753,0.60251,0.957379,0.859735,0.0
3,0.181818,0.0,0.285714,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.030704,0.0,1.0,0.0,0.9375,0.698753,0.60251,0.957379,0.859735,0.0
4,0.090909,0.0,0.142857,0.0,0.0,0.5,0.0,0.0,0.0,0.0,...,0.062424,0.0,1.0,0.0,0.9375,0.698753,0.60251,0.957379,0.859735,0.0


In [None]:
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.model_selection import train_test_split
# Set up x and y
x = total_data_scaled.drop("y_num", axis = 1)
y = total_data_scaled["y_num"]

# Split into training and test. 20% of data being in test
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state=42)

# Select 5 strongest correlated data
selection_model = SelectKBest(chi2, k = 5)
selection_model.fit(x_train, y_train)
select_best = selection_model.get_support()

# Clip the training data into the 5 selected features
x_train_selected = pd.DataFrame(selection_model.transform(x_train), columns = x_train.columns.values[select_best])
x_test_selected = pd.DataFrame(selection_model.transform(x_test), columns = x_test.columns.values[select_best])

x_test_selected.head()


Unnamed: 0,poutcome_num,previous,emp.var.rate,euribor3m,nr.employed
0,0.5,0.142857,0.333333,0.150759,0.512287
1,0.0,0.0,0.9375,0.958059,0.859735
2,0.5,0.142857,0.333333,0.153933,0.512287
3,0.0,0.0,1.0,0.982317,1.0
4,0.0,0.0,1.0,0.98141,1.0


In [None]:
from sklearn.linear_model import LogisticRegression
# Create the logistic regression model and fit off the training data
model = LogisticRegression()
model.fit(x_train_selected, y_train)


0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [None]:
# Predict the y off of the test data
predict_y = model.predict(x_test_selected)

predict_y


array([0., 0., 0., ..., 0., 0., 0.], shape=(8236,))

In [None]:
from sklearn.metrics import accuracy_score
# Check accuracy
accuracy_score(y_test, predict_y)

0.8886595434677028

In [None]:
from sklearn.model_selection import GridSearchCV

# Set hyperparams for fine tuning of data
hyperparams = {
    "C": [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    "penalty": ["l1", "l2", "elasticnet", None],
    "solver": ["newton-cg", "lbfgs", "liblinear", "sag", "saga"]
}

# Set up the gridsearch
grid = GridSearchCV(model, hyperparams, scoring = "accuracy", cv = 10)
grid

0,1,2
,estimator,LogisticRegression()
,param_grid,"{'C': [0.001, 0.01, ...], 'penalty': ['l1', 'l2', ...], 'solver': ['newton-cg', 'lbfgs', ...]}"
,scoring,'accuracy'
,n_jobs,
,refit,True
,cv,10
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [None]:
# Fit the grid to the training data
grid.fit(x_train_selected, y_train)

print(f"Best hyperparameters: {grid.best_params_}")



Best hyperparameters: {'C': 0.1, 'penalty': 'l2', 'solver': 'newton-cg'}


630 fits failed out of a total of 1400.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
70 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Yench\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Yench\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 1365, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Yench\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1218, in fit
    solver = _check_solver(

In [None]:
# Use the optimized parameters
model = LogisticRegression(C = 0.1, penalty = "l2", solver = "liblinear")
model.fit(x_train_selected, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,0.1
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'liblinear'
,max_iter,100


In [None]:
# Predict test y
predict_y = model.predict(x_test_selected)
predict_y

array([0., 0., 0., ..., 0., 0., 0.], shape=(8236,))

In [None]:
# Check accuracy
accuracy_score(y_test, predict_y)

0.8901165614375911

In [None]:
from pickle import dump

# Save the model
dump(model, open("logistic_regression_C-0.1_penalty-l2_solver-liblinear_42.sav", "wb"))