In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Read the semicolon-delimited file into a DataFrame, then print its schema
# (column names, non-null counts, dtypes, memory usage).
df = pd.read_csv("../bank-full.csv", delimiter=";")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   default    45211 non-null  object
 5   balance    45211 non-null  int64 
 6   housing    45211 non-null  object
 7   loan       45211 non-null  object
 8   contact    45211 non-null  object
 9   day        45211 non-null  int64 
 10  month      45211 non-null  object
 11  duration   45211 non-null  int64 
 12  campaign   45211 non-null  int64 
 13  pdays      45211 non-null  int64 
 14  previous   45211 non-null  int64 
 15  poutcome   45211 non-null  object
 16  y          45211 non-null  object
dtypes: int64(7), object(10)
memory usage: 5.9+ MB


In [3]:
# Work on a copy of the raw frame without the 'default' and 'loan' columns;
# `df` itself is left untouched.
df_selected = df.drop(columns=["default", "loan"])
df_selected.columns

Index(['age', 'job', 'marital', 'education', 'balance', 'housing', 'contact',
       'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome',
       'y'],
      dtype='object')

In [4]:
# Count missing values per column (isna is the alias of isnull).
df_selected.isna().sum()

age          0
job          0
marital      0
education    0
balance      0
housing      0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

In [5]:
# Split the data into train / validation / test (60% / 20% / 20%).
from sklearn.model_selection import train_test_split

# First hold out 20% as the test set ...
df_full_train, df_test = train_test_split(
    df_selected, test_size=0.2, random_state=1
)
# ... then take 25% of the remaining 80% (20% of the total) for validation.
df_train, df_val = train_test_split(
    df_full_train, test_size=0.25, random_state=1
)
# Row counts: full data, train, test, validation.
tuple(map(len, (df_selected, df_train, df_test, df_val)))

(45211, 27126, 9043, 9042)

In [6]:
# Binary-encode the target for each split: 'yes' -> 1, anything else -> 0.
y_train = df_train["y"].eq("yes").astype(int)
y_val = df_val["y"].eq("yes").astype(int)
y_test = df_test["y"].eq("yes").astype(int)


In [7]:
# Class balance of the encoded target in the train, validation and test splits.
tuple(target.value_counts() for target in (y_train, y_val, y_test))

(y
 0    23985
 1     3141
 Name: count, dtype: int64,
 y
 0    7944
 1    1098
 Name: count, dtype: int64,
 y
 0    7993
 1    1050
 Name: count, dtype: int64)

In [8]:
# Remove the target column 'y' from each feature frame so it is not used as
# an input feature.
# drop(..., errors="ignore") returns a new frame and is a no-op when 'y' is
# already gone, so this cell can be re-run safely; the previous in-place
# `del frame['y']` raised KeyError on a second execution.
df_train = df_train.drop(columns="y", errors="ignore")
df_val = df_val.drop(columns="y", errors="ignore")
df_test = df_test.drop(columns="y", errors="ignore")



In [9]:
# Names of the training columns whose dtype is int64 or float64.
train_numerical = (
    df_train
    .select_dtypes(include=["int64", "float64"])
    .columns
)
train_numerical

Index(['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous'], dtype='object')

In [None]:
# Prepare the feature matrices used to train and validate the model.
from sklearn.feature_extraction import DictVectorizer

# sparse=False makes the vectorizer return dense arrays.
dv = DictVectorizer(sparse=False)

# Convert each frame to a list of per-row dicts (one dict per record).
train_dic = df_train.to_dict(orient="records")
val_dic = df_val.to_dict(orient="records")

# Fit the vectorizer on the training records only, then reuse the fitted
# mapping to transform the validation records into the X variables.
X_train = dv.fit_transform(train_dic)
X_val = dv.transform(val_dic)
X_train.shape[0], X_val.shape[0]

(27126, 9042)

In [11]:
# Train a logistic regression classifier on the vectorized training data.
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(
    C=1.0,
    solver="liblinear",
    max_iter=1000,
)
# fit() is the last expression, so the fitted estimator is the cell output.
model.fit(X_train, y_train)

In [4]:
import pickle as pk

# Regularization strength recorded in the artifact name. Keeping it in one
# variable lets the filename follow the value instead of hard-coding it in an
# f-string that had no placeholder.
C = 1.0

# Path of the pickled (DictVectorizer, model) artifact, e.g. 'model_C=1.0.bin'.
output_file = f"model_C={C}.bin"
output_file


'model_C=1.0.bin'

In [None]:
# first method: manage the file handle by hand.
# The try/finally guarantees the handle is closed (flushing buffered bytes)
# even if pickling raises; without it a failed dump would leak the open file.
f_out = open(output_file, "wb")
try:
    # Store the vectorizer and the model together as one tuple.
    pk.dump((dv, model), f_out)
finally:
    f_out.close()

In [None]:
# Save the vectorizer and the model together as a single pickled tuple.
# The context manager closes the file when the block exits.
with open(output_file, mode="wb") as f_out:
    pk.dump((dv, model), f_out)

In [5]:
# Load the pickled (vectorizer, model) tuple back from disk and rebind both
# names.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
with open(output_file, mode="rb") as f_in:
    dv, model = pk.load(f_in)

In [9]:
# Display the current `model` object (rebound by the load cell above when the
# cells are run in order).
model

In [7]:
# Display the current `dv` object (rebound by the load cell above when the
# cells are run in order).
dv