In [1]:
import pandas as pd
import numpy as np
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [2]:
# Load the development sample so the baseline model can be tried on a small dataset.
# The saved array is plain numeric (its repr shows no object dtype), so pickle
# support is not required. Leaving allow_pickle at its default (False) keeps
# np.load from unpickling -- and thereby executing -- arbitrary code should the
# file ever be swapped for something untrusted.
dev_data = np.load("../data/interim/dev_data.npy")
dev_data

array([[  46, 1252,   17, ...,    0,    0,    1],
       [  39, 8152,   20, ...,    0,    0,    1],
       [  50,    0,   28, ...,    0,    0,    1],
       ...,
       [  59, 4949,   12, ...,    0,    0,    1],
       [  35,    2,    3, ...,    0,    0,    1],
       [  29,  444,   21, ...,    0,    0,    1]])

In [3]:
# Column labels for the dev array, listed in the order the labels are applied.
# These are the names of the dummy-encoded frame (presumably derived from
# ../data/raw/bank-full.csv -- confirm against the preprocessing step).
_numeric_features = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']

# Each categorical prefix maps to its levels; a dummy column is "<prefix>_<level>".
# Insertion order matters: it determines the order of the generated labels.
_categorical_levels = {
    'job': ['admin.', 'blue-collar', 'entrepreneur', 'housemaid', 'management',
            'retired', 'self-employed', 'services', 'student', 'technician',
            'unemployed', 'unknown'],
    'marital': ['divorced', 'married', 'single'],
    'education': ['primary', 'secondary', 'tertiary', 'unknown'],
    'credit_default': ['no', 'yes'],
    'housing': ['no', 'yes'],
    'personal_loan': ['no', 'yes'],
    'contact_type': ['cellular', 'telephone', 'unknown'],
    'month': ['apr', 'aug', 'dec', 'feb', 'jan', 'jul', 'jun', 'mar', 'may',
              'nov', 'oct', 'sep'],
    'previous_campaign': ['failure', 'other', 'success', 'unknown'],
}

# Numeric features first, then the label column, then every dummy column.
columns = [
    *_numeric_features,
    'target',
    *(f'{prefix}_{level}'
      for prefix, levels in _categorical_levels.items()
      for level in levels),
]

In [4]:
# Wrap the raw numpy matrix in a DataFrame, labelling each column with the
# corresponding entry of `columns`, then preview the first few rows.
dev_data = pd.DataFrame(data=dev_data, columns=columns)
dev_data.head()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,target,job_admin.,job_blue-collar,...,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,previous_campaign_failure,previous_campaign_other,previous_campaign_success,previous_campaign_unknown
0,46,1252,17,59,1,-1,0,0,1,0,...,0,0,0,1,0,0,0,0,0,1
1,39,8152,20,80,1,-1,0,0,0,1,...,0,0,0,1,0,0,0,0,0,1
2,50,0,28,39,4,-1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,37,215,6,308,3,-1,0,0,0,1,...,0,0,1,0,0,0,0,0,0,1
4,39,281,20,78,7,-1,0,0,0,1,...,1,0,0,0,0,0,0,0,0,1


In [5]:
# Always start by building a simple ("stupid") baseline model and see what it achieves; then introduce machine learning to measure how much ML actually improves on that baseline.
# If you want to solve problems and build products, always start with a stupid model.
# Experimenting with baseline models is quick and low-cost.
# Reference: https://blog.insightdatascience.com/always-start-with-a-stupid-model-no-exceptions-3a22314b9aaa


In [6]:
# Separate the label column from the predictor columns.
y = dev_data['target']
x = dev_data.drop(columns='target')

# Hold out a test partition using the default split proportions; the fixed
# seed keeps the partition reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(x, y, random_state=0)

# Baseline that guesses among the classes uniformly at random (seeded so the
# guesses are repeatable). "Fitting" it is the last expression so the
# estimator repr is displayed.
dummy = DummyClassifier(strategy='uniform', random_state=1)
dummy.fit(X_train, y_train)

DummyClassifier(random_state=1, strategy='uniform')

In [7]:
# Score the random baseline on the held-out split (mean accuracy for a classifier).
dummy.score(X=X_test, y=y_test)

0.5137046861184792

In [8]:
# Generate baseline predictions for the held-out rows and show per-class
# precision / recall / F1. print() is kept because the report is a multi-line string.
y_pred = dummy.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.88      0.52      0.65       996
           1       0.12      0.47      0.19       135

    accuracy                           0.51      1131
   macro avg       0.50      0.50      0.42      1131
weighted avg       0.79      0.51      0.60      1131

