In [1]:
import pandas as pd
import numpy as np

In [2]:
# Fractions of the dataset assigned to the train / dev / test splits.
train_size, dev_size, test_size = 0.7, 0.1, 0.2

In [3]:
# Read the raw CSV and give four of its columns more descriptive names.
column_renames = {
    'default': 'credit_default',
    'loan': 'personal_loan',
    'contact': 'contact_type',
    'poutcome': 'previous_campaign',
}
df = pd.read_csv("../data/raw/bank-full.csv").rename(columns=column_renames)
columns = df.columns

In [4]:
# Shuffle the rows: frac=1 draws every row once, in random order, and the
# fixed seed makes that order reproducible from run to run.
shuffle_seed = 42
df = df.sample(frac=1.0, random_state=shuffle_seed)
df.dtypes

age                   int64
job                  object
marital              object
education            object
credit_default       object
balance               int64
housing              object
personal_loan        object
contact_type         object
day                   int64
month                object
duration              int64
campaign              int64
pdays                 int64
previous              int64
previous_campaign    object
target               object
dtype: object

In [5]:
# Convert the target labels to integers: "yes" -> 1, "no" -> 0.
clean_up_data = {"target": {"yes": 1, "no": 0}}

# Map and cast explicitly rather than relying on DataFrame.replace to downcast
# the object column to int. That implicit downcast is deprecated (pandas >= 2.2);
# without it `target` would stay object and the one-hot encoding step, which
# selects object columns, would encode the label as a categorical feature.
# The dtype guard keeps this cell safe to re-run once the column is already integer.
if not pd.api.types.is_integer_dtype(df["target"]):
    # Any label other than "yes"/"no" maps to NaN, which makes astype raise
    # instead of letting an unexpected value pass through silently.
    df["target"] = df["target"].map(clean_up_data["target"]).astype("int64")


In [6]:
# Display the column dtypes again to confirm that `target` is now an integer column.
df.dtypes

age                   int64
job                  object
marital              object
education            object
credit_default       object
balance               int64
housing              object
personal_loan        object
contact_type         object
day                   int64
month                object
duration              int64
campaign              int64
pdays                 int64
previous              int64
previous_campaign    object
target                int64
dtype: object

In [7]:
# One-hot encode the categorical (object) columns with pandas.get_dummies.
# Background on one-hot vs. label encoding:
# https://www.analyticsvidhya.com/blog/2020/03/one-hot-encoding-vs-label-encoding-using-scikit-learn/

# Split the frame by dtype: integer columns are kept as they are, object
# columns are expanded into indicator columns.
df_num = df.select_dtypes(include=[int])
df_obj = df.select_dtypes(include=[object])

# Request an explicit integer dtype for the indicator columns. Since pandas 2.0
# get_dummies returns bool columns by default, and mixing bool with the int64
# columns would turn df.values into an object array instead of a numeric one.
df_dummy = pd.get_dummies(data=df_obj, columns=df_obj.columns, dtype=np.uint8)

# Integer columns first, indicator columns after, joined side by side.
df = pd.concat([df_num, df_dummy], axis=1)

In [8]:

# Snapshot the encoded frame as a NumPy array, keeping its shape and the
# column labels available next to it.
data_array = df.values
data_shape = data_array.shape
columns = df.columns
# Show the column labels as the cell output.
columns

Index(['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous',
       'target', 'job_admin.', 'job_blue-collar', 'job_entrepreneur',
       'job_housemaid', 'job_management', 'job_retired', 'job_self-employed',
       'job_services', 'job_student', 'job_technician', 'job_unemployed',
       'job_unknown', 'marital_divorced', 'marital_married', 'marital_single',
       'education_primary', 'education_secondary', 'education_tertiary',
       'education_unknown', 'credit_default_no', 'credit_default_yes',
       'housing_no', 'housing_yes', 'personal_loan_no', 'personal_loan_yes',
       'contact_type_cellular', 'contact_type_telephone',
       'contact_type_unknown', 'month_apr', 'month_aug', 'month_dec',
       'month_feb', 'month_jan', 'month_jul', 'month_jun', 'month_mar',
       'month_may', 'month_nov', 'month_oct', 'month_sep',
       'previous_campaign_failure', 'previous_campaign_other',
       'previous_campaign_success', 'previous_campaign_unknown'],
      dtype='

In [9]:
# Row counts for each split, derived from the configured fractions.
n_rows = data_array.shape[0]
train_length = round(n_rows * train_size)
dev_length = round(n_rows * dev_size)
test_length = round(n_rows * test_size)

# Slice consecutive row ranges: train first, then dev; the test split takes
# every remaining row after the dev split ends.
dev_end = train_length + dev_length
train_array = data_array[:train_length]
dev_array = data_array[train_length:dev_end]
test_array = data_array[dev_end:]

In [10]:
# Write each split to its own .npy file in the interim data directory.
for split_path, split_array in (
    ("../data/interim/train_data.npy", train_array),
    ("../data/interim/dev_data.npy", dev_array),
    ("../data/interim/test_data.npy", test_array),
):
    np.save(split_path, split_array)

In [11]:
# Sanity check: read the dev split back from disk and show its shape.
# allow_pickle=True lets NumPy unpickle object arrays; the file loaded here is
# the dev split this notebook saved in the previous cell.
dev_data_path = "../data/interim/dev_data.npy"
sample_test = np.load(dev_data_path, allow_pickle=True)
sample_test.shape

(4521, 52)