In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the semicolon-delimited customer export and discard the `duration`
# column in the same expression, then preview the first rows.
# NOTE(review): the reason for dropping `duration` is not stated in this
# notebook -- presumably it is unavailable at prediction time; confirm.
df = pd.read_csv('data/customers.csv', sep=';').drop(columns=['duration'])
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,1,-1,0,unknown,no


In [3]:
# Feature matrix: every column except the last one (the target `y`).
# The frame mixes numbers and strings, so the result is an object array.
X = df.iloc[:, :-1].to_numpy()
X

array([[58, 'management', 'married', ..., -1, 0, 'unknown'],
       [44, 'technician', 'single', ..., -1, 0, 'unknown'],
       [33, 'entrepreneur', 'married', ..., -1, 0, 'unknown'],
       ...,
       [72, 'retired', 'married', ..., 184, 3, 'success'],
       [57, 'blue-collar', 'married', ..., -1, 0, 'unknown'],
       [37, 'entrepreneur', 'married', ..., 188, 11, 'other']],
      dtype=object)

In [4]:
# Target vector: the last column of the frame ('yes' / 'no' labels).
y = df.iloc[:, -1].to_numpy()
y

array(['no', 'no', 'no', ..., 'yes', 'no', 'no'], dtype=object)

How do we want to encode this?

One-hot encoding:
- job
- marital
- contact
- poutcome

Ordinal encoding:
- education
- default
- housing
- loan
- month
- y

No encoding:
- age
- balance
- day
- campaign
- pdays
- previous

In [9]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

# Column positions refer to X (the frame without the trailing `y` column):
#   0 age, 1 job, 2 marital, 3 education, 4 default, 5 balance, 6 housing,
#   7 loan, 8 contact, 9 day, 10 month, 11 campaign, 12 pdays, 13 previous,
#   14 poutcome
nominal = [1, 2, 8, 14]      # job, marital, contact, poutcome
ordinal = [3, 4, 6, 7, 10]   # education, default, housing, loan, month

# OrdinalEncoder ranks labels in sorted (alphabetical) order by default, which
# would encode months as apr < aug < dec < ... instead of calendar order.
# `month` therefore gets its own encoder with an explicit category order; the
# remaining ordinal columns keep the default ordering.
month_col = [10]
ordinal_default_order = [col for col in ordinal if col not in month_col]
# NOTE(review): assumes month labels are lowercase three-letter abbreviations
# (only 'may' is visible in the preview above). Fitting raises ValueError on
# any label missing from this list, so a mismatch cannot pass silently.
month_order = ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
               'jul', 'aug', 'sep', 'oct', 'nov', 'dec']

# Output layout is unchanged: one-hot columns, then the ordinal columns with
# month last, then the untouched numeric columns from `remainder`.
ct = ColumnTransformer(
    transformers=[
        ('onehot_encoder', OneHotEncoder(), nominal),
        ('ordinal_encoder', OrdinalEncoder(), ordinal_default_order),
        ('month_encoder', OrdinalEncoder(categories=[month_order]), month_col),
    ],
    remainder='passthrough',
    # Always stack to a dense array. With the default threshold the result can
    # come back as a scipy sparse matrix, which np.asarray would wrap into a
    # useless 0-d object array.
    sparse_threshold=0,
)

# NOTE: this overwrites X with its encoded form, so the cell must not be
# re-run without first re-running the cell that builds X from `df`.
X = np.asarray(ct.fit_transform(X))

# Keep a copy of the fully encoded feature matrix for inspection.
pd.DataFrame(X).to_csv('data/x_encoded.csv')

In [10]:
from sklearn.preprocessing import LabelEncoder

# Map the string target to integers; the outputs above and below show
# 'no' becoming 0 and 'yes' becoming 1. `lab_enc` is kept so the mapping
# can be inverted later.
lab_enc = LabelEncoder().fit(y)
y = lab_enc.transform(y)
y

array([0, 0, 0, ..., 1, 0, 0])

In [11]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across kernel restarts.
splits = train_test_split(X, y, random_state=1, test_size=0.2)
X_train, X_test, y_train, y_test = splits

In [16]:
from sklearn.preprocessing import StandardScaler

# Standardise the trailing seven columns of the training split in place.
# The scaler is fitted on training rows only so that no test-set statistics
# leak into the features.
# NOTE(review): only six numeric columns pass through the ColumnTransformer
# (age, balance, day, campaign, pdays, previous), so a width of 7 also covers
# the last ordinal-encoded column (month). This looks like a leftover from
# before `duration` was dropped -- confirm. Any change must be mirrored in the
# cell that transforms X_test.
sc = StandardScaler()
tail = slice(-7, None)
X_train[:, tail] = sc.fit_transform(X_train[:, tail])
X_train

array([[1.0, 0.0, 0.0, ..., -0.24745481214336104, -0.4100383077247772,
        -0.2425228949874718],
       [1.0, 0.0, 0.0, ..., -0.5662418342484753, -0.4100383077247772,
        -0.2425228949874718],
       [0.0, 1.0, 0.0, ..., -0.5662418342484753, 2.2114757997075096,
        1.0119321842765145],
       ...,
       [1.0, 0.0, 0.0, ..., 1.027693276277096, -0.4100383077247772,
        -0.2425228949874718],
       [1.0, 0.0, 0.0, ..., 2.6216283868026675, -0.4100383077247772,
        -0.2425228949874718],
       [0.0, 1.0, 0.0, ..., -0.24745481214336104, 2.9619092274075913,
        0.593780491188519]], dtype=object)

In [17]:
# Apply the scaler fitted on the training split to the same trailing seven
# columns of the test split (transform only -- never re-fit on test data).
# NOTE(review): the slice width must match the one used when fitting `sc`;
# a width of 7 appears to include the ordinal-encoded month column -- confirm.
X_test_tail = sc.transform(X_test[:, -7:])
X_test[:, -7:] = X_test_tail
X_test

array([[0.0, 1.0, 0.0, ..., 0.3901192320668675, -0.4100383077247772,
        -0.2425228949874718],
       [1.0, 0.0, 0.0, ..., -0.24745481214336104, -0.4100383077247772,
        -0.2425228949874718],
       [1.0, 0.0, 0.0, ..., 0.3901192320668675, -0.4100383077247772,
        -0.2425228949874718],
       ...,
       [1.0, 0.0, 0.0, ..., 1.027693276277096, -0.4100383077247772,
        -0.2425228949874718],
       [1.0, 0.0, 0.0, ..., 3.5779894531180103, -0.4100383077247772,
        -0.2425228949874718],
       [1.0, 0.0, 0.0, ..., 0.3901192320668675, -0.4100383077247772,
        -0.2425228949874718]], dtype=object)

In [37]:
from pathlib import Path

# Persist each split as a header-plus-values CSV without the index column.
# The target directories are created first: to_csv raises OSError when the
# parent directory does not exist (e.g. on a fresh checkout).
split_outputs = {
    'data/train/x_train.csv': X_train,
    'data/test/x_test.csv': X_test,
    'data/train/y_train.csv': y_train,
    'data/test/y_test.csv': y_test,
}
for csv_path, split in split_outputs.items():
    Path(csv_path).parent.mkdir(parents=True, exist_ok=True)
    pd.DataFrame(split).to_csv(csv_path, index=False)