In [121]:
import pandas as pd
import numpy as np
import os

from sklearn.preprocessing import OrdinalEncoder

### Upload Data

In [122]:
path = os.getcwd()
train = pd.read_csv(f"{path}\\train.csv", sep=';')
test = pd.read_csv(f"{path}\\test.csv", sep=';')
data = pd.concat([train, test])
data.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [123]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 49732 entries, 0 to 4520
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        49732 non-null  int64 
 1   job        49732 non-null  object
 2   marital    49732 non-null  object
 3   education  49732 non-null  object
 4   default    49732 non-null  object
 5   balance    49732 non-null  int64 
 6   housing    49732 non-null  object
 7   loan       49732 non-null  object
 8   contact    49732 non-null  object
 9   day        49732 non-null  int64 
 10  month      49732 non-null  object
 11  duration   49732 non-null  int64 
 12  campaign   49732 non-null  int64 
 13  pdays      49732 non-null  int64 
 14  previous   49732 non-null  int64 
 15  poutcome   49732 non-null  object
 16  y          49732 non-null  object
dtypes: int64(7), object(10)
memory usage: 6.8+ MB


In [124]:
for column in data.columns:
    print(f"--------{column}---------")
    unique_values = data[column].unique()
    print(unique_values)

--------age---------
[58 44 33 47 35 28 42 43 41 29 53 57 51 45 60 56 32 25 40 39 52 46 36 49
 59 37 50 54 55 48 24 38 31 30 27 34 23 26 61 22 21 20 66 62 83 75 67 70
 65 68 64 69 72 71 19 76 85 63 90 82 73 74 78 80 94 79 77 86 95 81 18 89
 84 87 92 93 88]
--------job---------
['management' 'technician' 'entrepreneur' 'blue-collar' 'unknown'
 'retired' 'admin.' 'services' 'self-employed' 'unemployed' 'housemaid'
 'student']
--------marital---------
['married' 'single' 'divorced']
--------education---------
['tertiary' 'secondary' 'unknown' 'primary']
--------default---------
['no' 'yes']
--------balance---------
[ 2143    29     2 ...  8205 14204 16353]
--------housing---------
['yes' 'no']
--------loan---------
['no' 'yes']
--------contact---------
['unknown' 'cellular' 'telephone']
--------day---------
[ 5  6  7  8  9 12 13 14 15 16 19 20 21 23 26 27 28 29 30  2  3  4 11 17
 18 24 25  1 10 22 31]
--------month---------
['may' 'jun' 'jul' 'aug' 'oct' 'nov' 'dec' 'jan' 'feb' 'mar' 'apr

In [125]:
'''
unknown in job
unknown in education
unknown in poutcome
'''
s = (data.dtypes == 'object')
object_cols = list(s[s].index)
print("Categorical variables:")
print(object_cols)

print("\nCategorical variables with 'unknown':")
cols_with_unknown = ['job', 'education', 'poutcome']
print(cols_with_unknown)

print("\nCategorical variables without 'unknown'")
cols_without_unknown = [col for col in object_cols if col not in cols_with_unknown]
print(cols_without_unknown)

Categorical variables:
['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome', 'y']

Categorical variables with 'unknown':
['job', 'education', 'poutcome']

Categorical variables without 'unknown'
['marital', 'default', 'housing', 'loan', 'contact', 'month', 'y']


### 'yes' - 'no' Binary Values

In [126]:
# 'yes' - 'no' binary values
data['default'] = data['default'].replace({'yes':1, 'no':0})
data['housing'] = data['housing'].replace({'yes':1, 'no':0})
data['loan'] = data['loan'].replace({'yes':1, 'no':0})
data['y'] = data['y'].replace({'yes':1, 'no':0})

### 'education'

In [127]:
data.loc[(data.education == "unknown") & (data.job == "admin."),"education"] = "secondary"
data.loc[(data.education == "unknown") & (data.job == "management"),"education"] = "secondary"
data.loc[(data.education == "unknown") & (data.job == "services"),"education"] = "tertiary"
data.loc[(data.education == "unknown") & (data.job == "technician."),"education"] = "secondary"
data.loc[(data.education == "unknown") & (data.job == "retired"),"education"] = "secondary"
data.loc[(data.education == "unknown") & (data.job == "blue-collar"),"education"] = "secondary"
data.loc[(data.education == "unknown") & (data.job == "housemaid."),"education"] = "primary"
data.loc[(data.education == "unknown") & (data.job == "self-employed"),"education"] = "tertiary"
data.loc[(data.education == "unknown") & (data.job == "student"),"education"] = "secondary"
data.loc[(data.education == "unknown") & (data.job == "entrepreneur"),"education"] = "tertiary"
data.loc[(data.education == "unknown") & (data.job == "unemployed"),"education"] = "secondary"
#REST CAN BE SECONDARY
data.loc[(data.education == "unknown"),"education"] = "secondary"
data.education.unique()

array(['tertiary', 'secondary', 'primary'], dtype=object)

In [128]:
data['education'] = data['education'].replace({'primary':1, 'secondary':2, 'tertiary':3})

### 'contact'

In [129]:
data.loc[(data.contact == "unknown") & (data.y == 1),"contact"] = "telephone"
data.loc[(data.contact == "unknown") & (data.y == 0),"contact"] = "cellular"
data.contact.unique()

array(['cellular', 'telephone'], dtype=object)

### 'month'

In [130]:
month_categories = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
for i, mon in enumerate(month_categories):
    data['month'] = data['month'].replace({mon:i+1})

### Rest Categorical Var. 

In [131]:
s = (data.dtypes == 'object')
object_cols = list(s[s].index)
print("Categorical variables:")
print(object_cols)

Categorical variables:
['job', 'marital', 'contact', 'poutcome']


In [132]:
encoder = OrdinalEncoder(dtype=np.int64)
data[object_cols] = encoder.fit_transform(data[object_cols])

In [133]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 49732 entries, 0 to 4520
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   age        49732 non-null  int64
 1   job        49732 non-null  int64
 2   marital    49732 non-null  int64
 3   education  49732 non-null  int64
 4   default    49732 non-null  int64
 5   balance    49732 non-null  int64
 6   housing    49732 non-null  int64
 7   loan       49732 non-null  int64
 8   contact    49732 non-null  int64
 9   day        49732 non-null  int64
 10  month      49732 non-null  int64
 11  duration   49732 non-null  int64
 12  campaign   49732 non-null  int64
 13  pdays      49732 non-null  int64
 14  previous   49732 non-null  int64
 15  poutcome   49732 non-null  int64
 16  y          49732 non-null  int64
dtypes: int64(17)
memory usage: 6.8 MB


# Model

In [134]:
import torch

In [135]:
y = data['y']
X = data.drop(columns = ['y'])

y = y.to_numpy()
X = X.to_numpy()

y = torch.tensor(y, dtype=torch.int64)
X = torch.tensor(X, dtype=torch.int64)

In [148]:
X.size()

torch.Size([49732, 16])

In [ ]:
import torch.nn as nn
import torch.nn.functional as F

In [151]:
class Net(nn.Module):
    def __init__(self):
        super(Net,self).__init_()
        self.fc1 = nn.Linear(16, 160)
        self.fc2 = nn.Linear(160, 80)
        self.fc3 = nn.Linear(80, 40) 
        self.fc3 = nn.Linear(40, 1) ###########neden 1, # of classes=2? dimension?
        self.relu = F.relu()
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.sigmoid(self.fc3(x))