In [5]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression

In [7]:
# Location of the semicolon-separated bank-full.csv file.
# NOTE: this is an absolute path on the author's machine; adjust it to run elsewhere.
DATA_PATH = '/Users/carolchen/Desktop/bank-full.csv'
data = pd.read_csv(DATA_PATH, sep=';')

In [8]:
# Preview the loaded frame; the notebook renders a truncated head/tail view.
data

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,technician,married,tertiary,no,825,no,no,cellular,17,nov,977,3,-1,0,unknown,yes
45207,71,retired,divorced,primary,no,1729,no,no,cellular,17,nov,456,2,-1,0,unknown,yes
45208,72,retired,married,secondary,no,5715,no,no,cellular,17,nov,1127,5,184,3,success,yes
45209,57,blue-collar,married,secondary,no,668,no,no,telephone,17,nov,508,4,-1,0,unknown,no


In [9]:
# Summarise each column's dtype and non-null count.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   default    45211 non-null  object
 5   balance    45211 non-null  int64 
 6   housing    45211 non-null  object
 7   loan       45211 non-null  object
 8   contact    45211 non-null  object
 9   day        45211 non-null  int64 
 10  month      45211 non-null  object
 11  duration   45211 non-null  int64 
 12  campaign   45211 non-null  int64 
 13  pdays      45211 non-null  int64 
 14  previous   45211 non-null  int64 
 15  poutcome   45211 non-null  object
 16  y          45211 non-null  object
dtypes: int64(7), object(10)
memory usage: 5.9+ MB


In [10]:
# Split the frame into predictors (every column except 'y') and the target column 'y'.
X = data.drop(columns='y')
y = data['y']

In [11]:
def get_categorical_features(df):
    """Return the names of the columns of ``df`` whose dtype is ``object``.

    The names are returned as a list, in the frame's column order.
    """
    object_columns = []
    for name in df.columns:
        if df[name].dtype == 'object':
            object_columns.append(name)
    return object_columns

In [12]:
# List the object-dtype columns of X.
get_categorical_features(X)

['job',
 'marital',
 'education',
 'default',
 'housing',
 'loan',
 'contact',
 'month',
 'poutcome']

In [13]:
def get_uniques(df, columns):
    """Map each name in ``columns`` to the list of distinct values in that column.

    Values appear in the order ``Series.unique()`` yields them, i.e. order of
    first appearance.
    """
    uniques_by_column = {}
    for name in columns:
        distinct_values = df[name].unique()
        uniques_by_column[name] = list(distinct_values)
    return uniques_by_column

In [14]:
# Show the distinct values of every object-dtype column of X.
get_uniques(X, get_categorical_features(X))

{'job': ['management',
  'technician',
  'entrepreneur',
  'blue-collar',
  'unknown',
  'retired',
  'admin.',
  'services',
  'self-employed',
  'unemployed',
  'housemaid',
  'student'],
 'marital': ['married', 'single', 'divorced'],
 'education': ['tertiary', 'secondary', 'unknown', 'primary'],
 'default': ['no', 'yes'],
 'housing': ['yes', 'no'],
 'loan': ['no', 'yes'],
 'contact': ['unknown', 'cellular', 'telephone'],
 'month': ['may',
  'jun',
  'jul',
  'aug',
  'oct',
  'nov',
  'dec',
  'jan',
  'feb',
  'mar',
  'apr',
  'sep'],
 'poutcome': ['unknown', 'failure', 'other', 'success']}

In [16]:
# Recode the literal 'unknown' category as a missing value so that pandas'
# NaN tooling (isna, fillna, ...) can count and handle it.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
X = X.replace('unknown', np.nan)

# Missing values per column after the recode.
X.isna().sum()

age              0
job            288
marital          0
education     1857
default          0
balance          0
housing          0
loan             0
contact      13020
day              0
month            0
duration         0
campaign         0
pdays            0
previous         0
poutcome     36959
dtype: int64

In [17]:
# 'poutcome' is missing in 36959 of the 45211 rows, so the column is dropped
# (presumably because it carries too little information to impute).
# Reassigning with errors='ignore' keeps this cell safe to re-run: the
# previous in-place drop raised KeyError once the column was already gone.
X = X.drop(columns='poutcome', errors='ignore')

In [18]:
# Re-inspect the distinct values of X's object-dtype columns at this point.
get_uniques(X, get_categorical_features(X))

{'job': ['management',
  'technician',
  'entrepreneur',
  'blue-collar',
  nan,
  'retired',
  'admin.',
  'services',
  'self-employed',
  'unemployed',
  'housemaid',
  'student'],
 'marital': ['married', 'single', 'divorced'],
 'education': ['tertiary', 'secondary', nan, 'primary'],
 'default': ['no', 'yes'],
 'housing': ['yes', 'no'],
 'loan': ['no', 'yes'],
 'contact': [nan, 'cellular', 'telephone'],
 'month': ['may',
  'jun',
  'jul',
  'aug',
  'oct',
  'nov',
  'dec',
  'jan',
  'feb',
  'mar',
  'apr',
  'sep']}

In [19]:
# Columns holding exactly two values ('yes' / 'no'); encoded as 0/1.
binary_features = [
    'default',
    'housing',
    'loan',
]

# Columns encoded by the rank of each value in an explicit ordering.
ordinal_features = [
    'education',
    'month',
]

# Unordered categories; expanded into one indicator column per value.
nominal_features = [
    'job',
    'marital',
    'contact',
]

In [20]:
def binary_encode(df, columns, positive_label):
    """Return a copy of ``df`` with the given columns recoded to 0/1.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    columns : iterable of str
        Names of the columns to recode.
    positive_label : object
        The value mapped to 1. Every other value, including a missing one,
        is mapped to 0.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which each listed column holds int64 zeros and ones.
    """
    df = df.copy()
    for column in columns:
        # Vectorised comparison instead of a per-row lambda. NaN/None never
        # compare equal to the label, so missing values still become 0, and
        # the explicit cast gives an integer column even when it is empty.
        df[column] = (df[column] == positive_label).astype('int64')
    return df

In [None]:
# Recode the yes/no columns to 1/0.
# NOTE: not idempotent - running this on already-encoded columns maps every
# value to 0, because neither 0 nor 1 equals 'yes'.
X = binary_encode(df=X, columns=binary_features, positive_label='yes')

In [None]:
def ordinal_encode(df, columns, orderings):
    """Return a copy of ``df`` with ordinal columns replaced by integer ranks.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    columns : sequence of str
        Names of the columns to encode.
    orderings : sequence of sequences
        ``orderings[i]`` lists the labels of ``columns[i]`` from lowest to
        highest; a label is encoded as its position in that list. The two
        sequences are paired with ``zip``, so surplus entries in the longer
        one are ignored.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the listed columns encoded. Missing values
        (NaN / None) are left missing, which makes such a column float.

    Raises
    ------
    ValueError
        If a column contains a non-missing label absent from its ordering.
    """
    df = df.copy()
    for column, ordering in zip(columns, orderings):
        # Label -> rank lookup; setdefault keeps the first position of a
        # repeated label, matching list.index().
        rank = {}
        for position, label in enumerate(ordering):
            rank.setdefault(label, position)

        values = df[column]
        # Use pandas' own missing-value test rather than comparing str(x)
        # with 'nan': that check missed None and misread a literal 'nan' label.
        unexpected = values[values.notna() & ~values.isin(list(rank))]
        if not unexpected.empty:
            raise ValueError(
                f"Column {column!r} contains labels missing from its ordering: "
                f"{unexpected.unique().tolist()}"
            )
        df[column] = values.map(rank)
    return df

In [None]:
# Lowest-to-highest label order for each ordinal feature; a label's position
# in its list becomes its numeric code.
education_ordering = [
    'primary',
    'secondary',
    'tertiary',
]
month_ordering = [
    'jan', 'feb', 'mar', 'apr', 'may', 'jun',
    'jul', 'aug', 'sep', 'oct', 'nov', 'dec',
]

# Must line up one-to-one with ordinal_features.
orderings = [education_ordering, month_ordering]

X = ordinal_encode(df=X, columns=ordinal_features, orderings=orderings)

In [None]:
def onehot_encode(df, columns, prefix=False):
    """Return a copy of ``df`` with the given columns one-hot encoded.

    Each listed column is removed and its indicator columns (one per distinct
    value, built with ``pandas.get_dummies``) are appended on the right, in
    the order the columns were listed.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    columns : iterable of str
        Names of the columns to expand.
    prefix : bool, default False
        If False, indicator columns are named after the raw category values.
        Two features sharing a value then yield duplicate column names. If
        True, each indicator is named ``"<column>_<value>"`` so names stay
        unique across features.

    Returns
    -------
    pandas.DataFrame
    """
    columns = list(columns)
    # Build every indicator block first, then join once, rather than
    # re-concatenating and dropping in place on each iteration.
    dummy_frames = [
        pd.get_dummies(df[column], prefix=column if prefix else None)
        for column in columns
    ]
    remaining = df.drop(columns=columns)
    return pd.concat([remaining] + dummy_frames, axis=1)

In [None]:
# Expand each nominal feature into one indicator column per category.
X = onehot_encode(df=X, columns=nominal_features)

In [21]:
# Inspect the feature frame.
# NOTE(review): the recorded output below still shows raw string categories,
# so the encoding cells marked "In [None]" had not been run when it was captured.
X

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous
0,58,management,married,tertiary,no,2143,yes,no,,5,may,261,1,-1,0
1,44,technician,single,secondary,no,29,yes,no,,5,may,151,1,-1,0
2,33,entrepreneur,married,secondary,no,2,yes,yes,,5,may,76,1,-1,0
3,47,blue-collar,married,,no,1506,yes,no,,5,may,92,1,-1,0
4,33,,single,,no,1,no,no,,5,may,198,1,-1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,technician,married,tertiary,no,825,no,no,cellular,17,nov,977,3,-1,0
45207,71,retired,divorced,primary,no,1729,no,no,cellular,17,nov,456,2,-1,0
45208,72,retired,married,secondary,no,5715,no,no,cellular,17,nov,1127,5,184,3
45209,57,blue-collar,married,secondary,no,668,no,no,telephone,17,nov,508,4,-1,0


In [25]:
# Count the missing values remaining in each column of X.
X.isna().sum()

age              0
job            288
marital          0
education     1857
default          0
balance          0
housing          0
loan             0
contact      13020
day              0
month            0
duration         0
campaign         0
pdays            0
previous         0
dtype: int64

In [28]:
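# TODO: impute the missing 'education' codes (e.g. with the column median) before modelling.
# Only valid once ordinal_encode has converted 'education' to numeric ranks.
#X['education'] = X['education'].fillna(X['education'].median())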

In [29]:
# Standardise every column of X. All columns must already be numeric: the
# ValueError recorded below ("could not convert string to float") occurs when
# the encoding cells have not been run and X still holds string categories.
scaler = StandardScaler()

# Rebuild the frame from the scaled values, restoring both the column labels
# and the original row index so that rows stay aligned with y.
X = pd.DataFrame(scaler.fit_transform(X), index=X.index, columns=X.columns)

ValueError: could not convert string to float: 'management'