<h1 style="color:#ffc0cb;font-size:70px;font-family:Georgia;text-align:center;"><strong>Rule Learning</strong></h1>

### <b>Author: Nguyen Dang Huynh Chau</b>


# ✴️ Importing Necessary Libraries and datasets
****

In [1]:
# import libraries which are pandas and numpy
import pandas as pd
import numpy as np
from math import *


#for plots
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"]= 15,10

#Libraries for plotting
# Modules for data visualization
import seaborn as sns
sns.set_theme(style="ticks", color_codes=True) #set theme in seaborn
# scatter matrix library
from pandas.plotting import scatter_matrix

#Libraries for feature scaling
from sklearn.preprocessing import StandardScaler

#Libraries for Validation
from sklearn.utils.multiclass import unique_labels
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

In [2]:
# check the version of the packages
print("Numpy version: ", np.__version__)
print("Pandas version: ",pd.__version__)
! python --version

Numpy version:  1.20.3
Pandas version:  1.3.4
Python 3.9.7


# 📲 Data Retrieving
***

In [3]:
df = pd.read_csv("Data/bank-full.csv", delimiter=';', skipinitialspace = True)

df.columns = df.columns.str.replace(' ', '') #strip the extra-whitespaces out

print("The shape of the ORGINAL data is (row, column):", str(df.shape))

# drop Unnamed, it is just a number given to identify each house
df.head(3)

The shape of the ORGINAL data is (row, column): (45211, 17)


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no


# 🔈 Data Information
****

In [4]:
print ("The shape of the train data is (row, column):"+ str(df.shape))
print (df.info())

The shape of the train data is (row, column):(45211, 17)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   default    45211 non-null  object
 5   balance    45211 non-null  int64 
 6   housing    45211 non-null  object
 7   loan       45211 non-null  object
 8   contact    45211 non-null  object
 9   day        45211 non-null  int64 
 10  month      45211 non-null  object
 11  duration   45211 non-null  int64 
 12  campaign   45211 non-null  int64 
 13  pdays      45211 non-null  int64 
 14  previous   45211 non-null  int64 
 15  poutcome   45211 non-null  object
 16  y          45211 non-null  object
dtypes: int64(7), object(10)
memory usage: 5.9+ MB
None


# 🔈 One hot Encoding
****

In [5]:
import category_encoders as ce

encoder=ce.OneHotEncoder(cols=['job', 'marital', 'education','default', 'housing', 'loan', 'contact', 'month', 'poutcome'],
                         handle_unknown='return_nan',
                         return_df=True,use_cat_names=True)
#Fit and transform Data
df = encoder.fit_transform(df)
df

Unnamed: 0,age,job_management,job_technician,job_entrepreneur,job_blue-collar,job_unknown,job_retired,job_admin.,job_services,job_self-employed,...,month_sep,duration,campaign,pdays,previous,poutcome_unknown,poutcome_failure,poutcome_other,poutcome_success,y
0,58,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,261,1,-1,0,1.0,0.0,0.0,0.0,no
1,44,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,151,1,-1,0,1.0,0.0,0.0,0.0,no
2,33,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,76,1,-1,0,1.0,0.0,0.0,0.0,no
3,47,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,92,1,-1,0,1.0,0.0,0.0,0.0,no
4,33,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,198,1,-1,0,1.0,0.0,0.0,0.0,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,977,3,-1,0,1.0,0.0,0.0,0.0,yes
45207,71,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,456,2,-1,0,1.0,0.0,0.0,0.0,yes
45208,72,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1127,5,184,3,0.0,0.0,0.0,1.0,yes
45209,57,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,508,4,-1,0,1.0,0.0,0.0,0.0,no


In [6]:
df.loc[df['y'].isin(['no']), 'y'] = '0'
df.loc[df['y'].isin(['yes']), 'y'] = '1'

# Splitting the training data
****

In [7]:
# separating our independent and dependent variable
X = df.drop(['y'], axis = 1)
#Target variable in y
y = df["y"]

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size = .2, random_state=42)

print("Length of X_train: " + str(len(X_train)))
print("Length of X_test: " + str(len(X_test)))

Length of X_train: 36168
Length of X_test: 9043


# 🔈 Class balancing
****

In [9]:
df['y'].value_counts()

0    39922
1     5289
Name: y, dtype: int64

In [10]:
from imblearn.over_sampling import SMOTE
from collections import Counter
print(Counter(y))
sm = SMOTE(random_state=42, sampling_strategy=0.9)
X, y = sm.fit_resample(X, y)
print(Counter(y))

Counter({'0': 39922, '1': 5289})
Counter({'0': 39922, '1': 35929})


In [11]:
from imblearn.over_sampling import RandomOverSampler
# define oversampling strategy
oversample = RandomOverSampler(random_state=42, sampling_strategy='minority')
print(Counter(y))
# fit and apply the transform
X, y = oversample.fit_resample(X, y)
# summarize class distribution

print(Counter(y))

Counter({'0': 39922, '1': 35929})
Counter({'0': 39922, '1': 39922})


# 🔈 Data Scaling
****

In [12]:
# Feature Scaling
## We will be using RobustScaler to transform
from sklearn.preprocessing import RobustScaler

scale = RobustScaler()

## transforming "train_x"
X_train = scale.fit_transform(X_train)
## transforming "test_x"
X_test = scale.fit_transform(X_test)

In [13]:
import math

def entropy(data, target):
    vCounts = pd.value_counts(data[target])
    dataSize = data.shape[0] # or data[targetClass].size
    entropy_sum = 0
    for value in vCounts:
        proportion = value / dataSize
        entropy_sum = entropy_sum - (proportion * math.log(proportion, 2))
    return entropy_sum

def majority_class(data, target):
    majority = 0
    cl = ''
    vCounts = pd.value_counts(data[target])
    for value in vCounts.axes[0]:
        count = vCounts[value]
        if count > majority:
            majority = count
            cl = value
    return cl
        
def majority_class(data, target):
    counts = pd.value_counts(data[target])
    max_name = counts.idxmax()
    return max_name


def simpler_rule_learner(data, target):
    all_attributes = set(data.columns)
    all_attributes = all_attributes.difference([target,])
    
    while data.shape[0] > 0:
        if entropy(data, target) == 0:
            print("otherwise =>", majority_class(data,target))
            data = data.iloc[0:0]
        else:
            best_entropy = entropy(data, target)
            best_attribute = ''
            best_value = ''
            best_data = data
            
            for attribute in all_attributes:
                vCounts = pd.value_counts(data[attribute])
                for value in vCounts.axes[0]:
                    data2 = data.loc[data[attribute] == value]
                    if entropy(data2, target) < best_entropy:
                        best_entropy = entropy(data2, target)
                        best_attribute = attribute
                        best_value = value
                        best_data = data2
                    print(best_attribute, '=', best_value, '=>', majority_class(best_data,target))
                    data = data.loc[data[best_attribute] != best_value]   


In [15]:
simpler_rule_learner(df, 'y')

poutcome_failure = 0.0 => 0
poutcome_failure = 0.0 => 0
poutcome_failure = 0.0 => 0
job_entrepreneur = 1.0 => 0
job_entrepreneur = 1.0 => 0
job_entrepreneur = 1.0 => 0
job_entrepreneur = 1.0 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balanc

balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 24

balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 24

balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 24

balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 24

balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 24

balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 24

balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 24

balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 246 => 0
balance = 24

KeyError: ''