Procedure:
* Identify a suitable preprocessing method for data transformation (continuous to discrete).
* Compute
  * 1) the size of possible instances,
  * 2) the size of hypothesis space (the number of possible extensions), and
  * 3) the size of hypothesis space, taking into account only the number of possible conjunctive concepts according to the descriptions in Section 4.1 of the main literature.
* Implement the algorithm and verify that it works as expected.
* Compute the accuracy of the model and report the generated model, i.e., the conjunctive rule.


---
Import libraries:

In [603]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import array
import collections
from sklearn.preprocessing import KBinsDiscretizer
import sys
from sklearn.model_selection import train_test_split
from tensorflow import keras

Import the dataset and setting relevant column names:

In [604]:
Dataset = pd.read_csv("/content/spambase.csv")
# Read the ".data" file as if it was ".csv" but keep the .data file ending
# Make sure the database can be read before executing.
# Google Colab removes the spambase files regularly so you may need to upload them again.

Dataset.columns = ['word_freq_make', 'word_freq_address', 'word_freq_all', 'word_freq_3d', 'word_freq_our', 'word_freq_over','word_freq_remove', 'word_freq_internet', 'word_freq_order', 'word_freq_mail', 'word_freq_receive', 'word_freq_will', 'word_freq_people', 'word_freq_report', 'word_freq_addresses', 'word_freq_free', 'word_freq_business', 'word_freq_email', 'word_freq_you', 'word_freq_credit', 'word_freq_your', 'word_freq_font', 'word_freq_000', 'word_freq_money', 'word_freq_hp', 'word_freq_hpl', 'word_freq_george', 'word_freq_650', 'word_freq_lab', 'word_freq_labs', 'word_freq_telnet', 'word_freq_857', 'word_freq_data', 'word_freq_415', 'word_freq_85', 'word_freq_technology', 'word_freq_1999', 'word_freq_parts', 'word_freq_pm', 'word_freq_direct', 'word_freq_cs', 'word_freq_meeting', 'word_freq_original', 'word_freq_project', 'word_freq_re', 'word_freq_edu', 'word_freq_table', 'word_freq_conference', 'word_freq_;', 'word_freq_(', 'word_freq_[', 'word_freq_!', 'word_freq_$', 'word_freq_#', 'capital_run_length_average', 'capital_run_length_longest', 'capital_run_length_total', 'is_spam']


In [605]:
Dataset

Unnamed: 0,word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,...,word_freq_;,word_freq_(,word_freq_[,word_freq_!,word_freq_$,word_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total,is_spam
0,0.21,0.28,0.50,0.0,0.14,0.28,0.21,0.07,0.00,0.94,...,0.000,0.132,0.0,0.372,0.180,0.048,5.114,101,1028,1
1,0.06,0.00,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.010,0.143,0.0,0.276,0.184,0.010,9.821,485,2259,1
2,0.00,0.00,0.00,0.0,0.63,0.00,0.31,0.63,0.31,0.63,...,0.000,0.137,0.0,0.137,0.000,0.000,3.537,40,191,1
3,0.00,0.00,0.00,0.0,0.63,0.00,0.31,0.63,0.31,0.63,...,0.000,0.135,0.0,0.135,0.000,0.000,3.537,40,191,1
4,0.00,0.00,0.00,0.0,1.85,0.00,0.00,1.85,0.00,0.00,...,0.000,0.223,0.0,0.000,0.000,0.000,3.000,15,54,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4595,0.31,0.00,0.62,0.0,0.00,0.31,0.00,0.00,0.00,0.00,...,0.000,0.232,0.0,0.000,0.000,0.000,1.142,3,88,0
4596,0.00,0.00,0.00,0.0,0.00,0.00,0.00,0.00,0.00,0.00,...,0.000,0.000,0.0,0.353,0.000,0.000,1.555,4,14,0
4597,0.30,0.00,0.30,0.0,0.00,0.00,0.00,0.00,0.00,0.00,...,0.102,0.718,0.0,0.000,0.000,0.000,1.404,6,118,0
4598,0.96,0.00,0.00,0.0,0.32,0.00,0.00,0.00,0.00,0.00,...,0.000,0.057,0.0,0.000,0.000,0.000,1.147,5,78,0


In [606]:
x = Dataset.iloc[:,: -3]
y = Dataset.iloc[:,-1]
pos_res = x.iloc[: 1812] 
neg_res = x.iloc[1813 :]

Preprocessing method: Discretization using "sklearn.preprocessing.KBinsDiscretizer"

In [607]:
number_of_bins = 5
Dataset_discrete = KBinsDiscretizer(n_bins=number_of_bins, encode="ordinal", strategy="uniform").fit_transform(pos_res)
# Info: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.KBinsDiscretizer.html

In [608]:
Dataset_discrete
type(Dataset_discrete)

numpy.ndarray

In [609]:
Dataset_discrete.shape
Dataset_discrete.astype(np.int64)

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

The array "Dataset_discrete" can now be used.

Lowest to highest value found in each column: 0 - number_of_bins-1.

In [610]:
# Filter out all instances of not spam
# D = np.empty(shape=(0,55))
# for x in Dataset_discrete:
#   if x[-1] == number_of_bins-1:
#     D = np.append(D, np.array([x]), axis=0)
#     print(D)

# D will only contains positive results of spam

In [611]:
#number of instances
unique, counts = np.unique(Dataset_discrete, return_counts=True)
instances = dict(zip(unique, counts))
print(instances)
#unique instances
instance = 1
data = np.ndarray.tolist(Dataset_discrete)
for i in range(0, 58):
  list = []
  instance *= len(set(data[i]))
print(instance)

{0.0: 95815, 1.0: 3063, 2.0: 550, 3.0: 128, 4.0: 104}
14427791579676672


In [612]:
#Hypothesis space
# sys.set_int_max_str_digits(14495514624)
# space_size = 5**14495514624
# space_size

In [613]:
#4.2
def LGG_conj(x, y):
  z = [i for i in x if i in y]
  return z
  

In [614]:
#First instance ("D" contains only spam)
#4.1

def LGG(Dataset_discrete):
  H = []
  x = Dataset_discrete[0]
  H = np.append(x, H)
  for i in range(len(Dataset_discrete)):
    x = Dataset_discrete[i]
    H = LGG_conj(H, x)
    return H


In [615]:
# Select ratio
ratio = 0.8

total_rows = Dataset_discrete.shape[0]
train_size = int(total_rows*ratio)

# Split data into test and train
train = Dataset_discrete[0:train_size]
test = Dataset_discrete[train_size:]
print(LGG(train))
print(LGG(test))

[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]


In [616]:
y_true = LGG(train)
y_pred = LGG(test)
def compute_accuracy(y_true, y_pred):
    correct_predictions = 0
    # iterate over each label and check if it is True
    for true, predicted in zip(y_true, y_pred):
        if true == predicted:
            correct_predictions += 1
    # compute the accuracy using the true values
    accuracy = (correct_predictions/len(y_true))
    return accuracy
compute_accuracy(y_true, y_pred)

0.9818181818181818