### Atividade de classificação usando redes RAM, pelo modelo WiSARD do aluno **Thiago Ribeiro Aragão**

In [None]:
# Use the %pip magic so the package is installed into the running kernel's
# environment, and pin the version that the recorded output shows was installed.
%pip install ucimlrepo==0.0.7

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.utils import resample


"""
Classe WiSARD criada
"""
class WiSARD:
    """Weightless (RAM-based) classifier in the WiSARD style, using counting RAMs.

    One discriminator is kept per class label. The input bit vector is cut into
    consecutive groups of ``addresSize`` bits; each group is the address seen by
    one RAM. Training counts how often every (RAM, address) pair occurs for a
    class, and classification sums those counts for the addresses of the sample.
    """

    def __init__(self, addresSize):
        """Create an untrained model.

        Args:
            addresSize: number of input bits addressed by each RAM.
        """
        self.addresSize = addresSize
        # {class label: {(ram index, address bits): times seen during training}}
        self.discriminador = {}

    def generateAddress(self, inputData):
        """Split a 2-D bit matrix into consecutive column groups of ``addresSize`` bits.

        The matrix is right-padded with zero columns so its width becomes a
        multiple of ``addresSize``.

        Args:
            inputData: 2-D numpy array of shape (n_samples, n_bits).

        Returns:
            List of 2-D arrays, one per RAM, each of shape (n_samples, addresSize).
        """
        nBits = inputData.shape[1]
        pad = (self.addresSize - (nBits % self.addresSize)) % self.addresSize
        paddedData = np.pad(inputData, ((0, 0), (0, pad)), 'constant')
        return [
            paddedData[:, start:start + self.addresSize]
            for start in range(0, paddedData.shape[1], self.addresSize)
        ]

    def _ramKeys(self, sample):
        """Return one hashable ``(ram index, address bits)`` key per RAM for a single sample."""
        blocks = self.generateAddress(np.asarray(sample).reshape(1, -1))
        # The RAM index is part of the key: without it, identical bit patterns
        # located at different input positions would share one memory cell.
        return [(ram, tuple(block[0].tolist())) for ram, block in enumerate(blocks)]

    def train(self, X, y):
        """Fit the discriminators, discarding any previous training.

        Args:
            X: 2-D array-like of bits, one row per sample.
            y: iterable of class labels, aligned with the rows of ``X``.
        """
        self.discriminador = {cls: {} for cls in np.unique(y)}
        for sample, label in zip(np.asarray(X), y):
            memory = self.discriminador[label]
            for key in self._ramKeys(sample):
                memory[key] = memory.get(key, 0) + 1

    def classify(self, X):
        """Predict a class label for every row of ``X``.

        The score of a class is the sum, over all RAMs, of the training count
        stored at the address produced by the sample. The class with the highest
        score wins; on a tie the first class in ``np.unique`` (sorted) order is
        returned, because that is the insertion order of ``discriminador``.

        Args:
            X: 2-D array-like of bits, one row per sample.

        Returns:
            List with one predicted label per row.
        """
        predictions = []
        for sample in np.asarray(X):
            keys = self._ramKeys(sample)
            scores = {
                cls: sum(memory.get(key, 0) for key in keys)
                for cls, memory in self.discriminador.items()
            }
            predictions.append(max(scores, key=scores.get))
        return predictions

Dataset: nível de obesidade (7 classes de rótulos)
---
Link: https://archive.ics.uci.edu/dataset/544/estimation+of+obesity+levels+based+on+eating+habits+and+physical+condition

In [None]:
"""
Código copiado da página do UCI para import do dataset
"""


# Load-data step for the obesity-level dataset (snippet taken from the UCI page).
from ucimlrepo import fetch_ucirepo

# fetch dataset
# Requests UCI dataset id 544 through ucimlrepo.
estimation_of_obesity_levels_based_on_eating_habits_and_physical_condition = fetch_ucirepo(id=544)

# data (as pandas dataframes)
# X and y are notebook-level names; the pre-processing code further down reads and rebinds them.
X = estimation_of_obesity_levels_based_on_eating_habits_and_physical_condition.data.features
y = estimation_of_obesity_levels_based_on_eating_habits_and_physical_condition.data.targets

# metadata
# Prints the whole metadata object, which produces a long output.
print(estimation_of_obesity_levels_based_on_eating_habits_and_physical_condition.metadata)

# variable information
print(estimation_of_obesity_levels_based_on_eating_habits_and_physical_condition.variables)


{'uci_id': 544, 'name': 'Estimation of Obesity Levels Based On Eating Habits and Physical Condition ', 'repository_url': 'https://archive.ics.uci.edu/dataset/544/estimation+of+obesity+levels+based+on+eating+habits+and+physical+condition', 'data_url': 'https://archive.ics.uci.edu/static/public/544/data.csv', 'abstract': 'This dataset include data for the estimation of obesity levels in individuals from the countries of Mexico, Peru and Colombia, based on their eating habits and physical condition. ', 'area': 'Health and Medicine', 'tasks': ['Classification', 'Regression', 'Clustering'], 'characteristics': ['Multivariate'], 'num_instances': 2111, 'num_features': 16, 'feature_types': ['Integer'], 'demographics': ['Gender', 'Age'], 'target_col': ['NObeyesdad'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 2019, 'last_updated': 'Tue Sep 10 2024', 'dataset_doi': '10.24432/C5H31Z', 'creators': [], 'intro_paper': {'ID': 358, 'type': 

In [None]:
"""Binarização dos dados e correção de inconsistências."""
def binarize(data):
    """Encode a 2-D numeric matrix as a matrix of 0/1 values.

    Each column is handled on its own:

    * range (max - min) greater than 1: treated as continuous, min-max scaled to
      0..255 and expanded into 8 bits, most significant bit first;
    * range in (0, 1]: a single bit, 1 where the value is above the midpoint of
      the column's range, 0 otherwise;
    * constant column: a single bit, 1 where the value is positive.

    Columns that already hold only 0 and 1 are therefore passed through unchanged.

    Args:
        data: 2-D array-like of numbers, or a pandas object exposing ``.values``.

    Returns:
        np.ndarray of dtype uint8 and shape (n_samples, n_bits) containing only 0 and 1.
    """
    if hasattr(data, 'values'):
        data = data.values
    data = np.asarray(data, dtype=np.float64)

    encodedColumns = []
    for i in range(data.shape[1]):
        column = data[:, i]
        low, high = column.min(), column.max()
        if high - low > 1:
            # The 1e-8 guards the division; it also keeps the scaled maximum
            # slightly below 255 before truncation to uint8.
            scaled = (column - low) / (high - low + 1e-8)
            bits = np.unpackbits((scaled * 255).astype(np.uint8)).reshape(-1, 8)
        elif high > low:
            # Threshold instead of casting: a raw uint8 cast collapses fractional
            # values to 0, wraps negatives and lets values such as 5/6 through
            # as non-binary "bits".
            bits = (column > (low + high) / 2).astype(np.uint8).reshape(-1, 1)
        else:
            bits = (column > 0).astype(np.uint8).reshape(-1, 1)
        encodedColumns.append(bits)

    # Stack the per-feature bit groups side by side.
    return np.hstack(encodedColumns)



"""Estas etapas de pré-processamento dos dados e divisão do dataset em treino e teste foram ajustadas."""
# Pre-processing: one-hot encode the categorical features and map class labels to integers.
X = pd.get_dummies(X, drop_first=True)
y = LabelEncoder().fit_transform(np.array(y).ravel())

# NOTE(review): test_size=0.7 holds out 70% of the rows for testing, so only 30%
# are used for training - confirm this split is intended.
X_treino, X_teste, y_treino, y_teste = train_test_split(X, y, test_size=0.7, random_state=42)

# Standardise with statistics learned from the training split only.
scaler = StandardScaler()
X_treino = scaler.fit_transform(X_treino)
X_teste = scaler.transform(X_teste)

# Binarized train and test sets.
# NOTE(review): each split is binarized independently, so the per-column ranges
# (and which columns count as continuous) are derived separately for train and
# test - confirm the two encodings line up.
X_treino_bin = binarize(X_treino)
X_teste_bin = binarize(X_teste)

# Training. Address size is floor(log2(number of input bits)), clamped to [4, 8].
addressize = max(4, min(8, int(np.log2(X_treino_bin.shape[1]))))
model = WiSARD(addressize)
model.train(X_treino_bin, y_treino)

# Prediction.
yPrevisto = model.classify(X_teste_bin)

# Evaluation. Format with a fixed precision: concatenating str(round(accuracy, 2)*100)
# exposed float artefacts such as "14.000000000000002%".
accuracy = accuracy_score(y_teste, yPrevisto)
print(f"Acurácia: {round(accuracy, 2) * 100:.1f}%")

Acurácia: 14.000000000000002%


Dataset: avaliação de carro (4 classes de rótulos)
---
Link: https://archive.ics.uci.edu/dataset/19/car+evaluation

In [None]:
"""
Código copiado da página do UCI para import do dataset
"""


# Load-data step for the car evaluation dataset (snippet taken from the UCI page).
from ucimlrepo import fetch_ucirepo

# fetch dataset
# Requests UCI dataset id 19 through ucimlrepo.
car_evaluation = fetch_ucirepo(id=19)

# data (as pandas dataframes)
# X and y are notebook-level names; the pre-processing code further down reads and rebinds them.
X = car_evaluation.data.features
y = car_evaluation.data.targets

# metadata
# Prints the whole metadata object, which produces a long output.
print(car_evaluation.metadata)

# variable information
print(car_evaluation.variables)


{'uci_id': 19, 'name': 'Car Evaluation', 'repository_url': 'https://archive.ics.uci.edu/dataset/19/car+evaluation', 'data_url': 'https://archive.ics.uci.edu/static/public/19/data.csv', 'abstract': 'Derived from simple hierarchical decision model, this database may be useful for testing constructive induction and structure discovery methods.', 'area': 'Other', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 1728, 'num_features': 6, 'feature_types': ['Categorical'], 'demographics': [], 'target_col': ['class'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1988, 'last_updated': 'Thu Aug 10 2023', 'dataset_doi': '10.24432/C5JP48', 'creators': ['Marko Bohanec'], 'intro_paper': {'ID': 249, 'type': 'NATIVE', 'title': 'Knowledge acquisition and explanation for multi-attribute decision making', 'authors': 'M. Bohanec, V. Rajkovič', 'venue': '8th Intl Workshop on Expert Systems and their Applications, 

In [None]:
"""Binarização dos dados e correção de inconsistências."""
def binarize(data):
    """Encode a 2-D numeric matrix as a matrix of 0/1 values.

    Each column is handled on its own:

    * range (max - min) greater than 1: treated as continuous, min-max scaled to
      0..255 and expanded into 8 bits, most significant bit first;
    * range in (0, 1]: a single bit, 1 where the value is above the midpoint of
      the column's range, 0 otherwise;
    * constant column: a single bit, 1 where the value is positive.

    Columns that already hold only 0 and 1 are therefore passed through unchanged.

    Args:
        data: 2-D array-like of numbers, or a pandas object exposing ``.values``.

    Returns:
        np.ndarray of dtype uint8 and shape (n_samples, n_bits) containing only 0 and 1.
    """
    if hasattr(data, 'values'):
        data = data.values
    data = np.asarray(data, dtype=np.float64)

    encodedColumns = []
    for i in range(data.shape[1]):
        column = data[:, i]
        low, high = column.min(), column.max()
        if high - low > 1:
            # The 1e-8 guards the division; it also keeps the scaled maximum
            # slightly below 255 before truncation to uint8.
            scaled = (column - low) / (high - low + 1e-8)
            bits = np.unpackbits((scaled * 255).astype(np.uint8)).reshape(-1, 8)
        elif high > low:
            # Threshold instead of casting: a raw uint8 cast collapses fractional
            # values to 0, wraps negatives and lets values such as 5/6 through
            # as non-binary "bits".
            bits = (column > (low + high) / 2).astype(np.uint8).reshape(-1, 1)
        else:
            bits = (column > 0).astype(np.uint8).reshape(-1, 1)
        encodedColumns.append(bits)

    # Stack the per-feature bit groups side by side.
    return np.hstack(encodedColumns)



"""Estas etapas de pré-processamento dos dados e divisão do dataset em treino e teste foram ajustadas."""
# Pre-processing: one-hot encode the categorical features and map class labels to integers.
X = pd.get_dummies(X, drop_first=True)
y = LabelEncoder().fit_transform(np.array(y).ravel())

# NOTE(review): test_size=0.7 holds out 70% of the rows for testing, so only 30%
# are used for training - confirm this split is intended.
X_treino, X_teste, y_treino, y_teste = train_test_split(X, y, test_size=0.7, random_state=42)

# Standardise with statistics learned from the training split only.
scaler = StandardScaler()
X_treino = scaler.fit_transform(X_treino)
X_teste = scaler.transform(X_teste)

# Binarized train and test sets.
# NOTE(review): each split is binarized independently, so the per-column ranges
# (and which columns count as continuous) are derived separately for train and
# test - confirm the two encodings line up.
X_treino_bin = binarize(X_treino)
X_teste_bin = binarize(X_teste)

# Training. Address size is floor(log2(number of input bits)), clamped to [4, 8].
addressize = max(4, min(8, int(np.log2(X_treino_bin.shape[1]))))
model = WiSARD(addressize)
model.train(X_treino_bin, y_treino)

# Prediction.
yPrevisto = model.classify(X_teste_bin)

# Evaluation. Format with a fixed precision: concatenating str(round(accuracy, 2)*100)
# can expose float artefacts such as "14.000000000000002%".
accuracy = accuracy_score(y_teste, yPrevisto)
print(f"Acurácia: {round(accuracy, 2) * 100:.1f}%")

Acurácia: 71.0%


Dataset: indicação de lentes de contato para pacientes (3 classes de rótulos)
---
Link: https://archive.ics.uci.edu/dataset/58/lenses

In [None]:
"""
Código copiado da página do UCI para import do dataset
"""


# Load-data step for the lenses dataset (snippet taken from the UCI page).
from ucimlrepo import fetch_ucirepo

# fetch dataset
# Requests UCI dataset id 58 through ucimlrepo.
lenses = fetch_ucirepo(id=58)

# data (as pandas dataframes)
# X and y are notebook-level names; the pre-processing code further down reads and rebinds them.
X = lenses.data.features
y = lenses.data.targets

# metadata
# Prints the whole metadata object, which produces a long output.
print(lenses.metadata)

# variable information
print(lenses.variables)


{'uci_id': 58, 'name': 'Lenses', 'repository_url': 'https://archive.ics.uci.edu/dataset/58/lenses', 'data_url': 'https://archive.ics.uci.edu/static/public/58/data.csv', 'abstract': 'Database for fitting contact lenses', 'area': 'Other', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 24, 'num_features': 3, 'feature_types': ['Categorical'], 'demographics': ['Age'], 'target_col': ['class'], 'index_col': ['id'], 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1987, 'last_updated': 'Mon Feb 26 2024', 'dataset_doi': '10.24432/C5K88Z', 'creators': ['J. Cendrowska'], 'intro_paper': None, 'additional_info': {'summary': 'The examples are complete and noise free. The examples highly simplified the problem. The attributes do not fully describe all the factors affecting the decision as to which type, if any, to fit.\r\n\r\n Notes:  \r\n\r\n--This database is complete (all possible combinations of attribute-value pairs are re

In [None]:
"""Binarização dos dados e correção de inconsistências."""
def binarize(data):
    """Encode a 2-D numeric matrix as a matrix of 0/1 values.

    Each column is handled on its own:

    * range (max - min) greater than 1: treated as continuous, min-max scaled to
      0..255 and expanded into 8 bits, most significant bit first;
    * range in (0, 1]: a single bit, 1 where the value is above the midpoint of
      the column's range, 0 otherwise;
    * constant column: a single bit, 1 where the value is positive.

    Columns that already hold only 0 and 1 are therefore passed through unchanged.

    Args:
        data: 2-D array-like of numbers, or a pandas object exposing ``.values``.

    Returns:
        np.ndarray of dtype uint8 and shape (n_samples, n_bits) containing only 0 and 1.
    """
    if hasattr(data, 'values'):
        data = data.values
    data = np.asarray(data, dtype=np.float64)

    encodedColumns = []
    for i in range(data.shape[1]):
        column = data[:, i]
        low, high = column.min(), column.max()
        if high - low > 1:
            # The 1e-8 guards the division; it also keeps the scaled maximum
            # slightly below 255 before truncation to uint8.
            scaled = (column - low) / (high - low + 1e-8)
            bits = np.unpackbits((scaled * 255).astype(np.uint8)).reshape(-1, 8)
        elif high > low:
            # Threshold instead of casting: a raw uint8 cast collapses fractional
            # values to 0, wraps negatives and lets values such as 5/6 through
            # as non-binary "bits".
            bits = (column > (low + high) / 2).astype(np.uint8).reshape(-1, 1)
        else:
            bits = (column > 0).astype(np.uint8).reshape(-1, 1)
        encodedColumns.append(bits)

    # Stack the per-feature bit groups side by side.
    return np.hstack(encodedColumns)



"""Estas etapas de pré-processamento dos dados e divisão do dataset em treino e teste foram ajustadas."""
# Pre-processing: one-hot encode the categorical features and map class labels to integers.
X = pd.get_dummies(X, drop_first=True)
y = LabelEncoder().fit_transform(np.array(y).ravel())

# NOTE(review): test_size=0.7 holds out 70% of the rows for testing, so only 30%
# are used for training - confirm this split is intended.
X_treino, X_teste, y_treino, y_teste = train_test_split(X, y, test_size=0.7, random_state=42)

# Standardise with statistics learned from the training split only.
scaler = StandardScaler()
X_treino = scaler.fit_transform(X_treino)
X_teste = scaler.transform(X_teste)

# Binarized train and test sets.
# NOTE(review): each split is binarized independently, so the per-column ranges
# (and which columns count as continuous) are derived separately for train and
# test - confirm the two encodings line up.
X_treino_bin = binarize(X_treino)
X_teste_bin = binarize(X_teste)

# Training. Address size is floor(log2(number of input bits)), clamped to [4, 8].
addressize = max(4, min(8, int(np.log2(X_treino_bin.shape[1]))))
model = WiSARD(addressize)
model.train(X_treino_bin, y_treino)

# Prediction.
yPrevisto = model.classify(X_teste_bin)

# Evaluation. Format with a fixed precision: concatenating str(round(accuracy, 2)*100)
# can expose float artefacts such as "14.000000000000002%".
accuracy = accuracy_score(y_teste, yPrevisto)
print(f"Acurácia: {round(accuracy, 2) * 100:.1f}%")

Acurácia: 59.0%


Dataset: Predict Students' Dropout and Academic Success
---
Link: https://archive.ics.uci.edu/dataset/697/predict+students+dropout+and+academic+success

In [None]:
"""
Código copiado da página do UCI para import do dataset
"""


# Load-data step for the students' dropout dataset (snippet taken from the UCI page).
from ucimlrepo import fetch_ucirepo

# fetch dataset
# Requests UCI dataset id 697 through ucimlrepo.
predict_students_dropout_and_academic_success = fetch_ucirepo(id=697)

# data (as pandas dataframes)
# X and y are notebook-level names; the pre-processing code further down reads and rebinds them.
X = predict_students_dropout_and_academic_success.data.features
y = predict_students_dropout_and_academic_success.data.targets

# metadata
# Prints the whole metadata object, which produces a long output.
print(predict_students_dropout_and_academic_success.metadata)

# variable information
print(predict_students_dropout_and_academic_success.variables)


{'uci_id': 697, 'name': "Predict Students' Dropout and Academic Success", 'repository_url': 'https://archive.ics.uci.edu/dataset/697/predict+students+dropout+and+academic+success', 'data_url': 'https://archive.ics.uci.edu/static/public/697/data.csv', 'abstract': "A dataset created from a higher education institution (acquired from several disjoint databases) related to students enrolled in different undergraduate degrees, such as agronomy, design, education, nursing, journalism, management, social service, and technologies.\nThe dataset includes information known at the time of student enrollment (academic path, demographics, and social-economic factors) and the students' academic performance at the end of the first and second semesters. \nThe data is used to build classification models to predict students' dropout and academic sucess. The problem is formulated as a three category classification task, in which there is a strong imbalance towards one of the classes.", 'area': 'Social Sc

In [None]:
"""Binarização dos dados e correção de inconsistências."""
def binarize(data):
    """Encode a 2-D numeric matrix as a matrix of 0/1 values.

    Each column is handled on its own:

    * range (max - min) greater than 1: treated as continuous, min-max scaled to
      0..255 and expanded into 8 bits, most significant bit first;
    * range in (0, 1]: a single bit, 1 where the value is above the midpoint of
      the column's range, 0 otherwise;
    * constant column: a single bit, 1 where the value is positive.

    Columns that already hold only 0 and 1 are therefore passed through unchanged.

    Args:
        data: 2-D array-like of numbers, or a pandas object exposing ``.values``.

    Returns:
        np.ndarray of dtype uint8 and shape (n_samples, n_bits) containing only 0 and 1.
    """
    if hasattr(data, 'values'):
        data = data.values
    data = np.asarray(data, dtype=np.float64)

    encodedColumns = []
    for i in range(data.shape[1]):
        column = data[:, i]
        low, high = column.min(), column.max()
        if high - low > 1:
            # The 1e-8 guards the division; it also keeps the scaled maximum
            # slightly below 255 before truncation to uint8.
            scaled = (column - low) / (high - low + 1e-8)
            bits = np.unpackbits((scaled * 255).astype(np.uint8)).reshape(-1, 8)
        elif high > low:
            # Threshold instead of casting: a raw uint8 cast collapses fractional
            # values to 0, wraps negatives and lets values such as 5/6 through
            # as non-binary "bits".
            bits = (column > (low + high) / 2).astype(np.uint8).reshape(-1, 1)
        else:
            bits = (column > 0).astype(np.uint8).reshape(-1, 1)
        encodedColumns.append(bits)

    # Stack the per-feature bit groups side by side.
    return np.hstack(encodedColumns)



"""Estas etapas de pré-processamento dos dados e divisão do dataset em treino e teste foram ajustadas."""
# Pre-processing: one-hot encode the categorical features and map class labels to integers.
X = pd.get_dummies(X, drop_first=True)
y = LabelEncoder().fit_transform(np.array(y).ravel())

# NOTE(review): test_size=0.7 holds out 70% of the rows for testing, so only 30%
# are used for training - confirm this split is intended.
X_treino, X_teste, y_treino, y_teste = train_test_split(X, y, test_size=0.7, random_state=42)

# Standardise with statistics learned from the training split only.
scaler = StandardScaler()
X_treino = scaler.fit_transform(X_treino)
X_teste = scaler.transform(X_teste)

# Binarized train and test sets.
# NOTE(review): each split is binarized independently, so the per-column ranges
# (and which columns count as continuous) are derived separately for train and
# test - confirm the two encodings line up.
X_treino_bin = binarize(X_treino)
X_teste_bin = binarize(X_teste)

# Training. Address size is floor(log2(number of input bits)), clamped to [4, 8].
addressize = max(4, min(8, int(np.log2(X_treino_bin.shape[1]))))
model = WiSARD(addressize)
model.train(X_treino_bin, y_treino)

# Prediction.
yPrevisto = model.classify(X_teste_bin)

# Evaluation. Format with a fixed precision: concatenating str(round(accuracy, 2)*100)
# can expose float artefacts such as "14.000000000000002%".
accuracy = accuracy_score(y_teste, yPrevisto)
print(f"Acurácia: {round(accuracy, 2) * 100:.1f}%")

Acurácia: 49.0%


Dataset: Zoo
---
Link: https://archive.ics.uci.edu/dataset/111/zoo

In [None]:
"""
Código copiado da página do UCI para import do dataset
"""


# Load-data step for the zoo dataset (snippet taken from the UCI page).
from ucimlrepo import fetch_ucirepo

# fetch dataset
# Requests UCI dataset id 111 through ucimlrepo.
zoo = fetch_ucirepo(id=111)

# data (as pandas dataframes)
# X and y are notebook-level names; the pre-processing code further down reads and rebinds them.
X = zoo.data.features
y = zoo.data.targets

# metadata
# Prints the whole metadata object, which produces a long output.
print(zoo.metadata)

# variable information
print(zoo.variables)


{'uci_id': 111, 'name': 'Zoo', 'repository_url': 'https://archive.ics.uci.edu/dataset/111/zoo', 'data_url': 'https://archive.ics.uci.edu/static/public/111/data.csv', 'abstract': 'Artificial, 7 classes of animals', 'area': 'Biology', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 101, 'num_features': 16, 'feature_types': ['Categorical', 'Integer'], 'demographics': [], 'target_col': ['type'], 'index_col': ['animal_name'], 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1990, 'last_updated': 'Fri Sep 15 2023', 'dataset_doi': '10.24432/C5R59V', 'creators': ['Richard Forsyth'], 'intro_paper': None, 'additional_info': {'summary': 'A simple database containing 17 Boolean-valued attributes.  The "type" attribute appears to be the class attribute.  Here is a breakdown of which animals are in which type: (I find it unusual that there are 2 instances of "frog" and one of "girl"!)', 'purpose': None, 'funded_by': None, 'inst

In [None]:
"""Binarização dos dados e correção de inconsistências."""
def binarize(data):
    """Encode a 2-D numeric matrix as a matrix of 0/1 values.

    Each column is handled on its own:

    * range (max - min) greater than 1: treated as continuous, min-max scaled to
      0..255 and expanded into 8 bits, most significant bit first;
    * range in (0, 1]: a single bit, 1 where the value is above the midpoint of
      the column's range, 0 otherwise;
    * constant column: a single bit, 1 where the value is positive.

    Columns that already hold only 0 and 1 are therefore passed through unchanged.

    Args:
        data: 2-D array-like of numbers, or a pandas object exposing ``.values``.

    Returns:
        np.ndarray of dtype uint8 and shape (n_samples, n_bits) containing only 0 and 1.
    """
    if hasattr(data, 'values'):
        data = data.values
    data = np.asarray(data, dtype=np.float64)

    encodedColumns = []
    for i in range(data.shape[1]):
        column = data[:, i]
        low, high = column.min(), column.max()
        if high - low > 1:
            # The 1e-8 guards the division; it also keeps the scaled maximum
            # slightly below 255 before truncation to uint8.
            scaled = (column - low) / (high - low + 1e-8)
            bits = np.unpackbits((scaled * 255).astype(np.uint8)).reshape(-1, 8)
        elif high > low:
            # Threshold instead of casting: a raw uint8 cast collapses fractional
            # values to 0, wraps negatives and lets values such as 5/6 through
            # as non-binary "bits".
            bits = (column > (low + high) / 2).astype(np.uint8).reshape(-1, 1)
        else:
            bits = (column > 0).astype(np.uint8).reshape(-1, 1)
        encodedColumns.append(bits)

    # Stack the per-feature bit groups side by side.
    return np.hstack(encodedColumns)



"""Estas etapas de pré-processamento dos dados e divisão do dataset em treino e teste foram ajustadas."""
# Pre-processing: one-hot encode the categorical features and map class labels to integers.
X = pd.get_dummies(X, drop_first=True)
y = LabelEncoder().fit_transform(np.array(y).ravel())

# NOTE(review): test_size=0.7 holds out 70% of the rows for testing, so only 30%
# are used for training - confirm this split is intended.
X_treino, X_teste, y_treino, y_teste = train_test_split(X, y, test_size=0.7, random_state=42)

# Standardise with statistics learned from the training split only.
scaler = StandardScaler()
X_treino = scaler.fit_transform(X_treino)
X_teste = scaler.transform(X_teste)

# Binarized train and test sets.
# NOTE(review): each split is binarized independently, so the per-column ranges
# (and which columns count as continuous) are derived separately for train and
# test - confirm the two encodings line up.
X_treino_bin = binarize(X_treino)
X_teste_bin = binarize(X_teste)

# Training. Address size is floor(log2(number of input bits)), clamped to [4, 8].
addressize = max(4, min(8, int(np.log2(X_treino_bin.shape[1]))))
model = WiSARD(addressize)
model.train(X_treino_bin, y_treino)

# Prediction.
yPrevisto = model.classify(X_teste_bin)

# Evaluation. Format with a fixed precision: concatenating str(round(accuracy, 2)*100)
# can expose float artefacts such as "14.000000000000002%".
accuracy = accuracy_score(y_teste, yPrevisto)
print(f"Acurácia: {round(accuracy, 2) * 100:.1f}%")

Acurácia: 13.0%
