# Trabalho 1 - Inteligência Artificial

## Imports necessários

In [36]:
import numpy as np
import pandas as pd
from scipy import stats

from sklearn.datasets import load_breast_cancer
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold, GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

## Base de dados

### Matrícula: 2015100346
Como o último dígito da matrícula é 6, a base de dados é composta pelos 10 descritores de Fourier e pelos 7 descritores de Hu.

In [37]:
# Read the raw data (semicolon-separated CSV) and use the id column as the index
df = pd.read_csv('https://raw.githubusercontent.com/VitorBonella/PL-Dataset/main/dataset.csv', sep=";")
df = df.set_index('id')

# Descriptor column groups
FOURIER = ['df01', 'df02', 'df03', 'df04','df05', 'df06', 'df07', 'df08', 'df09', 'df10']
HU = ['i1', 'i2', 'i3', 'i4','i5', 'i6', 'i7']
HARALICK = ['probmax', 'energia', 'entropia', 'contraste','homogeneidade', 'correlacao']

# Descriptors used in this assignment, converted from decimal-comma strings to float.
# The conversion is done column-wise; casting to str first means a column that pandas
# already parsed as numeric is converted correctly instead of silently becoming NaN.
dataset = (
    df[FOURIER + HU]
    .astype(str)
    .apply(lambda col: col.str.replace(',', '.', regex=False))
    .astype(float)
)

# Class label = lamp type (spaces removed) concatenated with the power rating
classes = df['tipo_lampada'].str.replace(" ", "", regex=False) + df['potencia'].astype(str)

# NOTE(review): the experiments below currently run on scikit-learn's breast-cancer
# data, not on the descriptors loaded above. Set USE_PLACEHOLDER_DATA = False to
# evaluate on the assignment data; the recorded outputs correspond to True.
USE_PLACEHOLDER_DATA = True

if USE_PLACEHOLDER_DATA:
    placeholder = load_breast_cancer()
    dataset_X = placeholder.data
    dataset_Y = placeholder.target
else:
    dataset_X = dataset
    dataset_Y = classes


# Cálculo dos resultados

In [38]:
def classification_report(scores):
    """Print the mean, standard deviation and 95% confidence interval of scores.

    The interval is a normal interval centred on the mean, with scale equal to
    the standard deviation (NumPy's default, ddof=0) divided by sqrt(n).

    Parameters
    ----------
    scores : array-like of float
        Scores to summarise, e.g. the array returned by ``cross_val_score``.
        Lists and tuples are accepted as well as NumPy arrays.

    Returns
    -------
    None
        The summary is printed, not returned.

    Raises
    ------
    ValueError
        If ``scores`` is empty.
    """
    scores = np.asarray(scores, dtype=float)
    if scores.size == 0:
        raise ValueError("scores must contain at least one value")

    mean = scores.mean()
    std = scores.std()
    print(f'\nMédia: {mean:.3f}, Desvio Padrão: {std:.3f}')

    std_err = std / np.sqrt(len(scores))
    if std_err > 0:
        inf, sup = stats.norm.interval(0.95, loc=mean, scale=std_err)
    else:
        # norm.interval returns (nan, nan) for scale == 0; when every score is
        # identical the interval degenerates to the mean itself.
        inf = sup = mean

    print(f'Intervalo de confiança (95%): [{inf:.3f},{sup:.3f}]')

# ZeroR (ZR)

In [39]:
# Evaluation protocol: 10-fold stratified cross-validation, repeated 3 times, fixed seed
rkf = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=36851234)

# ZeroR baseline: DummyClassifier with its default settings, placed behind the
# same standardisation step used by the other models
zR = DummyClassifier()
pipeline = Pipeline(steps=[
    ('transformer', StandardScaler()),
    ('estimator', zR),
])

# One accuracy value per fold of the repeated cross-validation
scores_zR = cross_val_score(
    estimator=pipeline,
    X=dataset_X,
    y=dataset_Y,
    cv=rkf,
    scoring='accuracy',
)

print(scores_zR)

classification_report(scores_zR)

[0.61403509 0.61403509 0.63157895 0.63157895 0.63157895 0.63157895
 0.63157895 0.63157895 0.63157895 0.625      0.61403509 0.61403509
 0.63157895 0.63157895 0.63157895 0.63157895 0.63157895 0.63157895
 0.63157895 0.625      0.61403509 0.61403509 0.63157895 0.63157895
 0.63157895 0.63157895 0.63157895 0.63157895 0.63157895 0.625     ]

Média: 0.627, Desvio Padrão: 0.007
Intervalo de confiança (95%): [0.625,0.630]


# Bagging (BA)

In [40]:
# Outer loop: 10-fold stratified cross-validation, repeated 3 times, fixed seed
rkf = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=36851234)

# Bagging ensemble built on Gaussian Naive Bayes.
# TODO: maybe use a different base estimator in the ensemble.
# Options: Decision Tree, Random Forest, K-Nearest Neighbors (KNN), Support Vector Machines (SVM)
bg = BaggingClassifier(estimator=GaussianNB(), random_state=0)
pipeline = Pipeline(steps=[
    ('transformer', StandardScaler()),
    ('estimator', bg),
])

# Inner loop: 4-fold grid search over the number of estimators in the ensemble
grade = {'estimator__n_estimators': [3, 9, 15, 21]}
gs = GridSearchCV(pipeline, grade, scoring='accuracy', cv=4)

# Nested cross-validation: the grid search is re-fitted inside every outer fold
scores_BA = cross_val_score(
    estimator=gs,
    X=dataset_X,
    y=dataset_Y,
    cv=rkf,
    scoring='accuracy',
)

print(scores_BA)

classification_report(scores_BA)

[0.98245614 0.9122807  0.9122807  0.92982456 0.96491228 0.9122807
 0.96491228 0.94736842 0.92982456 0.91071429 0.92982456 0.96491228
 0.98245614 0.9122807  0.96491228 0.94736842 0.92982456 0.87719298
 0.87719298 0.92857143 0.92982456 0.94736842 0.92982456 0.92982456
 0.92982456 0.9122807  0.9122807  0.98245614 0.96491228 0.89285714]

Média: 0.934, Desvio Padrão: 0.028
Intervalo de confiança (95%): [0.924,0.944]


# AdaBoost (AB)

In [41]:
# Outer loop: 10-fold stratified cross-validation, repeated 3 times, fixed seed
rkf = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=36851234)

# AdaBoost ensemble built on Gaussian Naive Bayes.
# TODO: maybe use a different base estimator in the ensemble.
# Options: Decision Tree, Random Forest, K-Nearest Neighbors (KNN), Support Vector Machines (SVM)
adb = AdaBoostClassifier(estimator=GaussianNB(), random_state=0)
pipeline = Pipeline(steps=[
    ('transformer', StandardScaler()),
    ('estimator', adb),
])

# Inner loop: 4-fold grid search over the number of estimators in the ensemble
grade = {'estimator__n_estimators': [3, 9, 15, 21]}
gs = GridSearchCV(pipeline, grade, scoring='accuracy', cv=4)

# Nested cross-validation: the grid search is re-fitted inside every outer fold
scores_AB = cross_val_score(
    estimator=gs,
    X=dataset_X,
    y=dataset_Y,
    cv=rkf,
    scoring='accuracy',
)

print(scores_AB)

classification_report(scores_AB)

[0.9122807  0.89473684 0.96491228 0.89473684 0.94736842 0.59649123
 1.         0.87719298 0.78947368 0.91071429 0.85964912 0.77192982
 0.98245614 0.92982456 0.85964912 0.85964912 0.89473684 0.92982456
 0.8245614  0.92857143 0.78947368 0.94736842 0.75438596 0.9122807
 0.98245614 0.9122807  0.92982456 0.85964912 0.96491228 0.89285714]

Média: 0.886, Desvio Padrão: 0.082
Intervalo de confiança (95%): [0.856,0.915]


# RandomForest (RF)

In [42]:
# Candidate forest sizes explored by the inner grid search
grade = {'randomForest__n_estimators': [3, 9, 15, 21]}

# Fixed seed so the forest, and therefore the reported scores, are reproducible
# across runs; the Bagging and AdaBoost cells pass the same random_state=0.
rF = RandomForestClassifier(random_state=0)

pipeline = Pipeline([('transformer', StandardScaler()), ('randomForest', rF)])

# Inner loop: 4-fold grid search over the number of trees
gs = GridSearchCV(estimator=pipeline, param_grid=grade, scoring='accuracy', cv=4)

# Outer loop: 10-fold stratified cross-validation, repeated 3 times, fixed seed
rkf = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=36851234)

# Nested cross-validation: the grid search is re-fitted inside every outer fold
scores_RF = cross_val_score(gs, dataset_X, dataset_Y, scoring='accuracy', cv=rkf)

print(scores_RF)

classification_report(scores_RF)


[0.98245614 0.9122807  0.96491228 0.96491228 1.         0.92982456
 0.96491228 0.98245614 0.94736842 0.94642857 0.98245614 0.98245614
 1.         0.9122807  0.92982456 0.98245614 0.9122807  0.96491228
 0.92982456 0.96428571 0.98245614 0.94736842 0.96491228 0.96491228
 0.94736842 0.98245614 0.96491228 0.96491228 0.96491228 0.94642857]

Média: 0.960, Desvio Padrão: 0.024
Intervalo de confiança (95%): [0.951,0.968]


# Heterogeneous Pooling (HP)