## Trabalho Prático de Aprendizagem Automática

In [97]:
import numpy as np
import matplotlib.pyplot as plt
import pickle
import re

import mglearn
import spacy
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, SnowballStemmer, LancasterStemmer, WordNetLemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import make_pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC


from sklearn.metrics import confusion_matrix, classification_report



[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\anaso\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\anaso\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Pré-Processamento

In [5]:
def limpeza(critica):
    """Normalize a raw review to lowercase letters separated by spaces.

    Every ``<br />`` marker becomes a space, the text is lowercased, and each
    run of characters outside ``a-z``/``A-Z`` collapses into a single space.
    The result is not stripped, so it may start or end with a space.
    """
    sem_quebras = " ".join(critica.split("<br />"))
    so_letras = re.compile(r'[^a-zA-Z]+')
    return so_letras.sub(' ', sem_quebras.lower())

def stop_words(critica):
    """Blank out English stop words in a whitespace-tokenized review.

    Each token found in NLTK's English stop-word list is replaced by a single
    space; the remaining tokens are kept as they are. Tokens are then joined
    with single spaces, so a removed word leaves extra whitespace behind.
    """
    ignoradas = set(stopwords.words('english'))
    tokens = []
    for termo in critica.split():
        tokens.append(" " if termo in ignoradas else termo)
    return " ".join(tokens)

def lematizacao(critica, lemma=WordNetLemmatizer()):
    """Lemmatize every whitespace-separated token of a review.

    Args:
        critica: Text to process; it is split on whitespace.
        lemma: Object whose ``lemmatize(token)`` is applied to each token.
            The default instance is created once, when the function is defined.

    Returns:
        The lemmatized tokens joined by single spaces.
    """
    termos = [lemma.lemmatize(termo) for termo in critica.split()]
    return " ".join(termos)


In [6]:
def text2vector(corpus, tfidf=None, treino=True):
    """Pre-process a corpus and convert it to a TF-IDF matrix.

    Each text goes through ``limpeza``, ``stop_words`` and ``lematizacao``,
    in that order, before vectorization.

    Args:
        corpus: Iterable of raw text strings.
        tfidf: Vectorizer to fit when ``treino`` is True. A new
            ``TfidfVectorizer()`` is created when omitted. Ignored when
            ``treino`` is False, because the pickled vectorizer is used.
        treino: True fits the vectorizer on the processed texts and pickles it
            to ``models/tfidf.p``; False loads the vectorizer from that file.

    Returns:
        The result of ``transform`` applied to the processed texts.

    Note:
        The ``models`` directory must already exist.
    """
    docs = [lematizacao(stop_words(limpeza(texto))) for texto in corpus]

    if treino:
        # Build the default here: a default written in the signature is created
        # once at definition time and would be shared (and refit) by every call.
        if tfidf is None:
            tfidf = TfidfVectorizer()
        tfidf.fit(docs)
        # Context manager so the file handle is closed once the dump finishes.
        with open('models/tfidf.p', 'wb') as ficheiro:
            pickle.dump({'tfidf_model': tfidf}, ficheiro)
    else:
        # NOTE: pickle.load can execute arbitrary code; only load files that
        # this notebook produced itself.
        with open('models/tfidf.p', 'rb') as ficheiro:
            tfidf = pickle.load(ficheiro)['tfidf_model']

    return tfidf.transform(docs)


In [80]:
def binClassify(X, y, model='LogisticRegression', treino=True):
    """Train or reload a binary classifier and return its predictions for ``X``.

    Args:
        X: Feature matrix passed to the estimator's ``fit``/``predict``.
        y: Targets with at most two distinct values. Required when ``treino``
            is True; otherwise it is only validated.
        model: "LogisticRegression", "RandomForestClassifier" or
            "SupportVectorMachines". Also used as the pickle file stem.
        treino: True fits a new estimator on ``(X, y)`` and pickles it to
            ``models/<model>.p``; False loads the estimator from that file.

    Returns:
        The output of the estimator's ``predict(X)``.

    Raises:
        ValueError: If ``y`` has more than two distinct values, if training is
            requested without ``y``, or if ``model`` is unknown when training.
    """
    if y is not None and len(np.unique(y)) > 2:
        raise ValueError("Targets errados. ")

    if treino:
        # The previous guard `y.all() != None` crashed on plain lists and was
        # always true for arrays; the intent is simply "targets were given".
        if y is None:
            raise ValueError("Treino requer os targets (y). ")

        # Factories, so only the requested estimator is instantiated.
        modelos = {
            "LogisticRegression": lambda: LogisticRegression(),
            "RandomForestClassifier": lambda: RandomForestClassifier(),
            "SupportVectorMachines": lambda: SVC(),
        }
        if model not in modelos:
            raise ValueError("Modelo desconhecido: " + repr(model))

        modelo = modelos[model]().fit(X, y)
        # NOTE(review): multiClassify pickles to the same 'models/<model>.p'
        # path, so each function overwrites the other's saved estimator.
        with open('models/'+model+'.p', 'wb') as ficheiro:
            pickle.dump({model: modelo}, ficheiro)
    else:
        # NOTE: pickle.load can execute arbitrary code; only load files that
        # this notebook produced itself.
        with open('models/'+model+'.p', 'rb') as ficheiro:
            modelo = pickle.load(ficheiro)[model]

    return modelo.predict(X)

In [101]:
def multiClassify(X, y=None, model='LogisticRegression', treino=True):
    """Train or reload a multi-class classifier and return predictions for ``X``.

    Args:
        X: Feature matrix passed to the estimator's ``fit``/``predict``.
        y: Targets with more than two distinct values. Required when
            ``treino`` is True; may be omitted when only predicting.
        model: "LogisticRegression", "RandomForestClassifier" or
            "SupportVectorMachines". Also used as the pickle file stem.
        treino: True fits a new estimator on ``(X, y)`` and pickles it to
            ``models/<model>.p``; False loads the estimator from that file.

    Returns:
        The output of the estimator's ``predict(X)``.

    Raises:
        ValueError: If ``y`` is given with two or fewer distinct values, if
            training is requested without ``y``, or if ``model`` is unknown
            when training.
    """
    # Validate only when targets are supplied: np.unique(None) has length 1,
    # so the unguarded check rejected every call that relied on y=None.
    if y is not None and len(np.unique(y)) <= 2:
        raise ValueError("Targets errados. ")

    if treino:
        # The previous guard `y.all() != None` crashed on plain lists and was
        # always true for arrays; the intent is simply "targets were given".
        if y is None:
            raise ValueError("Treino requer os targets (y). ")

        # Factories, so only the requested estimator is instantiated.
        modelos = {
            "LogisticRegression": lambda: LogisticRegression(max_iter=10000),
            "RandomForestClassifier": lambda: RandomForestClassifier(),
            "SupportVectorMachines": lambda: SVC(),
        }
        if model not in modelos:
            raise ValueError("Modelo desconhecido: " + repr(model))

        modelo = modelos[model]().fit(X, y)
        # NOTE(review): binClassify pickles to the same 'models/<model>.p'
        # path, so each function overwrites the other's saved estimator.
        with open('models/'+model+'.p', 'wb') as ficheiro:
            pickle.dump({model: modelo}, ficheiro)
    else:
        # NOTE: pickle.load can execute arbitrary code; only load files that
        # this notebook produced itself.
        with open('models/'+model+'.p', 'rb') as ficheiro:
            modelo = pickle.load(ficheiro)[model]

    return modelo.predict(X)

In [9]:
# Load the pickled dataset; the context manager closes the file afterwards.
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
with open("/Users/anaso/Documents/AA/AA-22-23/Trabalho Final/imdbFull.p", 'rb') as ficheiro:
    D = pickle.load(ficheiro)

corpus = D.data
y = D.target

# Binary targets: 1 when the original target is above 5, otherwise 0.
# Kept as an array because binClassify calls `y.all()`, which a list lacks.
yb = np.array([1 if val > 5 else 0 for val in y])

X = text2vector(corpus, tfidf=TfidfVectorizer())


In [10]:
# Fit the binary classifier on the whole corpus; y_bin holds its predictions for that same data.
y_bin = binClassify(X, yb, model='LogisticRegression', treino=True)


In [62]:
# Fit the multi-class classifier on the whole corpus; y_multi holds its predictions for that same data.
y_multi = multiClassify(X, y, model='LogisticRegression', treino=True)

In [12]:
# Confusion matrices for the binary and the multi-class predictions, one after the other.
print(
    confusion_matrix(yb, y_bin),
    confusion_matrix(y, y_multi),
    sep="\n",
)

[[23067  1933]
 [ 1496 23504]]
[[9249  113  196  225   35   65   15  224]
 [1851 1643  326  414   52   73   18  209]
 [1224  114 2667  463  106   92   33  262]
 [ 846   79  280 3467  143  173   43  300]
 [ 178   24  100  245 2696  546  123  891]
 [ 145   16   64  126  260 3631  152 1465]
 [ 124   10   30   73  259  558 1666 1887]
 [ 202   14   46   70  182  387  105 8725]]


In [13]:
# Binary report, a separator line, then the multi-class report.
print(
    classification_report(yb, y_bin),
    "_______________",
    classification_report(y, y_multi),
    sep="\n",
)

              precision    recall  f1-score   support

           0       0.94      0.92      0.93     25000
           1       0.92      0.94      0.93     25000

    accuracy                           0.93     50000
   macro avg       0.93      0.93      0.93     50000
weighted avg       0.93      0.93      0.93     50000

_______________
              precision    recall  f1-score   support

           1       0.67      0.91      0.77     10122
           2       0.82      0.36      0.50      4586
           3       0.72      0.54      0.62      4961
           4       0.68      0.65      0.67      5331
           7       0.72      0.56      0.63      4803
           8       0.66      0.62      0.64      5859
           9       0.77      0.36      0.49      4607
          10       0.62      0.90      0.74      9731

    accuracy                           0.67     50000
   macro avg       0.71      0.61      0.63     50000
weighted avg       0.69      0.67      0.66     50000



Divisão:

In [83]:
# Multiclass: stratified 50/50 split of the raw texts and their targets.
# A fixed random_state makes the split (and every metric below) reproducible.
texto_treino, texto_teste, y_treino, y_teste = train_test_split(
    corpus, y, train_size=0.5, stratify=y, random_state=42
)

Treino:

In [89]:
# Fit the TF-IDF vectorizer on the training texts and vectorize them.
X1 = text2vector(corpus=texto_treino, treino=True)

In [90]:
# Train on the training split and keep the predictions for that same split.
y_multi_treino = multiClassify(X1, y_treino, model='LogisticRegression', treino=True)

In [86]:
# Training-split metrics: confusion matrix followed by the per-class report.
print(
    confusion_matrix(y_treino, y_multi_treino),
    classification_report(y_treino, y_multi_treino),
    sep="\n",
)

[[4765   25   57   69   19   16    9  101]
 [ 895  968   88  162   19   32    4  125]
 [ 607   35 1478  134   44   36   13  133]
 [ 410   19   61 1906   49   68   13  139]
 [ 107    8   28   75 1553  153   26  452]
 [  87    5   21   36   94 1992   36  659]
 [  72    6   12   23   85  219  995  891]
 [ 108    4   13   25   61  135   30 4490]]
              precision    recall  f1-score   support

           1       0.68      0.94      0.79      5061
           2       0.90      0.42      0.58      2293
           3       0.84      0.60      0.70      2480
           4       0.78      0.72      0.75      2665
           7       0.81      0.65      0.72      2402
           8       0.75      0.68      0.71      2930
           9       0.88      0.43      0.58      2303
          10       0.64      0.92      0.76      4866

    accuracy                           0.73     25000
   macro avg       0.79      0.67      0.70     25000
weighted avg       0.76      0.73      0.71     25000



Teste: 

In [91]:
# Vectorize the test texts with the vectorizer pickled during training.
X2 = text2vector(corpus=texto_teste, treino=False)


In [92]:
# Predict the test split with the classifier pickled during training.
y_multi_teste = multiClassify(X2, y_teste, model='LogisticRegression', treino=False)

In [93]:
# Test-split metrics: confusion matrix followed by the per-class report.
print(
    confusion_matrix(y_teste, y_multi_teste),
    classification_report(y_teste, y_multi_teste),
    sep="\n",
)

[[4158  151  202  225   30   56   11  228]
 [1292  147  249  343   38   63    7  154]
 [ 982  152  393  516   98   97   22  221]
 [ 669   96  350  828  202  224   31  266]
 [ 141   18   72  215  518  654  104  679]
 [ 126   19   44  129  402  748  166 1295]
 [  81    8   19   63  172  422  140 1399]
 [ 217   15   28   71  149  456  161 3768]]
              precision    recall  f1-score   support

           1       0.54      0.82      0.65      5061
           2       0.24      0.06      0.10      2293
           3       0.29      0.16      0.20      2481
           4       0.35      0.31      0.33      2666
           7       0.32      0.22      0.26      2401
           8       0.28      0.26      0.26      2929
           9       0.22      0.06      0.10      2304
          10       0.47      0.77      0.59      4865

    accuracy                           0.43     25000
   macro avg       0.34      0.33      0.31     25000
weighted avg       0.37      0.43      0.38     25000



In [98]:
# Random forest: train on the training split, then predict the test split
# with the estimator that training just pickled.
y_multi_treino = multiClassify(X1, y_treino, model="RandomForestClassifier", treino=True)
y_multi_teste = multiClassify(X2, y_teste, model="RandomForestClassifier", treino=False)

print("Random Forest Classifier: ")
print(
    "Treino: ",
    confusion_matrix(y_treino, y_multi_treino),
    classification_report(y_treino, y_multi_treino),
    sep="\n",
)
print(
    "Teste:",
    confusion_matrix(y_teste, y_multi_teste),
    classification_report(y_teste, y_multi_teste),
    sep="\n",
)

Random Forest Classifier: 
Treino: 
[[5061    0    0    0    0    0    0    0]
 [   0 2293    0    0    0    0    0    0]
 [   0    0 2480    0    0    0    0    0]
 [   0    0    1 2664    0    0    0    0]
 [   0    0    0    0 2402    0    0    0]
 [   0    0    0    0    0 2929    0    1]
 [   0    0    0    0    0    0 2303    0]
 [   0    0    0    0    0    0    0 4866]]
              precision    recall  f1-score   support

           1       1.00      1.00      1.00      5061
           2       1.00      1.00      1.00      2293
           3       1.00      1.00      1.00      2480
           4       1.00      1.00      1.00      2665
           7       1.00      1.00      1.00      2402
           8       1.00      1.00      1.00      2930
           9       1.00      1.00      1.00      2303
          10       1.00      1.00      1.00      4866

    accuracy                           1.00     25000
   macro avg       1.00      1.00      1.00     25000
weighted avg       1.00

In [102]:
# SVM: train on the training split, then predict the test split
# with the estimator that training just pickled.
y_multi_treino = multiClassify(X1, y_treino, model="SupportVectorMachines", treino=True)
y_multi_teste = multiClassify(X2, y_teste, model="SupportVectorMachines", treino=False)

print("Support Vector Machines (SVM): ")
print(
    "Treino: ",
    confusion_matrix(y_treino, y_multi_treino),
    classification_report(y_treino, y_multi_treino),
    sep="\n",
)
print(
    "Teste:",
    confusion_matrix(y_teste, y_multi_teste),
    classification_report(y_teste, y_multi_teste),
    sep="\n",
)


SVC()
SVC()
Support Vector Machines (SVM): 
Treino: 
[[5042    0    2    3    1    1    0   12]
 [ 244 2025    2    7    1    3    1   10]
 [ 129    0 2325   11    4    1    0   10]
 [  83    0    0 2560    2    5    0   15]
 [  17    0    0    6 2279   12    1   87]
 [  12    0    1    2    2 2785    0  128]
 [  19    1    2    1   11   41 1986  242]
 [  17    0    1    1    2    4    0 4841]]
              precision    recall  f1-score   support

           1       0.91      1.00      0.95      5061
           2       1.00      0.88      0.94      2293
           3       1.00      0.94      0.97      2480
           4       0.99      0.96      0.97      2665
           7       0.99      0.95      0.97      2402
           8       0.98      0.95      0.96      2930
           9       1.00      0.86      0.93      2303
          10       0.91      0.99      0.95      4866

    accuracy                           0.95     25000
   macro avg       0.97      0.94      0.95     25000
weight