In [2]:
import pandas as pd
import numpy as np
import random
import re
import nltk
from IPython.core.display import display, HTML
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from matplotlib import pyplot as plt
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [3]:
# NLTK resources fetched by this notebook, downloaded in this order.
NLTK_PACKAGES = ('stopwords', 'punkt', 'averaged_perceptron_tagger')
for nltk_package in NLTK_PACKAGES:
    download_ok = nltk.download(nltk_package)
# Show the status returned by the last download as the cell output.
download_ok

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [4]:
from google.colab import drive

# Directory under which Google Drive is mounted in the Colab filesystem.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
cd '/content/drive/My Drive/Colab Notebooks/[2022.1] PLN'

/content/drive/My Drive/Colab Notebooks/[2022.1] PLN


In [6]:
# Sanity check: print the current working directory and list its files
# (the dataset CSV is expected to be listed here).
!pwd
!ls

/content/drive/My Drive/Colab Notebooks/[2022.1] PLN
classificador.ipynb  imdb-reviews-pt-br.csv


In [7]:
# Review dataset file, resolved relative to the current working directory.
REVIEWS_CSV = 'imdb-reviews-pt-br.csv'

resenha = pd.read_csv(REVIEWS_CSV)
# Preview the first rows to check the columns that were loaded.
resenha.head()

Unnamed: 0,id,text_en,text_pt,sentiment
0,1,Once again Mr. Costner has dragged out a movie...,"Mais uma vez, o Sr. Costner arrumou um filme p...",neg
1,2,This is an example of why the majority of acti...,Este é um exemplo do motivo pelo qual a maiori...,neg
2,3,"First of all I hate those moronic rappers, who...","Primeiro de tudo eu odeio esses raps imbecis, ...",neg
3,4,Not even the Beatles could write songs everyon...,Nem mesmo os Beatles puderam escrever músicas ...,neg
4,5,Brass pictures movies is not a fitting word fo...,Filmes de fotos de latão não é uma palavra apr...,neg


In [8]:
# Handle on the English review text column as it is at this point.
# NOTE(review): not referenced by any later cell visible here — confirm it is still needed.
txt_column = resenha.text_en

In [9]:
def preprocess(description):
  """Normalize a raw review into lowercase, letters-only, single-spaced text.

  Steps, in order:
    1. remove every word that contains a digit;
    2. lowercase the text and delete every character that is neither an
       ASCII letter nor whitespace;
    3. collapse each run of whitespace into a single space.

  Newlines and tabs are deliberately kept until step 3: deleting them in
  step 2 would glue together words that are separated only by a line break
  or a tab.

  Args:
    description: raw review text (str).

  Returns:
    The cleaned text. Leading and trailing spaces are not stripped.
  """
  # Remove every word that contains a number.
  without_numbers = re.sub(r'\w*\d\w*', '', description)
  # Keep letters and any whitespace; punctuation and symbols are deleted.
  letters_only = re.sub(r'[^a-z\s]', '', without_numbers.lower())
  # Collapse repeated whitespace (spaces, tabs, newlines) into one space.
  return re.sub(r'\s+', ' ', letters_only)

In [10]:
# Replace the English review text with its normalized form, review by review.
resenha['text_en'] = resenha['text_en'].map(preprocess)

In [11]:
# English stop words from NLTK, held in a set for fast membership tests.
stop_words = set(nltk.corpus.stopwords.words('english'))


def remove_stopwords(description):
    """Tokenize `description` and return it without English stop words.

    The text is split with NLTK's English word tokenizer; tokens found in
    `stop_words` are dropped and the rest are joined with single spaces.

    NOTE(review): text whose apostrophes were removed earlier (e.g. "couldnt")
    presumably no longer matches apostrophe-bearing stop-word entries — confirm
    whether that is intended.
    """
    kept_tokens = []
    for word in nltk.word_tokenize(description, language='english'):
        if word in stop_words:
            continue
        kept_tokens.append(word)
    return " ".join(kept_tokens)

In [12]:
# Drop English stop words from every (already normalized) review.
resenha['text_en'] = resenha['text_en'].map(remove_stopwords)

In [13]:
# Preview the first five rows to inspect the cleaned `text_en` column.
resenha.head(n=5)

Unnamed: 0,id,text_en,text_pt,sentiment
0,1,mr costner dragged movie far longer necessary ...,"Mais uma vez, o Sr. Costner arrumou um filme p...",neg
1,2,example majority action films generic boring t...,Este é um exemplo do motivo pelo qual a maiori...,neg
2,3,first hate moronic rappers couldnt act gun pre...,"Primeiro de tudo eu odeio esses raps imbecis, ...",neg
3,4,even beatles could write songs everyone liked ...,Nem mesmo os Beatles puderam escrever músicas ...,neg
4,5,brass pictures movies fitting word really some...,Filmes de fotos de latão não é uma palavra apr...,neg


In [14]:
# TF-IDF features, fitted on (and computed for) every cleaned English review.
tfidf_vectorizer = TfidfVectorizer()
tfidf_X = tfidf_vectorizer.fit_transform(resenha['text_en'])

In [15]:
# Raw term-count features, fitted on (and computed for) every cleaned English review.
count_vectorizer = CountVectorizer()
count_X = count_vectorizer.fit_transform(resenha['text_en'])

In [17]:
# Cosine similarity between every pair of TF-IDF document vectors.
# NOTE(review): with a single argument and default options this is expected to
# produce a dense n_docs x n_docs array; the recorded run of this cell ended in
# a KeyboardInterrupt — presumably too slow or too large for the full corpus.
# TODO confirm, and consider comparing only the sampled documents to the corpus.
tfidf_similarity_matrix = cosine_similarity(tfidf_X)

KeyboardInterrupt: ignored

In [None]:
# Cosine similarity between every pair of term-count document vectors.
# NOTE(review): like any all-pairs similarity, this is expected to produce a
# dense n_docs x n_docs array — confirm it fits in memory for the full corpus.
count_similarity_matrix = cosine_similarity(count_X)

In [1]:
# Zero out each document's similarity with itself so that a document can never
# be selected as its own most similar document.
for similarity_matrix in (tfidf_similarity_matrix, count_similarity_matrix):
    np.fill_diagonal(similarity_matrix, 0.0)


NameError: ignored

In [None]:
# Number of query documents to sample.
N_QUERY_DOCS = 10

# Pick distinct row positions at random. random.sample over range(len(resenha))
# only yields valid positions, whereas random.randint(0, len(resenha)) can
# return len(resenha) itself (its upper bound is inclusive) and make .iloc
# raise IndexError; it can also return the same position more than once.
sampled_positions = random.sample(range(len(resenha)), min(N_QUERY_DOCS, len(resenha)))


def _most_similar(similarity_matrix, position):
    """Return (column position, score) of the largest value in row `position`."""
    row = similarity_matrix[position, :]
    best_position = row.argmax()
    return best_position, row[best_position]


# One entry per sampled document: the document itself, plus its most similar
# document and the similarity score under each vector representation.
result = []
for i in sampled_positions:
    tfidf_best, tfidf_score = _most_similar(tfidf_similarity_matrix, i)
    count_best, count_score = _most_similar(count_similarity_matrix, i)
    result.append({
        'doc': resenha.iloc[i],
        'tfidf_similar_doc': resenha.iloc[tfidf_best],
        'tfidf_similarity': tfidf_score,
        'count_similar_doc': resenha.iloc[count_best],
        'count_similarity': count_score,
    })

In [None]:
%%html
<style>
    /* Padding for the "col" containers used by the similarity result cards
       rendered further down in this notebook. */
    .col {
        height: 100%;
        padding-left: 16px;
        padding-right: 16px;
        padding-top: 8px;
        padding-bottom: 8px;
    }
</style>

1. Escolha 10 documentos da base, e determine seu documento mais parecido
(que não pode ser ele mesmo), usando:
a) Representação vetorial CountVectorizer com similaridade do cosseno;
b) Representação vetorial TF-IDF com similaridade do cosseno.

In [None]:
# Maximum number of characters of review text shown in a card heading / body.
HEADING_LENGTH = 150
DESC_LENGTH = 300


def _review_label(review):
    """Return a short identifier for a review row: its id and sentiment.

    The dataset has no `title` column, so reviews are identified by the `id`
    and `sentiment` columns instead.
    """
    return 'Review #%s (%s)' % (review['id'], review['sentiment'])


def _review_html(review, max_chars=DESC_LENGTH):
    """Render a review row as two <span>s: its label and its truncated text."""
    return '<span>%(title)s</span> <span>%(text_en)s</span>' % {
        'title': _review_label(review),
        'text_en': review['text_en'][:max_chars] + '...',
    }


def _format_score(score):
    """Format a similarity score with four decimals.

    Slicing str(score) would mangle values printed in scientific notation.
    """
    return '%.4f' % score


for r in result:
    # Card heading: identifier of the query review plus the start of its text.
    title = _review_label(r['doc']) + ': ' + r['doc']['text_en'][:HEADING_LENGTH] + '...'

    tfidf_content = _review_html(r['tfidf_similar_doc'])
    count_content = _review_html(r['count_similar_doc'])

    display(HTML('''
    <div style="margin: 8px; border: 1px solid lightgray">
        <div style="text-align: center;background-color: whitesmoke;padding: 8px"><u>''' + title + '''</u></div>
        <div>
            <div>
                <div class="col">
                <div><strong>TfidfVectorizer: </strong>''' + _format_score(r['tfidf_similarity']) + '''</div>
                <div>''' + tfidf_content + '''</div>
                </div>
            </div>
            <div>
                <div class="col">
                <div><strong>CountVectorizer: </strong>''' + _format_score(r['count_similarity']) + '''</div>
                <div>''' + count_content + '''</div>
                </div>
            </div>
        </div>
    </div>
    '''))

2. Elabore um problema de classificação binária de textos coerente com sua base.

a) Determine o rótulo dos documentos (separando os documentos em classes bem definidas).

b) Extraia as representações vetoriais com CountVectorizer e TF-IDF.

c) Treine um classificador baseado em cada uma das duas representações vetoriais e Regressão Logística usando validação cruzada com 70% das amostras selecionadas para treino e 30% para teste. Exiba as matrizes de confusão, métricas de acurácia, precisão, recall e F1 score.

d) Faça o mesmo para o classificador Naive-Bayes.

e) Faça o mesmo para um outro classificador de sua preferência (pesquise na biblioteca Scikit-learn).

f) Compare os 6 resultados.

2.a) Determine o rótulo dos documentos (separando os documentos em classes bem definidas).

In [None]:
# Integer class for each sentiment label (negative -> 0, positive -> 1).
# The keys carry no surrounding spaces; incoming labels are stripped before the
# lookup so that both 'neg' and ' neg ' resolve instead of raising KeyError.
lbl_to_ind = {
    'neg': 0, 'pos': 1,
}
# One label per row of `resenha`, in row order, so that `y` has the same length
# as the feature matrices built from the same rows (no truncation).
y = [lbl_to_ind[label.strip()] for label in resenha.sentiment.values]

2.b) Extraia as representações vetoriais com CountVectorizer e TF-IDF.

In [None]:
# Split both representations in a single call so that they share exactly the
# same train/test documents; the results of the two representations are then
# directly comparable. A fixed seed makes the split reproducible, and
# stratifying on the labels keeps the class proportions equal in both parts.
SPLIT_SEED = 42
(tfidf_X_train, tfidf_X_test,
 count_X_train, count_X_test,
 y_train, y_test) = train_test_split(
    tfidf_X, count_X, y,
    test_size=0.3,
    random_state=SPLIT_SEED,
    stratify=y,
)
# Both representations use the same labels; keep the per-representation names
# that the following cells refer to.
tfidf_y_train = count_y_train = y_train
tfidf_y_test = count_y_test = y_test

2.c) Treine um classificador baseado em cada uma das duas representações vetoriais e Regressão Logística usando validação cruzada com 70% das amostras selecionadas para treino e 30% para teste. Exiba as matrizes de confusão, métricas de acurácia, precisão, recall e F1 score.

In [None]:
# Logistic Regression on TF-IDF features. The feature matrices are passed
# as-is: the estimator accepts sparse input, and converting them with
# .toarray() would allocate a full dense n_samples x n_features array.
cl = LogisticRegression().fit(tfidf_X_train, tfidf_y_train)
tfidf_y_pred = cl.predict(tfidf_X_test)

In [None]:
# Confusion matrix of the model currently bound to `cl` on the TF-IDF test
# set; the sparse matrix is passed directly (no dense copy is needed).
plot_confusion_matrix(cl, tfidf_X_test, tfidf_y_test)
plt.title("Regressão Logística e TF-IDF")
plt.show()

In [None]:
# Test-set metrics for Logistic Regression on TF-IDF features.
accuracy_lr_tfidf = accuracy_score(tfidf_y_test, tfidf_y_pred)
precision_lr_tfidf = precision_score(tfidf_y_test, tfidf_y_pred)
recall_lr_tfidf = recall_score(tfidf_y_test, tfidf_y_pred)
f1_lr_tfidf = f1_score(tfidf_y_test, tfidf_y_pred)

# Heading spelling fixed: "Logística" (was "Logísitca").
print("Regressão Logística e TF-IDF\nAcurácia: {}\nPrecisão: {}\nRecall: {}\nF1 Score: {}\n".format(accuracy_lr_tfidf, precision_lr_tfidf, recall_lr_tfidf, f1_lr_tfidf))


In [None]:
# Logistic Regression on term-count features. The feature matrices are passed
# as-is: the estimator accepts sparse input, and converting them with
# .toarray() would allocate a full dense n_samples x n_features array.
cl = LogisticRegression().fit(count_X_train, count_y_train)
count_y_pred = cl.predict(count_X_test)

In [None]:
# Confusion matrix of the model currently bound to `cl` on the term-count test
# set; the sparse matrix is passed directly (no dense copy is needed).
plot_confusion_matrix(cl, count_X_test, count_y_test)
plt.title("Regressão Logística e CountVectorizer")
plt.show()

In [None]:
# Test-set metrics for Logistic Regression on term-count features.
accuracy_lr_count = accuracy_score(count_y_test, count_y_pred)
precision_lr_count = precision_score(count_y_test, count_y_pred)
recall_lr_count = recall_score(count_y_test, count_y_pred)
f1_lr_count = f1_score(count_y_test, count_y_pred)

# Heading spelling fixed: "Logística" (was "Logísitca").
print("Regressão Logística e CountVectorizer\nAcurácia: {}\nPrecisão: {}\nRecall: {}\nF1 Score: {}\n".format(accuracy_lr_count, precision_lr_count, recall_lr_count, f1_lr_count))


2.d) Faça o mesmo para o classificador Naive-Bayes.

In [None]:
# Multinomial Naive Bayes on TF-IDF features. The feature matrices are passed
# as-is: the estimator accepts sparse input, and converting them with
# .toarray() would allocate a full dense n_samples x n_features array.
cl_nb = MultinomialNB().fit(tfidf_X_train, tfidf_y_train)
tfidf_y_pred_nb = cl_nb.predict(tfidf_X_test)

In [None]:
# Confusion matrix of the model currently bound to `cl_nb` on the TF-IDF test
# set; the sparse matrix is passed directly (no dense copy is needed).
plot_confusion_matrix(cl_nb, tfidf_X_test, tfidf_y_test)
plt.title("Naive Bayes e TF-IDF")
plt.show()

In [None]:
# Score the Naive Bayes / TF-IDF predictions against the test labels:
# accuracy, precision, recall and F1, in that order.
accuracy_nb_tfidf, precision_nb_tfidf, recall_nb_tfidf, f1_nb_tfidf = (
    metric(tfidf_y_test, tfidf_y_pred_nb)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print("Naive Bayes e TF-IDF\nAcurácia: {}\nPrecisão: {}\nRecall: {}\nF1 Score: {}\n".format(
    accuracy_nb_tfidf, precision_nb_tfidf, recall_nb_tfidf, f1_nb_tfidf,
))


In [None]:
# Multinomial Naive Bayes on term-count features. The feature matrices are
# passed as-is: the estimator accepts sparse input, and converting them with
# .toarray() would allocate a full dense n_samples x n_features array.
cl_nb = MultinomialNB().fit(count_X_train, count_y_train)
count_y_pred_nb = cl_nb.predict(count_X_test)

In [None]:
# Confusion matrix of the model currently bound to `cl_nb` on the term-count
# test set; the sparse matrix is passed directly (no dense copy is needed).
plot_confusion_matrix(cl_nb, count_X_test, count_y_test)
plt.title("Naive Bayes e CountVectorizer")
plt.show()

In [None]:
# Score the Naive Bayes / CountVectorizer predictions against the test labels:
# accuracy, precision, recall and F1, in that order.
accuracy_nb_count, precision_nb_count, recall_nb_count, f1_nb_count = (
    metric(count_y_test, count_y_pred_nb)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print("Naive Bayes e CountVectorizer\nAcurácia: {}\nPrecisão: {}\nRecall: {}\nF1 Score: {}\n".format(
    accuracy_nb_count, precision_nb_count, recall_nb_count, f1_nb_count,
))


2.e) Faça o mesmo para um outro classificador de sua preferência (pesquise na biblioteca Scikit-learn).

In [None]:
# Linear SVM on TF-IDF features. The feature matrices are passed as-is: the
# estimator accepts sparse input, and converting them with .toarray() would
# allocate a full dense n_samples x n_features array.
cl_svm = LinearSVC().fit(tfidf_X_train, tfidf_y_train)
tfidf_y_pred_svm = cl_svm.predict(tfidf_X_test)

In [None]:
# Confusion matrix of the model currently bound to `cl_svm` on the TF-IDF test
# set; the sparse matrix is passed directly (no dense copy is needed).
plot_confusion_matrix(cl_svm, tfidf_X_test, tfidf_y_test)
plt.title("SVM e TF-IDF")
plt.show()

In [None]:
# Score the SVM / TF-IDF predictions against the test labels:
# accuracy, precision, recall and F1, in that order.
accuracy_svm_tfidf, precision_svm_tfidf, recall_svm_tfidf, f1_svm_tfidf = (
    metric(tfidf_y_test, tfidf_y_pred_svm)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print("SVM e TF-IDF\nAcurácia: {}\nPrecisão: {}\nRecall: {}\nF1 Score: {}\n".format(
    accuracy_svm_tfidf, precision_svm_tfidf, recall_svm_tfidf, f1_svm_tfidf,
))


In [None]:
# Linear SVM on term-count features. The feature matrices are passed as-is:
# the estimator accepts sparse input, and converting them with .toarray()
# would allocate a full dense n_samples x n_features array.
cl_svm = LinearSVC().fit(count_X_train, count_y_train)
count_y_pred_svm = cl_svm.predict(count_X_test)

In [None]:
# Confusion matrix of the model currently bound to `cl_svm` on the term-count
# test set; the sparse matrix is passed directly (no dense copy is needed).
plot_confusion_matrix(cl_svm, count_X_test, count_y_test)
plt.title("SVM e CountVectorizer")
plt.show()

In [None]:
# Score the SVM / CountVectorizer predictions against the test labels:
# accuracy, precision, recall and F1, in that order.
accuracy_svm_count, precision_svm_count, recall_svm_count, f1_svm_count = (
    metric(count_y_test, count_y_pred_svm)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print("SVM e CountVectorizer\nAcurácia: {}\nPrecisão: {}\nRecall: {}\nF1 Score: {}\n".format(
    accuracy_svm_count, precision_svm_count, recall_svm_count, f1_svm_count,
))


2.f) Compare os 6 resultados.

In [None]:
# Row labels of the comparison table; every result list below follows this order.
index_values2 = ['accuracy', 'precision', 'recall', 'f1']

# Metrics of each classifier / representation pair, in `index_values2` order.
lr_count_results = [accuracy_lr_count, precision_lr_count, recall_lr_count, f1_lr_count]
lr_tfidf_results = [accuracy_lr_tfidf, precision_lr_tfidf, recall_lr_tfidf, f1_lr_tfidf]
nb_count_results = [accuracy_nb_count, precision_nb_count, recall_nb_count, f1_nb_count]
nb_tfidf_results = [accuracy_nb_tfidf, precision_nb_tfidf, recall_nb_tfidf, f1_nb_tfidf]
svm_count_results = [accuracy_svm_count, precision_svm_count, recall_svm_count, f1_svm_count]
svm_tfidf_results = [accuracy_svm_tfidf, precision_svm_tfidf, recall_svm_tfidf, f1_svm_tfidf]

# Column label -> metric values; one column per classifier / representation pair.
data_results2 = dict(zip(
    (
        'Regression with Count',
        'Regression with TF-IDF',
        'Naive Bayes with Count',
        'Naive Bayes with TF-IDF',
        'SVM with Count',
        'SVM with TF-IDF',
    ),
    (
        lr_count_results,
        lr_tfidf_results,
        nb_count_results,
        nb_tfidf_results,
        svm_count_results,
        svm_tfidf_results,
    ),
))

# Show every column of the table instead of eliding the middle ones.
pd.set_option('display.max_columns', None)

df_results2 = pd.DataFrame(data=data_results2, index=index_values2)

df_results2

In [None]:
# Grouped bars: one group per metric, one bar per classifier / representation.
# The metrics are unrelated categories, so bars are used instead of a line
# plot, which would draw connections between them.
ax = df_results2.plot(kind='bar', rot=0, figsize=(10, 6))
ax.set_title('Classifier comparison by vector representation')
ax.set_xlabel('Metric')
ax.set_ylabel('Score')
ax.legend(loc='lower right')
plt.show()