# Mineração de texto - recursos introdutórios

In [1]:
!pip install nltk
!pip install spacy
!pip install transformers



In [19]:
import pandas as pd
import re
import nltk
import spacy
import string
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
## Garantir que tenha pelo menos esses dados:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('vader_lexicon')

#Você pode instalar tudo disponível executando essa linha
#nltk.download()

[nltk_data] Downloading package stopwords to /home/thiago/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/thiago/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/thiago/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

## Preprocessamento básico com NTLK 

#### Separação de sentenças

In [20]:
from nltk.tokenize import sent_tokenize

texto = "A fazenda é muito grande. O Dr. da Fazenda possui vários animais. Aqui temos um olho d'água."

sentencas = sent_tokenize(texto)

for sent in sentencas:
    print(sent)

A fazenda é muito grande.
O Dr. da Fazenda possui vários animais.
Aqui temos um olho d'água.


#### Tokenização

In [21]:
from nltk.tokenize import word_tokenize

texto = "A fazenda é muito grande. O Dr. da Fazenda possui vários animais. Aqui temos um olho d'água."

palavras = word_tokenize(texto) 

for palavra in palavras:
    print(palavra)

A
fazenda
é
muito
grande
.
O
Dr.
da
Fazenda
possui
vários
animais
.
Aqui
temos
um
olho
d'água
.


### Exemplificando com dados de reviews do Yelp

In [22]:
dfCharlotte= pd.read_csv("reviewsTestCharlotte.csv")

reviewsCharlotte_original = dfCharlotte['text']
reviewsCharlotte_original

reviewsCharlotte_original[0]

'I loved the customer service! Fast, but the doctor took her time to make sure I was happy in my contact lenses. Thank you!'

In [23]:
reviewsCharlotte_original.shape

(48617,)

#### Convertendo para minúsculo

In [24]:
reviewsCharlotte_prep = reviewsCharlotte_original.str.lower()
reviewsCharlotte_prep

0        i loved the customer service! fast, but the do...
1        i've been coming here for a couple of years bu...
2        i really liked the optometrist here. the wait ...
3        robert and his team took great care of me when...
4        edit: i'm lowering this review from 5-stars to...
                               ...                        
48612    we've now had mike out twice to fix some leaks...
48613    love the new location lauren is always right o...
48614    i completely ruined my hair today attempting t...
48615    i have been going to lauren for years now and ...
48616    always available when i need her, lauren rocks...
Name: text, Length: 48617, dtype: object

#### Remove pontuação

In [25]:
PONTUACAO = string.punctuation

def remove_pontuacao(text):
    return text.translate(str.maketrans('', '', PONTUACAO))

reviewsCharlotte_prep = reviewsCharlotte_prep.apply(lambda text: remove_pontuacao(text))
reviewsCharlotte_prep

0        i loved the customer service fast but the doct...
1        ive been coming here for a couple of years but...
2        i really liked the optometrist here the wait w...
3        robert and his team took great care of me when...
4        edit im lowering this review from 5stars to 3s...
                               ...                        
48612    weve now had mike out twice to fix some leaks ...
48613    love the new location lauren is always right o...
48614    i completely ruined my hair today attempting t...
48615    i have been going to lauren for years now and ...
48616    always available when i need her lauren rocks ...
Name: text, Length: 48617, dtype: object

#### Remove stopwords

In [27]:
from nltk.corpus import stopwords

STOPWORDS = set(stopwords.words('english'))

def remove_stopwords(text):
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])

reviewsCharlotte_prep = reviewsCharlotte_prep.apply(lambda text: remove_stopwords(text))
print(reviewsCharlotte_prep)

print(STOPWORDS)


0        loved customer service fast doctor took time m...
1        ive coming couple years optometrist business b...
2        really liked optometrist wait little longer wo...
3        robert team took great care repairing car back...
4        edit im lowering review 5stars 3stars wrote or...
                               ...                        
48612    weve mike twice fix leaks pex pipes first time...
48613    love new location lauren always right point lo...
48614    completely ruined hair today attempting highli...
48615    going lauren years would never see another hai...
48616    always available need lauren rocks particular ...
Name: text, Length: 48617, dtype: object
{'having', "hadn't", "you'll", 't', 'under', "weren't", 'during', 'both', 'further', 'm', 'been', 'y', 'itself', 'same', 'wouldn', 'from', 'such', 'an', 'doesn', 'was', 'to', 'at', 'some', 'in', 'who', 'be', 'what', 'or', 'as', 'll', 'myself', 's', 'how', "don't", 'won', 'few', 'of', 'is', 'hadn', 'me', 'here

#### Stemming

In [28]:
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()

def stem_words(text):
    return " ".join([stemmer.stem(word) for word in text.split()])

reviewsCharlotte_prep= reviewsCharlotte_prep.apply(lambda text: stem_words(text))
reviewsCharlotte_prep

0        love custom servic fast doctor took time make ...
1        ive come coupl year optometrist busi busi yelp...
2        realli like optometrist wait littl longer woul...
3        robert team took great care repair car back ca...
4        edit im lower review 5star 3star wrote origin ...
                               ...                        
48612    weve mike twice fix leak pex pipe first time l...
48613    love new locat lauren alway right point love h...
48614    complet ruin hair today attempt highlight than...
48615    go lauren year would never see anoth hairdress...
48616    alway avail need lauren rock particular red ha...
Name: text, Length: 48617, dtype: object

# Topic Modeling

In [29]:
from gensim.models.ldamodel import LdaModel
from gensim.corpora.dictionary import Dictionary

### Sem pré-processamento

In [11]:
dataset_orig = [d.split() for d in reviewsCharlotte_original]
dictionary_orig = Dictionary(dataset_orig)
corpus_orig = [dictionary_orig.doc2bow(doc) for doc in dataset_orig]

model_original =LdaModel(corpus=corpus_orig, id2word=dictionary_orig, num_topics=10, iterations=100, passes=5,random_state=1)


In [31]:
for i in  model_original.show_topics(num_topics=10, num_words=6, log=False):
    print(i)
    print('---')


(0, '0.023*"games" + 0.010*"cookies" + 0.009*"Asked" + 0.005*"Rooms" + 0.005*"delivered." + 0.004*"purpose"')
---
(1, '0.040*"I" + 0.036*"to" + 0.033*"the" + 0.020*"a" + 0.017*"you" + 0.016*"that"')
---
(2, '0.021*"massage" + 0.008*"Taste" + 0.006*"sake" + 0.006*"Still," + 0.005*"Suites" + 0.005*"Avoid"')
---
(3, '0.005*"Ice" + 0.004*"mold" + 0.004*"Awful" + 0.003*"pleasure." + 0.003*"this?" + 0.003*"Cesar"')
---
(4, '0.050*"the" + 0.040*"and" + 0.039*"was" + 0.030*"I" + 0.026*"a" + 0.016*"The"')
---
(5, '0.015*"Thai" + 0.015*"ribs" + 0.008*"tire" + 0.007*"pork," + 0.007*"Taco" + 0.006*"Fantastic"')
---
(6, '0.012*"Hall" + 0.010*"Western" + 0.004*"tipped" + 0.004*"Yeah," + 0.003*"watch." + 0.002*":)."')
---
(7, '0.045*"hair" + 0.020*"salon" + 0.011*"cut" + 0.010*"color" + 0.010*"polish" + 0.008*"pedicure"')
---
(8, '0.046*"the" + 0.043*"and" + 0.039*"a" + 0.029*"is" + 0.023*"to" + 0.021*"of"')
---
(9, '0.046*"the" + 0.044*"and" + 0.041*"to" + 0.040*"I" + 0.038*"was" + 0.022*"a"')
---


### Com pré-processamento

In [14]:
dataset_prep = [d.split() for d in reviewsCharlotte_prep]
dictionary_prep = Dictionary(dataset_prep)
corpus_prep = [dictionary_prep.doc2bow(doc) for doc in dataset_prep]

model_prep =LdaModel(corpus=corpus_prep, id2word=dictionary_prep, num_topics=10, iterations=100, passes=5,random_state=1)


In [32]:
for i in  model_prep.show_topics(num_topics=10, num_words=6, log=False):
    print(i)
    print('---')


(0, '0.024*"beer" + 0.014*"bar" + 0.013*"place" + 0.013*"great" + 0.012*"good" + 0.010*"nice"')
---
(1, '0.023*"order" + 0.023*"food" + 0.015*"time" + 0.015*"wait" + 0.014*"us" + 0.013*"minut"')
---
(2, '0.021*"breakfast" + 0.018*"coffe" + 0.017*"egg" + 0.017*"brunch" + 0.014*"ice" + 0.013*"cream"')
---
(3, '0.044*"great" + 0.035*"food" + 0.031*"place" + 0.025*"servic" + 0.019*"good" + 0.018*"love"')
---
(4, '0.022*"call" + 0.016*"would" + 0.012*"told" + 0.011*"custom" + 0.011*"said" + 0.010*"servic"')
---
(5, '0.043*"sub" + 0.027*"gift" + 0.023*"japanes" + 0.022*"groceri" + 0.021*"jerk" + 0.018*"fuel"')
---
(6, '0.020*"good" + 0.019*"burger" + 0.015*"chicken" + 0.014*"fri" + 0.014*"order" + 0.011*"like"')
---
(7, '0.016*"like" + 0.014*"get" + 0.012*"go" + 0.010*"im" + 0.010*"look" + 0.010*"place"')
---
(8, '0.123*"airport" + 0.083*"room" + 0.072*"hotel" + 0.053*"stay" + 0.050*"taco" + 0.030*"flight"')
---
(9, '0.015*"car" + 0.014*"help" + 0.011*"work" + 0.011*"store" + 0.010*"need" + 

### Probabilidades dos tópicos em cada documento

In [17]:
docsProbabilities = model_prep.get_document_topics(corpus_prep, minimum_probability=0)

docsProbabilities

<gensim.interfaces.TransformedCorpus at 0x7fe0b9785cd0>

In [18]:
#Probabilidade de cada tópico por documento. Tupla (topico, probabilidade)
for docDistribution in docsProbabilities[0:10]:
    print(docDistribution)

[(0, 0.0071519343), (1, 0.15872245), (2, 0.007151945), (3, 0.25934404), (4, 0.00715396), (5, 0.0071514593), (6, 0.007151999), (7, 0.007152517), (8, 0.0071514593), (9, 0.5318682)]
[(0, 0.0016159642), (1, 0.0016160049), (2, 0.0016156161), (3, 0.001615824), (4, 0.20321089), (5, 0.0016154525), (6, 0.1409837), (7, 0.001615989), (8, 0.0016155083), (9, 0.64449507)]
[(0, 0.001852898), (1, 0.00185335), (2, 0.0018526925), (3, 0.0018531331), (4, 0.0018529615), (5, 0.0018524572), (6, 0.0018528634), (7, 0.5231479), (8, 0.0018524554), (9, 0.46202928)]
[(0, 0.00294224), (1, 0.0029428527), (2, 0.0029422138), (3, 0.0029421574), (4, 0.11584241), (5, 0.0029418338), (6, 0.0029423486), (7, 0.0029422585), (8, 0.0029418422), (9, 0.86061984)]
[(0, 0.08094109), (1, 0.00048701806), (2, 0.00048694856), (3, 0.0004870454), (4, 0.5226361), (5, 0.005358452), (6, 0.029970376), (7, 0.00048702172), (8, 0.00048686858), (9, 0.35865906)]
[(0, 0.33742812), (1, 0.007703148), (2, 0.0077025583), (3, 0.31229982), (4, 0.0077042

# Análise de Sentimentos

### Utilizando o VADER

In [12]:
from nltk.sentiment import SentimentIntensityAnalyzer

sent = SentimentIntensityAnalyzer()

print(sent.polarity_scores("Yes! Data mining is very powerful!"))

{'neg': 0.0, 'neu': 0.385, 'pos': 0.615, 'compound': 0.7489}


A pontuação composta (compound) é calculada de tal maneira que representa a polaridade de -1 a 1, onde +1 é o mais positivo e -1 é o mais negativo.

### Utilizando o Textblob

In [1]:
!pip install -U textblob

Collecting textblob
  Using cached textblob-0.17.1-py2.py3-none-any.whl (636 kB)
Installing collected packages: textblob
Successfully installed textblob-0.17.1


In [2]:
from textblob import TextBlob

frase = TextBlob("Yes! Data mining is very powerful!")
frase.sentiment

Sentiment(polarity=0.48750000000000004, subjectivity=1.0)

### Exemplo com os reviews do Yelp

In [26]:
def verifica_MuitoPositivo(review):
    
    return sent.polarity_scores(review)["compound"] > 0.8


for review in reviewsCharlotte_original[0:10]:
    
    if verifica_MuitoPositivo(review):
        print(review)
        print('----')

I loved the customer service! Fast, but the doctor took her time to make sure I was happy in my contact lenses. Thank you!
----
Robert and his team took great care of me when repairing my car back up camera. The team was able to identify the issue and resolve it for a low cost option. The team could have taken advantage of me and oversold equipment that wasn't needed, however they were very upfront about the root cause of the issue and I was very pleased with the repair and the repair cost !
----
Edit: I'm lowering this review from 5-stars to 3-stars.  I wrote the original review prematurely. After having been around inside my car a bit I have noticed broken rivets & clips and pieces of carpet were never put back where they should have been. Also, many leftover pieces of wire and debris were left in the crevices and sides of the seat. My biggest issue is every time I turn my car on I hear a pop come from the JL subwoofer that was bought and installed at Freeman's. I brought this up to 

# Transformers - Frases similares com sBert

In [2]:
#!pip install -U sentence-transformers
from sentence_transformers import SentenceTransformer, util

model_RobLarge = SentenceTransformer('all-MiniLM-L6-v2', device='cuda')


Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [5]:
dfTweetsTrump = pd.read_csv("tweets_trump.csv")

#Usando somente com os 10000 primeiros -
embeddingsTweets_ROBbase = model_RobLarge.encode(dfTweetsTrump.loc[0:10000,'text'], convert_to_tensor=True)

In [6]:
embeddingsTweets_ROBbase
matSimi = cosine_similarity(embeddingsTweets_ROBbase.cpu().detach().numpy())

In [13]:
maior = 0
count = 0
docID = -1
docTarget = 120

for i in matSimi[docTarget,:]:
    if count == docTarget:
        count +=1
        continue
        
    if i> maior:
        maior = i
        docID = count
    
    count+=1

In [14]:
print('****target***')
print(dfTweetsTrump.loc[docTarget,'text'])
print()
print("*** mais similar ***")
print(dfTweetsTrump.loc[docID,'text'])

print(maior)
print(docID)

****target***
RT @Claire_FOX5: #BREAKING:  @GaSecofState's office confirms Floyd County has found 2,600 ballots during audit.  Says Sec. Raffensperger wa…

*** mais similar ***
Investigators Dispatched After Fulton County Discovers ‘Issue‘ with Ballot Reporting https://t.co/lShmKksQ0O via @BreitbartNews
0.61102223
2582


## Outros exemplos com Transformers

### Classificação - zero-shot

In [5]:
from transformers import pipeline

classifier = pipeline("zero-shot-classification", model='valhalla/distilbart-mnli-12-1')


In [7]:
classifier(
    ["This is the Data Mining course from CPGEI",
     "I have a very good stock option, gonna be rich!"],
    
    candidate_labels=["education", "politics", "finance", "computer"],
)

[{'sequence': 'This is the Data Mining course from CPGEI',
  'labels': ['computer', 'education', 'finance', 'politics'],
  'scores': [0.5431435108184814,
   0.35852858424186707,
   0.058774080127477646,
   0.039553768932819366]},
 {'sequence': 'I have a very good stock option, gonna be rich!',
  'labels': ['finance', 'computer', 'education', 'politics'],
  'scores': [0.882773756980896,
   0.04628705978393555,
   0.046192556619644165,
   0.02474653534591198]}]

### NER - Named Entity Recognition

In [18]:
from transformers import pipeline

ner = pipeline("ner", grouped_entities=True)



No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english)


Downloading:   0%|          | 0.00/998 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.24G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/60.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]



In [19]:
ner("I am Thiago and a teach Data Mining at CPGEI.")

[{'entity_group': 'PER',
  'score': 0.9912441,
  'word': 'Thiago',
  'start': 5,
  'end': 11},
 {'entity_group': 'ORG',
  'score': 0.9959862,
  'word': 'CPGEI',
  'start': 39,
  'end': 44}]

In [None]:
## exemplos de transformers: https://huggingface.co/models

In [None]:
## spacy transformers: https://spacy.io/usage/embeddings-transformers#transformers