<a href="https://colab.research.google.com/github/ABMHub/NLP/blob/main/T2/preprocess2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Inicialização

In [31]:
import pandas as pd
import numpy as np
import seaborn as sns
import re
import nltk
from nltk.stem.porter import *
from typing import List
import matplotlib.pyplot as plt

nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Aula Feature Engineering

In [32]:
corpus = [
          "Batatinha quando nasce esparrama pelo chão igual batatinha",
          "A pior experiência da minha vida",
          "Quero meu dinheiro de volta pois é meu e é meu",
          "A experiência do dinheiro esparrama minha vida",
          "Açaí é a melhor coisa da vida"
]

corpus

['Batatinha quando nasce esparrama pelo chão igual batatinha',
 'A pior experiência da minha vida',
 'Quero meu dinheiro de volta pois é meu e é meu',
 'A experiência do dinheiro esparrama minha vida',
 'Açaí é a melhor coisa da vida']

In [33]:
frasezona = " ".join(corpus).lower()
frasezona

'batatinha quando nasce esparrama pelo chão igual batatinha a pior experiência da minha vida quero meu dinheiro de volta pois é meu e é meu a experiência do dinheiro esparrama minha vida açaí é a melhor coisa da vida'

In [34]:
tokens = frasezona.split()
tokens

['batatinha',
 'quando',
 'nasce',
 'esparrama',
 'pelo',
 'chão',
 'igual',
 'batatinha',
 'a',
 'pior',
 'experiência',
 'da',
 'minha',
 'vida',
 'quero',
 'meu',
 'dinheiro',
 'de',
 'volta',
 'pois',
 'é',
 'meu',
 'e',
 'é',
 'meu',
 'a',
 'experiência',
 'do',
 'dinheiro',
 'esparrama',
 'minha',
 'vida',
 'açaí',
 'é',
 'a',
 'melhor',
 'coisa',
 'da',
 'vida']

In [35]:
vocab = pd.Series(list(set(tokens)))
vocab.sort_values()

6               a
24           açaí
9       batatinha
16           chão
22          coisa
19             da
12             de
21       dinheiro
3              do
8               e
4       esparrama
7     experiência
17          igual
10         melhor
20            meu
0           minha
5           nasce
2            pelo
13           pior
15           pois
1          quando
14          quero
23           vida
18          volta
11              é
dtype: object

In [36]:
bow = dict()
for i in range(len(corpus)):
  bow[i] = corpus[i].lower().split()

bow

{0: ['batatinha',
  'quando',
  'nasce',
  'esparrama',
  'pelo',
  'chão',
  'igual',
  'batatinha'],
 1: ['a', 'pior', 'experiência', 'da', 'minha', 'vida'],
 2: ['quero',
  'meu',
  'dinheiro',
  'de',
  'volta',
  'pois',
  'é',
  'meu',
  'e',
  'é',
  'meu'],
 3: ['a', 'experiência', 'do', 'dinheiro', 'esparrama', 'minha', 'vida'],
 4: ['açaí', 'é', 'a', 'melhor', 'coisa', 'da', 'vida']}

Bag of Words - Binário
0 caso não esteja no texto, 1 caso esteja

In [37]:
bow_binario = dict()

for i, sent in enumerate(corpus):
  bow_binario[i] = dict()
  for wrd in sent.lower().split():
    bow_binario[i][wrd] = 1

df_bin = pd.DataFrame().from_records(bow_binario).T.fillna(0)
df_bin = df_bin.astype(int)
df_bin
# plt.figure(figsize=(21, 4))
# sns.heatmap(df_bin)


Unnamed: 0,batatinha,quando,nasce,esparrama,pelo,chão,igual,a,pior,experiência,...,dinheiro,de,volta,pois,é,e,do,açaí,melhor,coisa
0,1,1,1,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,0,0,0,0
3,0,0,0,1,0,0,0,1,0,1,...,1,0,0,0,0,0,1,0,0,0
4,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,1,1,1


Bag of words - contagem
Conta quantas vezes cada palavra apareceu no texto

In [38]:
bow_contagem = dict()

for i, sent in enumerate(corpus):
  bow_contagem[i] = dict()
  for wrd in sent.lower().split():
    if wrd in bow_contagem[i].keys():
      bow_contagem[i][wrd] += 1
    else:
      bow_contagem[i][wrd] = 1

df_contagem = pd.DataFrame().from_records(bow_contagem).T.fillna(0)
df_contagem = df_contagem.astype(int)
df_contagem

Unnamed: 0,batatinha,quando,nasce,esparrama,pelo,chão,igual,a,pior,experiência,...,dinheiro,de,volta,pois,é,e,do,açaí,melhor,coisa
0,2,1,1,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,2,1,0,0,0,0
3,0,0,0,1,0,0,0,1,0,1,...,1,0,0,0,0,0,1,0,0,0
4,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,1,1,1


In [39]:
bow_ponderado = dict()
N = dict()

for i, sent in enumerate(corpus):
  bow_ponderado[i] = dict()
  sent_pos_proc = sent.lower().split()
  siz = len(sent_pos_proc)
  for wrd in sent_pos_proc:
    if wrd in bow_ponderado[i].keys():
      bow_ponderado[i][wrd] += 1/siz
    else:
      bow_ponderado[i][wrd] = 1/siz

bow_ponderado

{0: {'batatinha': 0.25,
  'chão': 0.125,
  'esparrama': 0.125,
  'igual': 0.125,
  'nasce': 0.125,
  'pelo': 0.125,
  'quando': 0.125},
 1: {'a': 0.16666666666666666,
  'da': 0.16666666666666666,
  'experiência': 0.16666666666666666,
  'minha': 0.16666666666666666,
  'pior': 0.16666666666666666,
  'vida': 0.16666666666666666},
 2: {'de': 0.09090909090909091,
  'dinheiro': 0.09090909090909091,
  'e': 0.09090909090909091,
  'meu': 0.2727272727272727,
  'pois': 0.09090909090909091,
  'quero': 0.09090909090909091,
  'volta': 0.09090909090909091,
  'é': 0.18181818181818182},
 3: {'a': 0.14285714285714285,
  'dinheiro': 0.14285714285714285,
  'do': 0.14285714285714285,
  'esparrama': 0.14285714285714285,
  'experiência': 0.14285714285714285,
  'minha': 0.14285714285714285,
  'vida': 0.14285714285714285},
 4: {'a': 0.14285714285714285,
  'açaí': 0.14285714285714285,
  'coisa': 0.14285714285714285,
  'da': 0.14285714285714285,
  'melhor': 0.14285714285714285,
  'vida': 0.14285714285714285,
  '

Similaridade de textos: comparação de cosseno

In [68]:
def norma(vec):
  norma = 0
  for x in vec:
    norma += x*x

  return norma ** 0.5

def similaridade_de_coseno(vec1, vec2):
  prod_interno = 0
  for i in range(len(vec1)):
    prod_interno += vec1[i] * vec2[i]

  norma1 = norma(vec1)
  norma2 = norma(vec2)

  print(prod_interno, norma1, norma2)

  return prod_interno/(norma1*norma2)

In [69]:
similaridade_de_coseno([0,0,1,1], [0, 0, 1, 1])

2 1.4142135623730951 1.4142135623730951


0.9999999999999998

In [60]:
document_frequency = dict()

for i, sent in enumerate(corpus):
  for wrd in set(sent.lower().split()):
    if wrd not in document_frequency:
      document_frequency[wrd] = 1
    else:
      document_frequency[wrd] += 1

s = pd.Series(document_frequency)
s = np.log(len(corpus) / s)
s

batatinha      1.609438
esparrama      0.916291
quando         1.609438
nasce          1.609438
pelo           1.609438
chão           1.609438
igual          1.609438
da             0.916291
minha          0.916291
pior           1.609438
vida           0.510826
a              0.510826
experiência    0.916291
e              1.609438
meu            1.609438
é              0.916291
dinheiro       0.916291
de             1.609438
quero          1.609438
pois           1.609438
volta          1.609438
do             1.609438
melhor         1.609438
coisa          1.609438
açaí           1.609438
dtype: float64

In [63]:
df_term_frequency = pd.DataFrame().from_records(bow_ponderado).fillna(0).T
df_term_frequency

Unnamed: 0,batatinha,quando,nasce,esparrama,pelo,chão,igual,a,pior,experiência,...,dinheiro,de,volta,pois,é,e,do,açaí,melhor,coisa
0,0.25,0.125,0.125,0.125,0.125,0.125,0.125,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.166667,0.166667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.090909,0.090909,0.090909,0.090909,0.181818,0.090909,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.142857,0.0,0.0,0.0,0.142857,0.0,0.142857,...,0.142857,0.0,0.0,0.0,0.0,0.0,0.142857,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.142857,0.0,0.0,...,0.0,0.0,0.0,0.0,0.142857,0.0,0.0,0.142857,0.142857,0.142857


In [67]:
for a in s.index:
  df_term_frequency[a] *= s[a]

df_term_frequency

Unnamed: 0,batatinha,quando,nasce,esparrama,pelo,chão,igual,a,pior,experiência,...,dinheiro,de,volta,pois,é,e,do,açaí,melhor,coisa
0,0.402359,0.20118,0.20118,0.114536,0.20118,0.20118,0.20118,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.085138,0.26824,0.152715,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.083299,0.146313,0.146313,0.146313,0.166598,0.146313,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.130899,0.0,0.0,0.0,0.072975,0.0,0.130899,...,0.130899,0.0,0.0,0.0,0.0,0.0,0.22992,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.072975,0.0,0.0,...,0.0,0.0,0.0,0.0,0.130899,0.0,0.0,0.22992,0.22992,0.22992


In [71]:
similaridade_de_coseno(df_term_frequency.iloc[0], df_term_frequency.iloc[3])

0.01499265545211562 0.6143111265263488 0.3633891165422351


0.06716117150546966