# Importando Bibliotecas

In [34]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import pandas as pd
import re
import string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import  TfidfVectorizer
from numpy import dot
from numpy.linalg import norm


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Tratamento da Base

In [35]:
# Corpus: five Portuguese quotations used as the documents to compare.

frases = [
    'Deus, para a felicidade do homem, inventou a fé o amor e amizade. O Diabo, invejoso, fez o homem confundir fé com religião e amor com casamento.',
    'Não é amigo aquele que alardeia a amizade: é traficante; a amizade sente-se, não se diz',
    'Eu não sou homem que recuse elogios. Amo-os; eles fazem bem à alma e até ao corpo. As melhores digestões da minha vida são as dos jantares em que sou brindado.',
    'Quem tem um amigo, mesmo que um só, não importa onde se encontre, jamais sofrerá de solidão; poderá morrer de saudades, mas não estará só. A verdadeira amizade é aquela que nos permite falar, ao amigo, de todos os seus defeitos e de todas as nossas qualidades.',
    ' A amizade é um amor que nunca morre. A amizade desenvolve a felicidade e reduz o sofrimento, duplicando a nossa alegria e dividindo a nossa dor',
]

# One row per quotation, stored in a single 'frases' column.
df = pd.DataFrame({'frases': frases})


In [36]:
# Text pre-processing: drop tokens that contain digits, lowercase the text and
# replace ASCII punctuation with spaces.

def alphanumeric(x):
    """Replace every run of word characters that contains a digit with a space."""
    return re.sub(r"""\w*\d\w*""", ' ', x)


def punc_lower(x):
    """Lowercase ``x`` and replace each ``string.punctuation`` character with a space."""
    punctuation_class = '[%s]' % re.escape(string.punctuation)
    return re.sub(punctuation_class, ' ', x.lower())


# Apply both steps, in this order, to every phrase and preview the result.
df['frases'] = df['frases'].map(alphanumeric).map(punc_lower)
df.head()

Unnamed: 0,frases
0,deus para a felicidade do homem inventou a f...
1,não é amigo aquele que alardeia a amizade é t...
2,eu não sou homem que recuse elogios amo os e...
3,quem tem um amigo mesmo que um só não import...
4,a amizade é um amor que nunca morre a amizad...


# Bag of Words

In [37]:
# Bag of words: one row per phrase, one column per vocabulary term, each cell
# holding the term count. NLTK's Portuguese stop words are left out of the vocabulary.

cv = CountVectorizer(stop_words = stopwords.words('portuguese'))
X = cv.fit_transform(df.frases).toarray()

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement (requires scikit-learn >= 1.0).
dt = pd.DataFrame(X, columns=cv.get_feature_names_out())
dt.head()



Unnamed: 0,alardeia,alegria,alma,amigo,amizade,amo,amor,bem,brindado,casamento,confundir,corpo,defeitos,desenvolve,deus,diabo,digestões,dividindo,diz,dor,duplicando,elogios,encontre,estará,falar,fazem,felicidade,fez,fé,homem,importa,invejoso,inventou,jamais,jantares,melhores,morre,morrer,nunca,onde,permite,poderá,qualidades,recuse,reduz,religião,saudades,sente,sofrerá,sofrimento,solidão,todas,todos,traficante,verdadeira,vida
0,0,0,0,0,1,0,2,0,0,1,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,1,2,2,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,1,0,0,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0
2,0,0,1,0,0,1,0,1,1,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1
3,0,0,0,2,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,1,0,0,1,0,0,0,1,0,1,1,1,1,0,0,0,1,0,1,0,1,1,1,0,1,0
4,0,1,0,0,2,0,1,0,0,0,0,0,0,1,0,0,0,1,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0


In [43]:
# The exercise only explores the first phrase; the same steps apply to any other row.
# Here it is taken in its CountVectorizer (bag-of-words) representation.

frase1 = list(dt.iloc[0])

In [38]:
# Cosine similarity between two phrase vectors

def cosine(v1, v2):
    """Return the cosine similarity of two equal-length numeric vectors.

    Parameters
    ----------
    v1, v2 : array-like
        One-dimensional vectors of the same length.

    Returns
    -------
    float
        ``dot(v1, v2) / (norm(v1) * norm(v2))``, or ``0.0`` when either
        vector has zero norm.
    """
    denominator = norm(v1) * norm(v2)
    # A zero vector would otherwise produce 0/0 -> nan (with a RuntimeWarning),
    # and nan values make any later sort of the similarities unreliable.
    if denominator == 0:
        return 0.0
    return dot(v1, v2) / denominator

In [47]:
# Compute the similarity of phrase 1 to every phrase and rank the other phrases,
# most similar first.

# The loop variable is deliberately not called `frases`, so it cannot be confused
# with the global list of phrases that is zipped below.
similarity = [cosine(frase1, vector) for vector in X]

# Row 0 is the query phrase itself: drop it explicitly instead of relying on it
# sorting first, then keep at most the four best matches.
sorted(zip(similarity[1:], frases[1:]), reverse=True)[:4]

[(0.2752409412815901,
  '\xa0A\xa0amizade\xa0é um amor que nunca morre. A\xa0amizade\xa0desenvolve a felicidade e reduz o sofrimento, duplicando a nossa alegria e dividindo a nossa dor'),
 (0.1421338109037403,
  'Não é amigo aquele que alardeia a amizade: é traficante; a amizade sente-se, não se diz'),
 (0.11826247919781652,
  'Eu não sou homem que recuse elogios. Amo-os; eles fazem bem à alma e até ao corpo. As melhores digestões da minha vida são as dos jantares em que sou brindado.'),
 (0.045454545454545456,
  'Quem tem um\xa0amigo, mesmo que um só, não importa onde se encontre, jamais sofrerá de solidão; poderá morrer de saudades, mas não estará só. A verdadeira amizade é aquela que nos permite falar, ao\xa0amigo, de todos os seus defeitos e de todas as nossas qualidades.')]

# TF-IDF

In [49]:
# TF-IDF: same layout as the bag of words (one row per phrase, one column per
# vocabulary term), but each cell holds the TF-IDF weight instead of the raw count.

cv_tfidf = TfidfVectorizer(stop_words = stopwords.words('portuguese'))
X_tfidf = cv_tfidf.fit_transform(df.frases).toarray()

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement (requires scikit-learn >= 1.0).
dt_tfidf = pd.DataFrame(X_tfidf, columns=cv_tfidf.get_feature_names_out())
dt_tfidf.head()



Unnamed: 0,alardeia,alegria,alma,amigo,amizade,amo,amor,bem,brindado,casamento,confundir,corpo,defeitos,desenvolve,deus,diabo,digestões,dividindo,diz,dor,duplicando,elogios,encontre,estará,falar,fazem,felicidade,fez,fé,homem,importa,invejoso,inventou,jamais,jantares,melhores,morre,morrer,nunca,onde,permite,poderá,qualidades,recuse,reduz,religião,saudades,sente,sofrerá,sofrimento,solidão,todas,todos,traficante,verdadeira,vida
0,0.0,0.0,0.0,0.0,0.132147,0.0,0.378484,0.0,0.0,0.234561,0.234561,0.0,0.0,0.0,0.234561,0.234561,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.189242,0.234561,0.469121,0.378484,0.0,0.234561,0.234561,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.234561,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.41098,0.0,0.0,0.331576,0.463077,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.41098,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.41098,0.0,0.0,0.0,0.0,0.0,0.41098,0.0,0.0
2,0.0,0.0,0.281151,0.0,0.0,0.281151,0.0,0.281151,0.281151,0.0,0.0,0.281151,0.0,0.0,0.0,0.0,0.281151,0.0,0.0,0.0,0.0,0.281151,0.0,0.0,0.0,0.281151,0.0,0.0,0.0,0.226831,0.0,0.0,0.0,0.0,0.281151,0.281151,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.281151,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.281151
3,0.0,0.0,0.0,0.361523,0.126226,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.224049,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.224049,0.224049,0.224049,0.0,0.0,0.0,0.0,0.0,0.224049,0.0,0.0,0.224049,0.0,0.0,0.0,0.224049,0.0,0.224049,0.224049,0.224049,0.224049,0.0,0.0,0.0,0.224049,0.0,0.224049,0.0,0.224049,0.224049,0.224049,0.0,0.224049,0.0
4,0.0,0.293972,0.0,0.0,0.331238,0.0,0.237175,0.0,0.0,0.0,0.0,0.0,0.0,0.293972,0.0,0.0,0.0,0.293972,0.0,0.293972,0.293972,0.0,0.0,0.0,0.0,0.0,0.237175,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.293972,0.0,0.293972,0.0,0.0,0.0,0.0,0.0,0.293972,0.0,0.0,0.0,0.0,0.293972,0.0,0.0,0.0,0.0,0.0,0.0


In [51]:
# The exercise only explores the first phrase; the same steps apply to any other row.
# Here it is taken in its TF-IDF representation.

frase1 = list(dt_tfidf.iloc[0])

In [52]:
# Compute the similarity of phrase 1 to every phrase using the TF-IDF vectors and
# rank the other phrases, most similar first.

# The loop variable is deliberately not called `frases`, so it cannot be confused
# with the global list of phrases that is zipped below.
similarity_tfidf = [cosine(frase1, vector) for vector in X_tfidf]

# Row 0 is the query phrase itself: drop it explicitly instead of relying on it
# sorting first, then keep at most the four best matches.
sorted(zip(similarity_tfidf[1:], frases[1:]), reverse=True)[:4]

[(0.1784226405296565,
  '\xa0A\xa0amizade\xa0é um amor que nunca morre. A\xa0amizade\xa0desenvolve a felicidade e reduz o sofrimento, duplicando a nossa alegria e dividindo a nossa dor'),
 (0.08585173912563734,
  'Eu não sou homem que recuse elogios. Amo-os; eles fazem bem à alma e até ao corpo. As melhores digestões da minha vida são as dos jantares em que sou brindado.'),
 (0.061194460241091,
  'Não é amigo aquele que alardeia a amizade: é traficante; a amizade sente-se, não se diz'),
 (0.016680367456498174,
  'Quem tem um\xa0amigo, mesmo que um só, não importa onde se encontre, jamais sofrerá de solidão; poderá morrer de saudades, mas não estará só. A verdadeira amizade é aquela que nos permite falar, ao\xa0amigo, de todos os seus defeitos e de todas as nossas qualidades.')]

Em ambos os métodos (bag of words e TF-IDF), a frase mais próxima de "Deus, para a felicidade do homem, inventou a fé o amor e amizade. O Diabo, invejoso, fez o homem confundir fé com religião e amor com casamento." foi "A amizade é um amor que nunca morre. A amizade desenvolve a felicidade e reduz o sofrimento, duplicando a nossa alegria e dividindo a nossa dor", com similaridade do cosseno de aproximadamente 0,275 (bag of words) e 0,178 (TF-IDF).