# Procesamiento de lenguaje natural
## Word2vect


In [1]:
import numpy as np
import pandas as pd

In [2]:
def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * (np.linalg.norm(b)))

### Datos

In [3]:
corpus = np.array(['que dia es hoy', 'martes el dia de hoy es martes', 'martes muchas gracias'])

### 1 - Obtener el vocabulario del corpus (los términos utilizados)
- Cada documento transformarlo en una lista de términos
- Armar un vector de términos no repetidos de todos los documentos

In [4]:
voc = []

for i, doc in enumerate(corpus):
    sentence = doc.split()
    print(f"Documento {i+1}: {sentence}")
    for word in sentence:
        if word not in voc:
            voc.append(word)
print()
print(f"Vocabulario: {voc}")

Documento 1: ['que', 'dia', 'es', 'hoy']
Documento 2: ['martes', 'el', 'dia', 'de', 'hoy', 'es', 'martes']
Documento 3: ['martes', 'muchas', 'gracias']

Vocabulario: ['que', 'dia', 'es', 'hoy', 'martes', 'el', 'de', 'muchas', 'gracias']


### 2- OneHot encoding
Data una lista de textos, devolver una matriz con la representación oneHotEncoding de estos

In [5]:
# Dimensiones de OHE 3 filas (1 por documento) por 9 columnas (dimension del vocabulario)
n_doc = len(corpus)
n_voc = len(voc)

In [6]:
ohe = np.zeros((n_doc,n_voc))

for j, doc in enumerate(corpus):
    for i, word in enumerate(voc):
        if voc[i] in doc:
            ohe[j, i] += 1
            
print(ohe)

[[1. 1. 1. 1. 0. 0. 0. 0. 0.]
 [0. 1. 1. 1. 1. 1. 1. 0. 0.]
 [0. 0. 1. 0. 1. 0. 0. 1. 1.]]


### 3- Vectores de frecuencia
Data una lista de textos, devolver una matriz con la representación de frecuencia de estos

In [7]:
#TF

tf = np.zeros((n_doc,n_voc))

for j, doc in enumerate(corpus):
    sentence = doc.split()
    print(sentence)
    for i, word in enumerate(sentence):
        for k in range(len(voc)):
            if word == voc[k]:
                tf[j, k] += 1

tf_df = pd.DataFrame(tf,index=["Doc 1", "Doc 2", "Doc 3"], columns=voc)

#IDF
idf = np.zeros(n_voc)

for i, word in enumerate(voc):
    n=0
    for doc in corpus:
        sentence = doc.split()
        if word in sentence:
            n += 1
    idf[i] = np.log10(n_doc/n)

idf_df = pd.DataFrame(idf.reshape(1,-1),index=["IDF"], columns=voc)

['que', 'dia', 'es', 'hoy']
['martes', 'el', 'dia', 'de', 'hoy', 'es', 'martes']
['martes', 'muchas', 'gracias']


In [8]:
tf_df

Unnamed: 0,que,dia,es,hoy,martes,el,de,muchas,gracias
Doc 1,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
Doc 2,0.0,1.0,1.0,1.0,2.0,1.0,1.0,0.0,0.0
Doc 3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0


In [9]:
idf_df

Unnamed: 0,que,dia,es,hoy,martes,el,de,muchas,gracias
IDF,0.477121,0.176091,0.176091,0.176091,0.176091,0.477121,0.477121,0.477121,0.477121


### 4- TF-IDF
Data una lista de textos, devolver una matriz con la representacion TFIDF

In [10]:
tf_idf =  np.zeros(tf.shape)

for i in range(tf_idf.shape[0]):
    tf_idf[i,:] = tf[i,:]*idf.reshape(1,-1)
    
tf_idf_df = pd.DataFrame(tf_idf,index=None, columns=voc)

In [11]:
tf_idf_df

Unnamed: 0,que,dia,es,hoy,martes,el,de,muchas,gracias
0,0.477121,0.176091,0.176091,0.176091,0.0,0.0,0.0,0.0,0.0
1,0.0,0.176091,0.176091,0.176091,0.352183,0.477121,0.477121,0.0,0.0
2,0.0,0.0,0.0,0.0,0.176091,0.0,0.0,0.477121,0.477121


### 5 - Comparación de documentos
Realizar una funcion que reciba el corpus y el índice de un documento y devuelva los documentos ordenados por la similitud coseno

In [12]:
print("Comparacion Documento 1 vs Documento 2")
print(f"Documento 1: {corpus[0]}")
print(f"Documento 2: {corpus[1]}")
print(f"Similitud coseno: {cosine_similarity(tf_idf[0,:], tf_idf[1,:])}")
print()
print("Comparacion Documento 1 vs Documento 3")
print(f"Documento 1: {corpus[0]}")
print(f"Documento 2: {corpus[2]}")
print(f"Similitud coseno: {cosine_similarity(tf_idf[0,:], tf_idf[2,:])}")
print()
print("Comparacion Documento 2 vs Documento 3")
print(f"Documento 1: {corpus[1]}")
print(f"Documento 2: {corpus[2]}")
print(f"Similitud coseno: {cosine_similarity(tf_idf[1,:], tf_idf[2,:])}")


Comparacion Documento 1 vs Documento 2
Documento 1: que dia es hoy
Documento 2: martes el dia de hoy es martes
Similitud coseno: 0.2003419026809871

Comparacion Documento 1 vs Documento 3
Documento 1: que dia es hoy
Documento 2: martes muchas gracias
Similitud coseno: 0.0

Comparacion Documento 2 vs Documento 3
Documento 1: martes el dia de hoy es martes
Documento 2: martes muchas gracias
Similitud coseno: 0.10845711727883083
