<img src="https://github.com/FIUBA-Posgrado-Inteligencia-Artificial/procesamiento_lenguaje_natural/raw/main/logoFIUBA.jpg" width="500" align="center">


# Procesamiento de lenguaje natural
## Vectorización


In [16]:
import numpy as np
np.set_printoptions(threshold=np.inf, edgeitems=50,linewidth=200) # para mostrar mas elementos en print(np.array)

In [17]:
def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * (np.linalg.norm(b)))

### Datos

In [18]:
corpus = np.array(['que dia es hoy', 'martes el dia de hoy es martes', 'martes muchas gracias'])

Documento 1 --> que dia es hoy \
Documento 2 --> martes el dia de hoy es martes \
Documento 3 --> martes muchas gracias

### 1 - Obtener el vocabulario del corpus (los términos utilizados)
- Cada documento transformarlo en una lista de términos
- Armar un vector de términos no repetidos de todos los documentos

In [19]:
def get_unique_text(corpus : np.ndarray) -> np.ndarray:
    """
    Gets unique words from a corpus

    Args:
        corpus -> np.ndarray: array of corpus text.

    Returns:
        rtype -> np.ndarray: array with unique words
    """
    temp = []
    for phrase in corpus:
        temp.append(phrase.lower().split())

    vector = np.hstack(temp)
    uniques = np.unique(vector) 
    return vector , uniques

In [20]:
text, unique_text = get_unique_text(corpus)
print(text)
print(unique_text)

['que' 'dia' 'es' 'hoy' 'martes' 'el' 'dia' 'de' 'hoy' 'es' 'martes' 'martes' 'muchas' 'gracias']
['de' 'dia' 'el' 'es' 'gracias' 'hoy' 'martes' 'muchas' 'que']


### 2- OneHot encoding
Data una lista de textos, devolver una matriz con la representación oneHotEncoding de estos

In [21]:
def one_hot_encoding(text_list : np.ndarray, show_columns = False)-> np.ndarray:
    """
    Performs one-hot encoding onto an array of texts.

    Args:
        text_list : np.ndarray -> array of texts.
        show_columns : bool -> whether to show or not the columns of the one-hot encoding matrix
    
    Returns:
        np.ndarray -> one-hot encoding matrix

    """
    unique, inverse = np.unique(text_list, return_inverse=True)
    onehot = np.eye(unique.shape[0])[inverse]
    if show_columns:
        print(unique)
    return onehot

In [22]:
one_hot_encoding(text, True)

['de' 'dia' 'el' 'es' 'gracias' 'hoy' 'martes' 'muchas' 'que']


array([[0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0.]])

### 3- Vectores de frecuencia
Data una lista de textos, devolver una matriz con la representación de frecuencia de estos

In [23]:
def get_frequency_matrix(corpus:np.ndarray, unique_text:np.ndarray, show_columns = False) -> np.ndarray:
    """
    Creates a matrix, representing the frequency of every word within a corpus.

    Args:
        corpus : np.ndarray -> corpus of text.
        unique_text : np.ndarray -> list of unique words.
        show_columns : bool -> whether to print or not the columns of the matrix.
    Returns:
        np.ndarray -> frequency matrix.
    """
    frequency_matrix = np.zeros((corpus.size,unique_text.size))
    for i,word  in enumerate(corpus):
        for  j,unique_word in enumerate(unique_text):
            
            if unique_word in word.lower().split():
                
                frequency_matrix[i,j] = word.lower().split().count(unique_word)
    if show_columns:
        print("columns:",unique_text)
    return frequency_matrix

In [24]:
corpus

array(['que dia es hoy', 'martes el dia de hoy es martes', 'martes muchas gracias'], dtype='<U30')

In [25]:
unique_text

array(['de', 'dia', 'el', 'es', 'gracias', 'hoy', 'martes', 'muchas', 'que'], dtype='<U7')

In [26]:
get_frequency_matrix(corpus, unique_text, True)

columns: ['de' 'dia' 'el' 'es' 'gracias' 'hoy' 'martes' 'muchas' 'que']


array([[0., 1., 0., 1., 0., 1., 0., 0., 1.],
       [1., 1., 1., 1., 0., 1., 2., 0., 0.],
       [0., 0., 0., 0., 1., 0., 1., 1., 0.]])

### 4- TF-IDF
Data una lista de textos, devolver una matriz con la representacion TFIDF

In [31]:
def get_tf_idf(corpus : np.ndarray, show_columns = False)-> np.ndarray:
    """
    Applies TF-IDF vector representation onto a corpus of text.
    
    Args:
        corpus : np.ndarray -> corpus of text.
        show_columns : bool -> whether to show or not the matrix columns
    """
    text, unique_text = get_unique_text(corpus)
    n = np.sum(one_hot_encoding(text),axis =0)

    idf = np.log10(corpus.size/n)
    tf = get_frequency_matrix(corpus,unique_text)
    if show_columns:
        print(unique_text)
    return tf*idf
    

In [28]:
get_tf_idf(corpus, True)

['de' 'dia' 'el' 'es' 'gracias' 'hoy' 'martes' 'muchas' 'que']


array([[0.        , 0.17609126, 0.        , 0.17609126, 0.        , 0.17609126, 0.        , 0.        , 0.47712125],
       [0.47712125, 0.17609126, 0.47712125, 0.17609126, 0.        , 0.17609126, 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.47712125, 0.        , 0.        , 0.47712125, 0.        ]])

### 5 - Comparación de documentos
Realizar una funcion que reciba el corpus y el índice de un documento y devuelva los documentos ordenados por la similitud coseno

In [29]:
def order_by_cosine(corpus: np.ndarray,index : int, show_similarities = False):
    """
    Applies the cosine similarity between a indexed text(target) and all of the corpus,
    returns the corpus ordered by similarities.

    Args:
        corpus : np.ndarray -> corpus of documents.
        index : int -> target vector
        show_similarities : bool -> whether to show or not the similarity vector.
    
    """
    similarity = []
    tf_idf = get_tf_idf(corpus)
    try:
        target = tf_idf[index]
        for i in tf_idf:

            similarity.append(cosine_similarity(i,target))
        similarity = np.array(similarity)
        if show_similarities:
            print(similarity[similarity.argsort()[::-1]])
        return(corpus[similarity.argsort()[::-1]])
    except Exception as err:
        print(err)


In [30]:
order_by_cosine(corpus, 1, True)

[1.         0.22184708 0.        ]


array(['martes el dia de hoy es martes', 'que dia es hoy', 'martes muchas gracias'], dtype='<U30')