<img src="https://github.com/FIUBA-Posgrado-Inteligencia-Artificial/procesamiento_lenguaje_natural/raw/main/logoFIUBA.jpg" width="500" align="center">


# Natual Language Processing
## Vectorization


In [3]:
import numpy as np

In [4]:
def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * (np.linalg.norm(b)))

### Datos

In [5]:
corpus = np.array(
    ['que dia es hoy', 'martes el dia de hoy es martes', 'martes muchas gracias'])

Documento 1 --> que dia es hoy \
Documento 2 --> martes el dia de hoy es martes \
Documento 3 --> martes muchas gracias

### 1 - Obtener el vocabulario del corpus (los términos utilizados)
- Cada documento transformarlo en una lista de términos
- Armar un vector de términos no repetidos de todos los documentos

In [5]:
import nltk
nltk.download('punkt')


# List of words
document1 = "que dia es hoy"
document2 = "martes el dia de hoy es martes"
document3 = "martes muchas gracias"


def document2list(lst):
    return ([i for i in lst.split()])


print(document2list(document1))

print(document2list(document2))

document3_to_list = nltk.word_tokenize(document3)
print(document3_to_list)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\fSIoF\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


['que', 'dia', 'es', 'hoy']
['martes', 'el', 'dia', 'de', 'hoy', 'es', 'martes']
['martes', 'muchas', 'gracias']


In [7]:
# Transform text to list using python
document1_to_list = document1.split()
print(document1_to_list)
document2_to_list = document2.split()
print(document2_to_list)
document3_to_list = document3.split()
print(document3_to_list)

['que', 'dia', 'es', 'hoy']
['martes', 'el', 'dia', 'de', 'hoy', 'es', 'martes']
['martes', 'muchas', 'gracias']


In [11]:
from sklearn.feature_extraction.text import CountVectorizer

corpus = ["que dia es hoy",
          "martes el dia de hoy es martes",
          "martes muchas gracias"]

# Create an object of CountVectorizer class
vectorizer = CountVectorizer()

# Tokenize and buil vocabulary
vectorizer.fit(corpus)

# Encode
vector = vectorizer.transform(corpus)
print(vectorizer.vocabulary_)

{'que': 8, 'dia': 1, 'es': 3, 'hoy': 5, 'martes': 6, 'el': 2, 'de': 0, 'muchas': 7, 'gracias': 4}


In [8]:
import re
import string


def count_occurrence(str):
    pattern = r'[^a-zA-z0-9.,!?/:;\"\'\s]'
    str_modified = re.sub(pattern, '', str)
    string_modified = ''.join([c for c in str_modified if c not in string.punctuation])
    text = string_modified.split()
    counts = {}    
    for word in text:
        if counts.get(word) is None:
            counts[word] = 1
        else:
            counts[word]+= 1

    duplicated = {key:value for key, value in counts.items() if value==1}

    sorted_counts = dict(
        sorted(duplicated.items(), key=lambda item: item[1], reverse=True)
    )

    return sorted_counts


In [9]:
corpus = "que dia es hoy, martes el dia de hoy es martes, martes muchas gracias"
count_occurrence(corpus)

{'que': 1, 'el': 1, 'de': 1, 'muchas': 1, 'gracias': 1}

### 2- OneHot encoding
Dada una lista de textos, devolver una matriz con la representación oneHotEncoding de estos

In [31]:
# Function with one-hot-encoding
from sklearn.preprocessing import Binarizer

corpus = ['The cat sat on the mat.',
          'The dog chased the cat.',
           'CS224n at Stanford is the best NLP class you can ever take!']


freq = CountVectorizer()
corpus = freq.fit_transform(corpus)
one_hot_enconding = Binarizer()
matrix_words = one_hot_enconding.fit_transform(corpus.toarray())
print(matrix_words)

[[0 0 0 1 0 0 0 0 0 0 1 0 1 1 0 0 1 0]
 [0 0 0 1 1 0 0 1 0 0 0 0 0 0 0 0 1 0]
 [1 1 1 0 0 1 1 0 1 1 0 1 0 0 1 1 1 1]]


In [85]:
def one_hot_encoding(corpus):
    unique_words = set()
    # Convert the corpus in arrays of documents
    for document in corpus:
        for word in document.split():
            unique_words.add(word.lower())

    # Word to index 
    word_to_index = {}
    for i, word in  enumerate(unique_words):
        word_to_index[word] = i
    ohe_vector = []
    document_vectors = []

    for document in corpus:
        for word in document.split():
            vector = np.zeros(len(unique_words))
            vector[word_to_index[word.lower()]] = 1
            document_vectors.append(vector)
        ohe_vector.append(document_vectors)
    
    return ohe_vector

In [86]:
corpus = ['The cat sat on the mat.',
          'The dog chased the cat.',
           'CS224n at Stanford is the best NLP class you can ever take!']
one_hot_encoding(corpus)

[[array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
         0., 0.]),
  array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0.]),
  array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         1., 0.]),
  array([0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0.]),
  array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
         0., 0.]),
  array([0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0.]),
  array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
         0., 0.]),
  array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 1.]),
  array([0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0.]),
  array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
         0., 0.]),
  array([0., 0., 0., 0., 0., 0., 0., 0.,

### 3- Vectores de frecuencia
Dada una lista de textos, devolver una matriz con la representación de frecuencia de estos

In [35]:
from sklearn.feature_extraction.text import CountVectorizer

sentences = ['The cat sat on the mat.',
             'The dog chased the cat.',
             'CS224n at Stanford is the best NLP class you can ever take!']

vectorizer = CountVectorizer(lowercase=False)

vectorizer.fit(sentences)

print(vectorizer.vocabulary_)

print(vectorizer.transform(sentences).toarray())

{'The': 3, 'cat': 7, 'sat': 15, 'on': 14, 'the': 17, 'mat': 13, 'dog': 10, 'chased': 8, 'CS224n': 0, 'at': 4, 'Stanford': 2, 'is': 12, 'best': 5, 'NLP': 1, 'class': 9, 'you': 18, 'can': 6, 'ever': 11, 'take': 16}
[[0 0 0 1 0 0 0 1 0 0 0 0 0 1 1 1 0 1 0]
 [0 0 0 1 0 0 0 1 1 0 1 0 0 0 0 0 0 1 0]
 [1 1 1 0 1 1 1 0 0 1 0 1 1 0 0 0 1 1 1]]


In [91]:
list_documents=[]
list_text=[]
word_list=[]
word_dictionary = []

def frequency(corpus):
    for document in corpus:
        list_documents.append(document)
        pattern = r'[^a-zA-z0-9.,!?/:;\"\'\s]'
        for str in list_documents:
            str_modified = re.sub(pattern, '', str)
            string_modified = ''.join([c for c in str_modified if c not in string.punctuation])
            text = string_modified.split()
            for word in text:
                word_list.append(word)
        list_text.append(string_modified.lower().split())
        for i in word_list:
            if i not in word_dictionary:
                word_dictionary.append(i.lower())
        matrix_fq = np.zeros(len(word_dictionary))
        for words in word_dictionary:
            for j in range(len(list_documents)):
                if words in list_documents[j] and words in word_dictionary:
                    index = word_dictionary.index(words)
                    matrix_fq[index] += 1
        
    
    return  matrix_fq

In [92]:
sentences = ['The cat sat on the mat.',
             'The dog chased the cat.',
             'CS224n at Stanford is the best NLP class you can ever take!']

frequency(sentences)

array([27.,  2.,  1.,  1.,  1.,  0.,  0.,  0.,  1.,  1.,  0.,  0.,  0.,
        0.,  0.,  0.,  3.,  0.,  1.,  1.,  0.,  1.,  1.,  1.,  1.,  1.])

### 4- TF-IDF
Data una lista de textos, devolver una matriz con la representacion TFIDF

In [30]:
# Based on https://hackernoon.com/document-term-matrix-in-nlp-count-and-tf-idf-scores-explained
from sklearn.feature_extraction.text import TfidfVectorizer

text = ["You don’t want to waste your time. If you’re going to put aside the time and energy needed to learn new programming languages, you want to make sure, without a doubt, that the ones you choose are the most in-demand programming languages on the market. "]

vectorizer = TfidfVectorizer(stop_words='english', smooth_idf=True)

input_matrix = vectorizer.fit_transform(text).todense()
print(input_matrix)

[[0.1796053 0.1796053 0.1796053 0.1796053 0.1796053 0.1796053 0.1796053
  0.3592106 0.1796053 0.1796053 0.1796053 0.1796053 0.1796053 0.1796053
  0.3592106 0.1796053 0.3592106 0.3592106 0.1796053]]


In [90]:
# Taken from ChatGTP
# Prompt : TF-IDF numpy - python

import numpy as np
from collections import Counter

# List of documents
documents = [
   "You don’t want to waste your time. If you’re going to put aside the time and energy needed to learn new programming languages, you want to make sure, without a doubt, that the ones you choose are the most in-demand programming languages on the market. "
]

# Tokenization and preprocessing
def preprocess(text):
    text = text.lower()  # Convert to lowercase
    words = text.split()  # Split text into words
    return words

tokenized_documents = [preprocess(doc) for doc in documents]

# Calculate Term Frequency (TF)
def calculate_tf(document):
    word_counts = Counter(document)
    total_words = len(document)
    tf = {word: count / total_words for word, count in word_counts.items()}
    return tf

# Calculate Document Frequency (DF) for each word
df = {}
for document in tokenized_documents:
    for word in set(document):
        df[word] = df.get(word, 0) + 1

# Calculate Inverse Document Frequency (IDF)
num_documents = len(tokenized_documents)
idf = {word: np.log(num_documents / (df + 1)) for word, df in df.items()}

# Create a matrix to store TF-IDF values
tfidf_matrix = np.zeros((len(documents), len(idf)))

# Calculate TF-IDF values for each document
for i, document in enumerate(tokenized_documents):
    tf = calculate_tf(document)
    for j, word in enumerate(idf.keys()):
        tfidf_matrix[i, j] = tf.get(word, 0) * idf[word]

# Print the TF-IDF matrix
print(tfidf_matrix)


[[-0.01540327 -0.01540327 -0.01540327 -0.01540327 -0.01540327 -0.01540327
  -0.01540327 -0.01540327 -0.06161308 -0.01540327 -0.01540327 -0.01540327
  -0.01540327 -0.01540327 -0.03080654 -0.01540327 -0.01540327 -0.01540327
  -0.03080654 -0.06161308 -0.01540327 -0.01540327 -0.01540327 -0.01540327
  -0.01540327 -0.04620981 -0.01540327 -0.01540327 -0.01540327 -0.01540327
  -0.01540327 -0.01540327 -0.01540327 -0.01540327 -0.01540327]]


### 5 - Comparación de documentos
Realizar una funcion que reciba el corpus y el índice de un documento y devuelva los documentos ordenados por la similitud coseno

In [44]:
# Import libraries
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

document_a='The benefits of regular exercise for maintaining good health.'
document_b= 'Tips for a healthy diet and nutrition.'
document_c= 'The importance of proper sleep for overall well-being.'

# Create the documen term matrix
corpus = [document_a, document_b, document_c]
count_vectorizer = CountVectorizer(stop_words='english')
count_vectorizer = CountVectorizer()

# Convert the matrix to dataframe
sparse_matrix = count_vectorizer.fit_transform(corpus)
doc_term_matrix = sparse_matrix.todense()
df = pd.DataFrame(doc_term_matrix, 
                  columns=count_vectorizer.get_feature_names_out(), 
                  index=['document_a', 'docuement_b', 'document_'])

df

# Computing cosene similarity
print(cosine_similarity(df, df))


[[1.         0.13608276 0.33333333]
 [0.13608276 1.         0.13608276]
 [0.33333333 0.13608276 1.        ]]


In [95]:
# Cosine similarity
# Taken from Chat GPT
import math
from collections import Counter

# Function to calculate the dot product of two vectors
def dot_product(vector1, vector2):
    return sum(x * y for x, y in zip(vector1, vector2))

# Function to calculate the magnitude (Euclidean norm) of a vector
def magnitude(vector):
    return math.sqrt(sum(x ** 2 for x in vector))

# Function to calculate cosine similarity
def cosine_similarity(vector1, vector2):
    dot = dot_product(vector1, vector2)
    mag1 = magnitude(vector1)
    mag2 = magnitude(vector2)
    if mag1 == 0 or mag2 == 0:
        return 0  # Avoid division by zero
    return dot / (mag1 * mag2)

# Corpus of documents (represented as lists of words)

document_a='The benefits of regular exercise for maintaining good health.'
document_b= 'Tips for a healthy diet and nutrition.'
document_c= 'The importance of proper sleep for overall well-being.'

# Create the documen term matrix
corpus = [document_a, document_b, document_c]

# Calculate the TF (Term Frequency) vector for each document in the corpus
tf_vectors = []
for doc in corpus:
    word_count = Counter(doc)
    tf_vector = [word_count[word] / len(doc) for word in doc]
    tf_vectors.append(tf_vector)

# Calculate the IDF (Inverse Document Frequency) vector for the corpus
idf_vector = []
for word in corpus[0]:
    doc_count = sum(1 for doc in corpus if word in doc)
    idf = math.log(len(corpus) / (1 + doc_count))
    idf_vector.append(idf)

# Calculate the TF-IDF vector for each document in the corpus
tfidf_vectors = []
for tf_vector in tf_vectors:
    tfidf_vector = [tf * idf for tf, idf in zip(tf_vector, idf_vector)]
    tfidf_vectors.append(tfidf_vector)

# Calculate the cosine similarity between all pairs of documents
cosine_similarities = []
for i in range(len(tfidf_vectors)):
    similarity_row = []
    for j in range(len(tfidf_vectors)):
        similarity = cosine_similarity(tfidf_vectors[i], tfidf_vectors[j])
        similarity_row.append(similarity)
    cosine_similarities.append(similarity_row)

# Print the cosine similarity matrix
for row in cosine_similarities:
    print(row)

[1.0, 0.6251670018737686, 0.7606283051410277]
[0.6251670018737686, 1.0, 0.6969603770188317]
[0.7606283051410277, 0.6969603770188317, 1.0]
