In [169]:
import numpy as np
#import nltk
import math
from nltk.corpus import stopwords

documents = np.array([
    ["El calzado en el ecuador se comercializa por temporada"],
    ["La temporada de calzado de la costa es diferente a la temporada de calzado de la sierra"],
    ["El calzado en ecuador es bueno"]
])


In [170]:
def calculatePreprocessedDocuments(documents):
    #nltk.download('stopwords')
    # Define a list of stopwords in Spanish
    stop_words = set(stopwords.words('spanish'))

    # Preprocess the documents
    preprocessed_documents = []
    for doc in documents:
        # Tokenize the sentence and remove stopwords
        tokens = doc[0].split()
        filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
        # Join the remaining words to reconstruct the preprocessed sentence
        preprocessed_sentence = ' '.join(filtered_tokens)
        preprocessed_documents.append(preprocessed_sentence)
    preprocessed_documents = np.array(preprocessed_documents)
    return preprocessed_documents

In [171]:
def calculateUniqueWords(preprocessed_documents):
    unique_words = []
    for doc in preprocessed_documents:
        tokens = doc.split()
        for token in tokens:
            has_repeat = False
            for unique in unique_words:
                if unique == token:
                    has_repeat = True
            if has_repeat == False:
                unique_words.append(token)

    unique_words = np.array(unique_words)
    return unique_words

In [172]:
def countWordsInString(word, document):
    words = document.split()
    word_count = words.count(word)    
    return word_count

In [173]:
def calculateTk(unique_words,  preprocessed_documents):
    n = unique_words.shape[0]
    m = preprocessed_documents.shape[0]
    tk = np.zeros((n, m))

    for i in range(len(unique_words)):
        sumnk = 0
        unique_word = unique_words[i]
        for j in range(len(preprocessed_documents)):
            document = preprocessed_documents[j]
            document_contains_words = countWordsInString(unique_word, document)
            tk[i][j] = document_contains_words
            sumnk += document_contains_words
    return tk  

In [174]:
def calculateTkMaxFrecuency(tk):
    maxValues = np.zeros(tk.shape[1])
    for i in range(tk.shape[1]):
        max = 0
        for j in range(tk.shape[0]):
            value = tk[j][i]
            if(value > max):
                max = value
        maxValues[i] = max
    return maxValues

In [175]:
def calculateTf(tk):
    tf = np.zeros_like(tk, dtype=np.float64)
    tkMaxFrecuency = calculateTkMaxFrecuency(tk)
    for i in range(tk.shape[0]):
        for j in range(tk.shape[1]):
            tf[i][j] = tk[i][j] / tkMaxFrecuency[j]
    return tf
    

In [176]:
def calculateIDF(tk):
    idf = np.zeros(tk.shape[0], dtype=np.float64)
    N = tk.shape[1]
    for i in range(tk.shape[0]):
        sum = 0
        for j in range(tk.shape[1]):
            sum += tk[i][j]
        idf[i] = math.log(N/sum, 10)
    return idf


In [177]:
def calculateTf_IDF(tf, idf):
    similitudeMatrix = np.zeros_like(tf, dtype=np.float64)
    for i in range(tf.shape[0]):
        for j in range(tf.shape[1]):
            similitudeMatrix[i][j] = tf[i][j] * idf[i]
    return similitudeMatrix

In [180]:
def normalizeTF_IDF(tf_idf):
    normalized_tf_idf = np.zeros_like(tf_idf, dtype=np.float64)
    denominador = np.zeros(tf_idf.shape[1], dtype=np.float64)
    for i in range(tf_idf.shape[1]):
        sum = 0
        for j in range(tf_idf.shape[0]):
            sum += math.pow(tf_idf[j][i],2)
        denominador[i] = math.sqrt(sum)
    for i in range(tf_idf.shape[0]):
        for j in range(tf_idf.shape[1]):
            normalized_tf_idf[i][j] = tf_idf[i][j] / denominador[j]
    return normalized_tf_idf
    

In [181]:
preprocessed_documents = calculatePreprocessedDocuments(documents)
unique_words = calculateUniqueWords(preprocessed_documents)
tk = calculateTk(unique_words, preprocessed_documents)
idf = calculateIDF(tk)
tf = calculateTf(tk)
tf_idf = calculateTf_IDF(tf, idf)
normalized_tf_idf = normalizeTF_IDF(tf_idf)
print(normalized_tf_idf)

[[-0.23856893 -0.28942786 -0.23856893]
 [ 0.33624402  0.          0.33624402]
 [ 0.91105698  0.          0.        ]
 [ 0.          0.          0.        ]
 [ 0.          0.55263958  0.        ]
 [ 0.          0.55263958  0.        ]
 [ 0.          0.55263958  0.        ]
 [ 0.          0.          0.91105698]]


In [182]:
def cosine_similarity(ratings):
    # Calculate the similarity between items using cosine similarity
    item_similarity = np.dot(ratings.T, ratings)  # Transpose the matrix and compute dot product

    # Normalize the item similarity
    item_norms = np.linalg.norm(ratings, axis=0)
    item_similarity = item_similarity/(item_norms[:,np.newaxis]*item_norms)    
    return item_similarity

In [183]:
similitudeMatrix = cosine_similarity(normalized_tf_idf)
print(similitudeMatrix)

[[1.         0.06904849 0.16997518]
 [0.06904849 1.         0.06904849]
 [0.16997518 0.06904849 1.        ]]
