# Import library

In [1]:
import re
import nltk
import numpy as np
import warnings
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import pipeline

# Suppress warnings
warnings.filterwarnings("ignore", message="The secret `HF_TOKEN` does not exist in your Colab secrets.")

# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# Initialize the text generation pipeline with the desired model
generator = pipeline('text-generation', model='gpt2')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


# Preprocesing Data

In [3]:
def clean_text(text):
    # Remove non-alphabetic characters
    text = re.sub(r"[^a-zA-Z\s]", "", text)

    # Normalize text
    text = text.lower()

    # Tokenize text
    words = text.split()

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]

    # Remove single characters
    words = [word for word in words if len(word) > 1]

    # Remove stopwords
    words = [word for word in words if word not in stopwords.words("english")]

    unique_words = set(words)
    return ' '.join(unique_words)

# TF-IDF using sklearn

In [4]:
def compute_tfidf_sklearn(documents):
    # Initialize TF-IDF vectorizer
    tfidf_vectorizer = TfidfVectorizer()

    # Compute TF-IDF matrix
    tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

    # Get normalized TF-IDF matrix
    tfidf_normalized_matrix = tfidf_matrix.toarray() / np.linalg.norm(tfidf_matrix.toarray(), axis=1, ord=2, keepdims=True)

    return tfidf_normalized_matrix

# TF-IDF from scratch

In [5]:
def compute_tfidf_from_scratch(documents):
    # Preprocess the documents and get unique words
    processed_documents = [clean_text(doc).split() for doc in documents]
    unique_words = set(word for doc in processed_documents for word in doc)

    # Calculate TF for each word in all documents
    tf_matrix_scratch = np.zeros((len(documents), len(unique_words)))

    for i, doc in enumerate(processed_documents):
        word_counts = {word: doc.count(word) for word in doc}
        for j, word in enumerate(unique_words):
            tf_matrix_scratch[i, j] = np.log(1 + word_counts.get(word, 0))

    # Calculate IDF for each word
    idf_matrix_scratch = np.zeros(len(unique_words))

    for i, word in enumerate(unique_words):
        num_docs_containing_word = sum(1 for doc in processed_documents if word in doc)
        idf_matrix_scratch[i] = np.log(len(documents) / (1 + num_docs_containing_word))

    # Calculate TF-IDF for each word
    tfidf_matrix_scratch = tf_matrix_scratch * idf_matrix_scratch

    # Normalize TF-IDF for each document
    normalized_tfidf_matrix_scratch = tfidf_matrix_scratch / np.linalg.norm(tfidf_matrix_scratch, axis=1, keepdims=True)

    return normalized_tfidf_matrix_scratch

#Generate and clean Text

In [6]:
def generate_and_clean_text(prompt):
    generated_text = generator(prompt, max_length=500,  num_return_sequences=1)[0]['generated_text']
    cleaned_text = clean_text(generated_text)
    return cleaned_text

# Unique words for each documnet

In [7]:
# Generate and clean text for each prompt
prompt1 = "Machine learning"
prompt2 = "Football is a good sport"
prompt3 = "Medications are used...."

cleaned_text1 = generate_and_clean_text(prompt1)
cleaned_text2 = generate_and_clean_text(prompt2)
cleaned_text3 = generate_and_clean_text(prompt3)

# print the unique words for each document and number of unique words in each document
print("Unique Words:")
print(f"Document 1: {cleaned_text1}")
print(f"Number of Unique Words in Document 1: {len(cleaned_text1.split())}")
print("-" * 50)
print(f"Document 2: {cleaned_text2}")
print(f"Number of Unique Words in Document 2: {len(cleaned_text2.split())}")
print("-" * 50)
print(f"Document 3: {cleaned_text3}")
print(f"Number of Unique Words in Document 3: {len(cleaned_text3.split())}")

# Create a list of documents
documents = [cleaned_text1, cleaned_text2, cleaned_text3]

# Compute TF-IDF matrix using sklearn
tfidf_matrix_sklearn = compute_tfidf_sklearn(documents)

# Compute TF-IDF matrix from scratch
tfidf_matrix_scratch = compute_tfidf_from_scratch(documents)

# Compute cosine similarity using sklearn TF-IDF matrix
cosine_similarity_sklearn = cosine_similarity(tfidf_matrix_sklearn)

# Compute cosine similarity using TF-IDF matrix from scratch
cosine_similarity_scratch = cosine_similarity(tfidf_matrix_scratch)


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Unique Words:
Document 1: declaration going ha good set course normal compiler structure build would taken learning join really create realize machine extend feature written say everything simple signature give system definition use point per look century around make couple finally first programmer data started diagram new framework object member method regular difference result way process fact number doe convert important pretty seen immutable people thing string language library building different well python basic creating help dont clean two readable note small get let take class perfectly define expression argument provided created want know background base added return next example show top like converted called powerful concise similar im current type quite ccli objectivec program variable evaluate mentioned work code made function also probably may easy
Number of Unique Words in Document 1: 122
--------------------------------------------------
Document 2: fan going always clu

# Compare outputs

In [8]:
# Output cosine similarity matrices
print("Cosine Similarity (Sklearn):")
print(cosine_similarity_sklearn)
print("\n")

print("Cosine Similarity (From Scratch):")
print(cosine_similarity_scratch)

# Error between the two cosine similarity matrices
error = np.mean(np.abs(cosine_similarity_sklearn - cosine_similarity_scratch))
print("Mean Absolute Error between Sklearn and From Scratch:", error)

Cosine Similarity (Sklearn):
[[1.         0.11650957 0.0715767 ]
 [0.11650957 1.         0.0307181 ]
 [0.0715767  0.0307181  1.        ]]


Cosine Similarity (From Scratch):
[[1.         0.02530586 0.02693987]
 [0.02530586 1.         0.02424565]
 [0.02693987 0.02424565 1.        ]]
Mean Absolute Error between Sklearn and From Scratch: 0.03162510644422607
