In [1]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords
import string
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

In [2]:
# Initialize NLTK resources
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
# Load pre-trained GPT-2 model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [4]:
def generate_text(topic, length=100):
    input_text = "According to Wikipedia, " + topic + " is "
    input_ids = tokenizer.encode(input_text, return_tensors="pt")

    # Generate text
    output = model.generate(input_ids, max_length=length, num_return_sequences=1,
                            early_stopping=True, no_repeat_ngram_size=2,
                            temperature=0.9, top_k=50, top_p=0.95)

    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    return generated_text


In [5]:
import string
from nltk.corpus import stopwords

# Define stop words
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # Tokenize text
    tokens = word_tokenize(text)

    # Remove punctuation and stop words
    tokens = [word for word in tokens if word not in string.punctuation and word.lower() not in stop_words]

    # Lemmatization and stemming
    lemmatizer = WordNetLemmatizer()
    stemmer = PorterStemmer()
    tokens = [lemmatizer.lemmatize(stemmer.stem(word)) for word in tokens]

    return ' '.join(tokens)

# TF_IDF (Basic Rules)

In [6]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

def calculate_tfidf_from_scratch_2(documents):
    from sklearn.feature_extraction.text import CountVectorizer
     # Preprocess the documents
    preprocessed_documents = [preprocess_text(doc) for doc in documents]

    # Tokenize text using CountVectorizer tokenizer
    count_vectorizer = CountVectorizer()
    count_matrix = count_vectorizer.fit_transform(preprocessed_documents)  # Use preprocessed documents here

    # Extract feature names
    feature_names = count_vectorizer.get_feature_names_out()

    # Calculate Term Frequency (TF)
    tf_matrix = count_matrix.toarray()



    # Get the total number of words in each document
    total_words_per_document = np.sum(tf_matrix, axis=1, keepdims=True)

    # Normalize TF by the total number of words in each document
    tf_matrix_normalized = tf_matrix / total_words_per_document

    # Calculate Inverse Document Frequency (IDF)
    num_documents = len(documents)
    doc_frequency = np.sum(tf_matrix > 0, axis=0)
    idf_matrix = np.log((1 + num_documents) / (1 + doc_frequency)) + 1

    # Calculate TF-IDF
    tfidf_matrix = tf_matrix * idf_matrix

    # Normalize TF-IDF scores
    tfidf_matrix_normalized = tfidf_matrix / np.linalg.norm(tfidf_matrix, axis=1, keepdims=True)

    # Collect TF-IDF scores for all documents
    tfidf_scores_all = []
    for i in range(len(documents)):
        tfidf_scores = [(feature_names[j], tfidf_matrix[i, j]) for j in range(len(feature_names))]
        tfidf_scores.sort(key=lambda x: x[1], reverse=True)
        tfidf_scores_all.append(tfidf_scores)

    return tf_matrix_normalized, idf_matrix, tfidf_matrix, tfidf_matrix_normalized, tfidf_scores_all, feature_names

#TD_IDF(From Scratch)

In [7]:
def calculate_tfidf_from_scratch(documents):
    from sklearn.feature_extraction.text import CountVectorizer
     # Preprocess the documents
    preprocessed_documents = [preprocess_text(doc) for doc in documents]


    # Tokenize text using CountVectorizer tokenizer
    count_vectorizer = CountVectorizer()
    count_matrix = count_vectorizer.fit_transform(preprocessed_documents)  # Use preprocessed documents here

    # Extract feature names
    feature_names = count_vectorizer.get_feature_names_out()

    # Calculate Term Frequency (TF)
    tf_matrix = count_matrix.toarray()

    # Calculate Inverse Document Frequency (IDF)
    num_documents = len(documents)
    doc_frequency = np.sum(tf_matrix > 0, axis=0)
    idf_matrix = np.log((1 + num_documents) / (1 + doc_frequency)) + 1

    # Calculate TF-IDF
    tfidf_matrix = tf_matrix * idf_matrix

    # Normalize TF-IDF scores
    tfidf_matrix /= np.linalg.norm(tfidf_matrix, axis=1, keepdims=True)

    # Collect TF-IDF scores for all documents
    tfidf_scores_all = []
    for i in range(len(documents)):
        tfidf_scores = [(feature_names[j], tfidf_matrix[i, j]) for j in range(len(feature_names))]
        tfidf_scores.sort(key=lambda x: x[1], reverse=True)
        tfidf_scores_all.append(tfidf_scores)

    return tfidf_scores_all, feature_names

#TD_IDF(Built_in)

In [8]:
def calculate_tfidf_builtin(documents):
    from sklearn.feature_extraction.text import TfidfVectorizer

    # Preprocess the documents
    preprocessed_documents = [preprocess_text(doc) for doc in documents]

    # Calculate TF-IDF scores using scikit-learn with stop words removed
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(preprocessed_documents)
    feature_names = tfidf_vectorizer.get_feature_names_out()
    dense = tfidf_matrix.todense()

    # Collect TF-IDF scores for all documents
    tfidf_scores_all = []
    for i in range(len(documents)):
        doc_tfidf = dense[i].tolist()[0]
        tfidf_scores = [(feature_names[j], doc_tfidf[j]) for j in range(len(feature_names))]
        tfidf_scores.sort(key=lambda x: x[1], reverse=True)
        tfidf_scores_all.append(tfidf_scores)

    return tfidf_scores_all, feature_names

In [9]:
if __name__ == "__main__":
    # Take user input for the topic
    topic = input("Enter a topic: ")

    # Generate text based on the topic
    generated_text = generate_text(topic)

    print("\nGenerated text:")
    print(generated_text)


    # score, name, preprocessed_documents = calculate_tfidf_from_scratch([generated_text])  # Pass document1 as a list
    # print(preprocessed_documents)


    # Take user input for the number of documents
    num_documents = int(input("Enter the number of other documents: "))

    # Collect documents
    documents = []
    for i in range(num_documents):
        document = input(f"Enter document {i+1}: ")
        another_text = generate_text(document)
        print(another_text)
        print("------------------------------------------------------------------------------------------------------------")
        documents.append(another_text)


    print([generated_text]+documents)




    # Calculate TF-IDF scores using custom implementation
    tfidf_scores_scratch, feature_names_scratch = calculate_tfidf_from_scratch([generated_text]+documents)

    # Calculate TF-IDF scores using built-in implementation
    tfidf_scores_builtin, feature_names_builtin = calculate_tfidf_builtin([generated_text]+documents)




    # Check if feature names match
    # Check if feature names match
    assert set(feature_names_scratch) == set(feature_names_builtin)

    # Print TF-IDF scores
    for i in range(len(tfidf_scores_scratch)):
        print(f"Document {i+1} TF-IDF Scores:")
        print("From Scratch:")
        print(tfidf_scores_scratch[i][:10])  # Print top 10 TF-IDF scores from scratch
        print("Built-in:")
        print(tfidf_scores_builtin[i][:10])  # Print top 10 TF-IDF scores from built-in
        print()

    tf, idf, tfidf, tfidf_normalized, tfidf_scores, feature_names = calculate_tfidf_from_scratch_2([generated_text]+documents)

    print("Term Frequency (TF) Matrix:")
    print(tf)
    print("\nInverse Document Frequency (IDF) Matrix:")
    print(idf)
    print("\nTF-IDF Matrix:")
    print(tfidf)
    print("\nNormalized TF-IDF Matrix:")
    print(tfidf_normalized)

Enter a topic: Artificial Intelligence


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Generated text:
According to Wikipedia, Artificial Intelligence is  a term used to describe the ability to create a computer program that can perform tasks that are not possible in real life.
The term "AI" is used in the same way as "human intelligence" and "machine intelligence".
In the article "The Future of Artificial intelligence", the author states that "the future of AI is not yet clear, but it is likely to be very exciting."
This is a very interesting statement. The future is
Enter the number of other documents: 1
Enter document 1: Machine Learning


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


According to Wikipedia, Machine Learning is  a type of machine learning that is used to predict the future. Machine learning is a type that can be used for many different things. It is also used in many other fields such as machine translation, machine language, and machine intelligence.
Machine Learning
The term "machine learning" is often used as a synonym for machine vision. The term is derived from the Greek word "mosaic", which means "to learn". The word is sometimes
------------------------------------------------------------------------------------------------------------
['According to Wikipedia, Artificial Intelligence is \xa0a term used to describe the ability to create a computer program that can perform tasks that are not possible in real life.\nThe term "AI" is used in the same way as "human intelligence" and "machine intelligence".\nIn the article "The Future of Artificial intelligence", the author states that "the future of AI is not yet clear, but it is likely to be ver