In [None]:
import os
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from transformers import pipeline
import numpy as np
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
import nltk

# Download the NLTK resources used by the preprocessing step.
# 'punkt_tab' is requested in addition to 'punkt' because recent NLTK
# releases (3.9 and later) load the word_tokenize models from 'punkt_tab'
# and raise LookupError when only the older pickled 'punkt' package is
# present; 'punkt' is kept so older NLTK versions keep working.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)




In [None]:

# Preprocessing function
def preprocess_document(document):
    """Normalise raw text into a space-separated string of lemmatised tokens.

    The text is lower-cased, every character that is neither an ASCII letter
    nor whitespace is replaced by a space, and the result is tokenised with
    ``word_tokenize``. A token is kept only if neither the token itself nor
    its WordNet lemma is an English stop word; the lemma is what is emitted.

    Args:
        document: Raw text to clean (``str``).

    Returns:
        str: The surviving lemmas joined by single spaces (empty string when
        nothing survives).
    """
    document = document.lower()
    document = re.sub(r'[^a-zA-Z\s]', ' ', document)
    tokens = word_tokenize(document)
    lemmatizer = WordNetLemmatizer()

    # Build the stop-word set once. Calling stopwords.words('english') inside
    # the per-token loop re-reads the list and scans it linearly for every
    # token, which is needlessly slow on long documents.
    stop_words = set(stopwords.words('english'))

    filtered_tokens = []
    for word in tokens:
        if word in stop_words:
            continue
        lemma = lemmatizer.lemmatize(word)
        # A lemma can itself be a stop word even when the surface form is not.
        if lemma not in stop_words:
            filtered_tokens.append(lemma)
    return ' '.join(filtered_tokens)


In [None]:
class CustomTFIDF:
    """Small hand-written TF-IDF vectoriser.

    Documents are tokenised by whitespace (``str.split()``). Term frequency
    is the in-document count divided by the document length, inverse
    document frequency is ``ln((1 + N) / (1 + df)) + 1`` with ``N`` the
    number of documents and ``df`` the number of documents containing the
    term, and every document vector is L2-normalised.

    Attributes set by ``fit_transform``:
        idf_: dict mapping each term to its IDF weight.
        vocab_: dict mapping each term to its column index; terms are
            indexed in sorted order.
    """

    def __init__(self):
        # term -> IDF weight (populated by fit_transform)
        self.idf_ = {}
        # term -> column index in the matrix returned by fit_transform
        self.vocab_ = {}

    def fit_transform(self, documents):
        """Learn vocabulary and IDF from ``documents`` and return their TF-IDF matrix.

        Every call fits from scratch: ``idf_`` and ``vocab_`` are rebuilt, so
        reusing one instance on several corpora (or twice on the same one)
        does not mix document frequencies from earlier calls into the result.

        Args:
            documents: Sized sequence of strings, one per document.

        Returns:
            numpy.ndarray: One L2-normalised row per document, one column per
            vocabulary term. A document without tokens yields an all-zero row.
        """
        doc_count = len(documents)

        # Document frequencies are counted in a local dict rather than in
        # self.idf_, so state left over from a previous fit cannot leak in.
        doc_freq = {}
        term_freqs = []
        for document in documents:
            words = document.split()
            counts = {}
            for word in words:
                counts[word] = counts.get(word, 0) + 1
            doc_tf = {}
            for word, count in counts.items():
                doc_tf[word] = count / len(words)
                doc_freq[word] = doc_freq.get(word, 0) + 1
            term_freqs.append(doc_tf)

        # Sort the vocabulary alphabetically and assign column indices.
        self.vocab_ = {word: idx for idx, word in enumerate(sorted(doc_freq))}

        # Smoothed IDF: ln((1 + N) / (1 + df)) + 1.
        self.idf_ = {
            word: np.log((1 + doc_count) / (1 + df)) + 1
            for word, df in doc_freq.items()
        }

        tfidf = []
        for doc_tf in term_freqs:
            row = np.zeros(len(self.vocab_))
            for word, value in doc_tf.items():
                row[self.vocab_[word]] = value * self.idf_[word]
            # L2 normalisation; all-zero rows are left untouched to avoid 0/0.
            norm = np.linalg.norm(row)
            if norm > 0:
                row = row / norm
            tfidf.append(row)

        return np.array(tfidf)

In [None]:

# Initialize pipeline for text generation.
# The model is selected by its identifier string 'EleutherAI/gpt-neo-1.3B'.
# NOTE(review): presumably the weights are fetched on first use, which would
# make this cell slow on a fresh environment — confirm.
generator = pipeline('text-generation', model ='EleutherAI/gpt-neo-1.3B')

# Define prompts for text generation: each string is passed to `generator`
# as the start of one generated document.
prompts = [
    "The industrial revolution and its consequences have been a disaster for the human race.",
    "Social media is a great inhouse threat for families that could result in long term harm.",
    "The fall of the great umayyad caliphate had negative effects on its territories."
]


In [None]:
# Generate texts based on prompts and save them to output_<n>.txt (n starts at 1)
for i, prompt in enumerate(prompts, 1):
    output = generator(prompt, max_length=100, do_sample=True, temperature=0.9)
    # The text-generation pipeline returns a list of dicts whose text lives
    # under the 'generated_text' key. Writing str(output) would store the
    # Python repr of that list, so the key name, brackets and escaped
    # newlines ("\n" -> the letter n) would end up as tokens in the corpus.
    generated_text = '\n'.join(item['generated_text'] for item in output)
    with open(f'output_{i}.txt', 'w') as f:
        f.write(generated_text)


In [None]:
# Read generated texts from files and preprocess them
docs = []
# os.listdir() returns names in arbitrary order; sorting keeps the mapping
# between a file and its row in the TF-IDF matrices stable across runs.
for filename in sorted(os.listdir('.')):
    if filename.startswith("output_") and filename.endswith(".txt"):
        with open(filename, "r") as file:
            raw_text = file.read()
        # Preprocess after the file is closed; only the cleaned text is kept.
        docs.append(preprocess_document(raw_text))


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Using sklearn for comparison.
# The default token_pattern only keeps tokens of two or more word characters,
# whereas CustomTFIDF keeps every whitespace-separated token. Matching on
# runs of non-whitespace makes both implementations see the same vocabulary,
# so their matrices have the same columns and can be compared directly.
tfidf_vectorizer = TfidfVectorizer(token_pattern=r'\S+')
sklearn_tfidf_matrix = tfidf_vectorizer.fit_transform(docs).toarray()

In [None]:
# Using CustomTFIDF on the same preprocessed `docs` that are given to the
# sklearn vectorizer. The result has one row per document; column positions
# follow custom_tfidf.vocab_.
custom_tfidf = CustomTFIDF()
custom_tfidf_matrix = custom_tfidf.fit_transform(docs)

In [None]:
# Comparing results (simplified, for detailed comparison, iterate over matrices)
# Row index 1 is the second document, so the label says "second" to match
# what is actually printed. reshape(-1, 1) prints the vector as a column.
print("Custom TF-IDF vs. sklearn TF-IDF (second document vector):")
print("Custom:", custom_tfidf_matrix[1].reshape(-1,1))

In [None]:
# Row index 1 (the second document) of the sklearn matrix, reshaped to a
# column vector for printing.
print("sklearn:", sklearn_tfidf_matrix[1].reshape(-1,1))