In [1]:
#import libraries
import numpy as np
import sys
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import normalize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
import nltk
from transformers import pipeline

# Download necessary NLTK datasets
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

# **Document generation**

In [2]:
generator = pipeline('text-generation', model ='EleutherAI/gpt-neo-1.3B')
prompt1 = "Advancements in technology have revolutionized the way humans interact with the world, bringing both benefits and challenges."
output1 = generator(prompt1, max_length=300, do_sample=True, temperature=0.9)
prompt2 = "Social media platforms, while offering connectivity and information sharing, also present significant risks to personal privacy and mental well-being."
output2 = generator(prompt2, max_length=350, do_sample=False, temperature=0.7)
prompt3 = "The impact of the industrial revolution on humanity has been profound, shaping societies and economies in ways both beneficial and detrimental."
output3 = generator(prompt3, max_length=450, do_sample=True, temperature=0.9)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.35k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/5.31G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [3]:
with open('technology.txt', 'w') as f:
    f.write(str(output1))

with open('media.txt', 'w') as f:
    f.write(str(output2))

with open('industrial.txt', 'w') as f:
    f.write(str(output3))

In [4]:
import os
docs_path = "/content"
docs = []
for filename in os.listdir(docs_path):
    if filename.endswith(".txt"):
        with open(os.path.join(docs_path, filename), "r")  as file:
            document = file.read()
            docs.append(document)

# **Preprocessing**

In [5]:
unique_words = []
for document in docs:
 #data cleaning
 document = re.sub(r'[^a-zA-Z\s]', ' ', document)
 #lower case + remove spaces
 document = document.lower().strip()
 #tokenization
 tokens = word_tokenize(document)
 #lemmatization
 lemmatizes = [WordNetLemmatizer().lemmatize(token) for token in tokens]
 #remove stop words
 stop_words = stopwords.words('english')
 filtered_tokens = [token for token in lemmatizes if token not in stop_words and len(token) > 3]
 filter_token = []
 filter_token.extend(filtered_tokens)
 doc_unique_words = set(filter_token)
 doc_unique_string = " ".join(doc_unique_words)
 unique_words.append(doc_unique_string)
print(unique_words)





['look advancement greater university learn environmental california analysis southern published funding make associate search access summary technology advance challenge first public soon civil main trial design help read decision world video clinicaltrials statistic provided need professor nwhat informed reading website disease titled text nthe choice browse mobility mccaw making research available find revolutionized investigator prevention center select sponsor hope link robert download control section clinical interact increased engineering department bringing easy report entertainment result data short next benefit human information statistical communication generated health study discus', 'individual identify proliferation also connectivity privacy party significant social mental well text shared track nthe personal movement information present risk sharing generated third platform often offering medium activity used', 'shoe greater beneficial significant year environmental fert

In [6]:
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(unique_words)
features = tfidf_vectorizer.get_feature_names_out()

In [7]:

np.set_printoptions(threshold=sys.maxsize)
print(features)
print(tfidf_matrix.toarray())

['access' 'activity' 'advance' 'advancement' 'also' 'analysis' 'apparent'
 'appear' 'aspect' 'associate' 'attempt' 'available' 'awareness' 'balance'
 'became' 'began' 'beneficial' 'benefit' 'benefited' 'better' 'bowl'
 'bring' 'bringing' 'britain' 'broader' 'browse' 'california' 'came'
 'center' 'century' 'challenge' 'change' 'chemical' 'choice' 'civil'
 'clinical' 'clinicaltrials' 'clothing' 'communication' 'connectivity'
 'construction' 'control' 'country' 'creation' 'crisis' 'curb' 'data'
 'decade' 'decision' 'department' 'design' 'detrimental' 'different'
 'direct' 'discus' 'disease' 'distribution' 'diversity' 'download' 'dust'
 'early' 'easy' 'economic' 'economy' 'effect' 'efficiency' 'electricity'
 'energy' 'engineering' 'entertainment' 'environment' 'environmental'
 'europe' 'example' 'fertiliser' 'find' 'first' 'followed' 'form'
 'funding' 'generated' 'good' 'greater' 'greatly' 'growing' 'half'
 'health' 'help' 'hope' 'however' 'human' 'humanity' 'identify' 'impact'
 'improved'

In [8]:
#calculate tf
tf_vectorizer = CountVectorizer()
tf_matrix = tf_vectorizer.fit_transform(unique_words)
print(tf_matrix.toarray())

[[1 0 1 1 0 1 0 0 0 1 0 1 0 0 0 0 0 1 0 0 0 0 1 0 0 1 1 0 1 0 1 0 0 1 1 1
  1 0 1 0 0 1 0 0 0 0 1 0 1 1 1 0 0 0 1 1 0 0 1 0 0 1 0 0 0 0 0 0 1 1 0 1
  0 0 0 1 1 0 0 1 1 0 1 0 0 0 1 1 1 0 1 0 0 0 0 0 1 0 0 1 1 1 0 0 0 1 0 0
  1 1 1 0 1 0 1 1 0 0 0 1 0 0 1 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0
  0 0 0 1 0 0 0 1 1 1 1 1 0 1 1 1 0 0 1 0 1 0 1 0 1 1 0 0 0 0 0 0 1 0 0 0
  0 1 1 1 0 1 1 0 1 0 1 0 1 0 0 1 0 0 0 1 0 0 1 0 0 1 0 0 0 1 1 0 0 0 0 1
  0]
 [0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 1 0 0 0 1 0 1 1 0 0 0 0 1 1 0 1 1 0 1 0
  0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 0 1 0 1
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 1 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0
  0]
 [0 0 0 0 1 0 1 1 1 0 1 0 1 1 1 1 1 0 1 1 1 1 0 1 1 0 0 1 0 1 0 1 1 0 0 0
  0 1 0 0 1 0 1 1 1 1 0 1 0 

In [25]:
#calculate idf
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True)
tfidf_transformer.fit(tf_matrix)
df_idf = pd.DataFrame(tfidf_transformer.idf_, index=tf_vectorizer.get_feature_names_out(),columns=["idf_weights"])
idf=tfidf_transformer.idf_
print(df_idf)

             idf_weights
access          1.693147
activity        1.693147
advance         1.693147
advancement     1.693147
also            1.287682
...                  ...
whole           1.693147
wide            1.693147
working         1.693147
world           1.287682
year            1.693147

[217 rows x 1 columns]


In [26]:
#multiply tf and idf
tfidf_matrix2 = np.multiply(tf_matrix.toarray(), idf)
#normalization
tfidf_normalized = normalize(tfidf_matrix2)

In [27]:
print(tfidf_normalized)

[[0.11162767 0.         0.11162767 0.11162767 0.         0.11162767
  0.         0.         0.         0.11162767 0.         0.11162767
  0.         0.         0.         0.         0.         0.11162767
  0.         0.         0.         0.         0.11162767 0.
  0.         0.11162767 0.11162767 0.         0.11162767 0.
  0.11162767 0.         0.         0.11162767 0.11162767 0.11162767
  0.11162767 0.         0.11162767 0.         0.         0.11162767
  0.         0.         0.         0.         0.11162767 0.
  0.11162767 0.11162767 0.11162767 0.         0.         0.
  0.11162767 0.11162767 0.         0.         0.11162767 0.
  0.         0.11162767 0.         0.         0.         0.
  0.         0.         0.11162767 0.11162767 0.         0.08489572
  0.         0.         0.         0.11162767 0.08489572 0.
  0.         0.11162767 0.0659291  0.         0.08489572 0.
  0.         0.         0.11162767 0.11162767 0.11162767 0.
  0.08489572 0.         0.         0.         0.    

In [11]:
#tfidf from scratch
class TFIDF:
    def __init__(self):
        self.idf_ = {}
        self.vocab_ = {}

    def fit_transform(self, documents):
        tf = []
        doc_count = len(documents)

        
        for document in documents:
            doc_tf = {}
            words = document.split()
            for word in words:
                doc_tf[word] = doc_tf.get(word, 0) + 1
            for word in doc_tf:
                doc_tf[word] = doc_tf[word] / len(words)
                self.idf_[word] = self.idf_.get(word, 0) + 1
            tf.append(doc_tf)

        
        sorted_vocab = sorted(self.idf_.keys())
        self.vocab_ = {word: idx for idx, word in enumerate(sorted_vocab)}

        
        for word in self.idf_:
            self.idf_[word] = np.log((1 + doc_count) / (1 + self.idf_[word])) + 1

        
        tfidf = []
        for doc in tf:
            doc_tfidf = np.zeros(len(self.vocab_))
            for word, value in doc.items():
                if word in self.vocab_:
                    index = self.vocab_[word]
                    doc_tfidf[index] = value * self.idf_[word]
            
            norm = np.linalg.norm(doc_tfidf)
            if norm > 0:
                doc_tfidf = doc_tfidf / norm
            tfidf.append(doc_tfidf)

        return np.array(tfidf)


In [12]:
custom_tfidf = TFIDF()
custom_tfidf_matrix = custom_tfidf.fit_transform(unique_words)

In [13]:
print("Custom TF-IDF vs. sklearn TF-IDF")
print("Custom:", custom_tfidf_matrix)
print("sklearn:", tfidf_matrix.toarray())

Custom TF-IDF vs. sklearn TF-IDF
Custom: [[0.11162767 0.         0.11162767 0.11162767 0.         0.11162767
  0.         0.         0.         0.11162767 0.         0.11162767
  0.         0.         0.         0.         0.         0.11162767
  0.         0.         0.         0.         0.11162767 0.
  0.         0.11162767 0.11162767 0.         0.11162767 0.
  0.11162767 0.         0.         0.11162767 0.11162767 0.11162767
  0.11162767 0.         0.11162767 0.         0.         0.11162767
  0.         0.         0.         0.         0.11162767 0.
  0.11162767 0.11162767 0.11162767 0.         0.         0.
  0.11162767 0.11162767 0.         0.         0.11162767 0.
  0.         0.11162767 0.         0.         0.         0.
  0.         0.         0.11162767 0.11162767 0.         0.08489572
  0.         0.         0.         0.11162767 0.08489572 0.
  0.         0.11162767 0.0659291  0.         0.08489572 0.
  0.         0.         0.11162767 0.11162767 0.11162767 0.
  0.0848957