In [1]:
import re
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [8]:

# Load pre-trained model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Function to generate text for a specific field
def generate_document(prompt, max_length=100, temperature=0.9):
    input_ids = tokenizer.encode(prompt, return_tensors="pt")
    output = model.generate(input_ids, max_length=max_length, temperature=temperature)
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Generate documents for different fields
fields = {
    "technology": "Exploring the latest advancements in technology.",
    "healthcare": "Improving healthcare through innovation and research.",
    "artificial intelligence": "Unraveling the mysteries of artificial intelligence.",
}


generated_documents = {}

for field, prompt in fields.items():
    generated_documents[field] = generate_document(prompt)

# Print generated documents
for field, document in generated_documents.items():
    print(f"--- {field.capitalize()} ---")
    print(document)
    print("\n")


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


--- Technology ---
Exploring the latest advancements in technology.

The new technology is called "deep learning," and it's a new way of learning about the world.

Deep learning is a new way of learning about the world.

Deep learning is a new way of learning about the world.

Deep learning is a new way of learning about the world.

Deep learning is a new way of learning about the world.

Deep learning is a new way of learning about the world


--- Healthcare ---
Improving healthcare through innovation and research.

The government is also working to improve the quality of life for people with disabilities.

The government is also working to improve the quality of life for people with disabilities.

The government is also working to improve the quality of life for people with disabilities.

The government is also working to improve the quality of life for people with disabilities.

The government is also working to improve the quality of life for people with disabilities.




--- Artif

In [9]:

def preprocess_text(text):
    #convert to lowercase
    text = text.lower()
    #remove Non alphanumeric characters
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    #tokenization
    tokens = word_tokenize(text)
    #remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    #lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    #tokens into a single string
    preprocessed_text = ' '.join(tokens)

    return preprocessed_text

corpus = []
for field, document in generated_documents.items():
    corpus.append(preprocess_text(document))
print(corpus)

['exploring latest advancement technology new technology called deep learning new way learning world deep learning new way learning world deep learning new way learning world deep learning new way learning world deep learning new way learning world deep learning new way learning world', 'improving healthcare innovation research government also working improve quality life people disability government also working improve quality life people disability government also working improve quality life people disability government also working improve quality life people disability government also working improve quality life people disability', 'unraveling mystery artificial intelligence book mustread anyone want understand world artificial intelligence book mustread anyone want understand world artificial intelligence book mustread anyone want understand world artificial intelligence book mustread anyone want understand world artificial intelligence book must']


In [10]:
def get_unique_words(preprocessed_text):
    words = preprocessed_text.split()
    unique_words = set(words)
    return unique_words

for i in range(len(corpus)):
  unique_words = get_unique_words(corpus[i])
  print(unique_words)


{'latest', 'deep', 'learning', 'new', 'technology', 'exploring', 'way', 'called', 'world', 'advancement'}
{'healthcare', 'life', 'government', 'disability', 'innovation', 'improve', 'improving', 'quality', 'working', 'research', 'also', 'people'}
{'book', 'unraveling', 'want', 'artificial', 'mustread', 'anyone', 'intelligence', 'understand', 'must', 'mystery', 'world'}


**TF-IDF usign sklearn**

TF

In [11]:


tfidf_vectorizer = TfidfVectorizer(use_idf=False)

tf_matrix = tfidf_vectorizer.fit_transform(corpus)

# Get the feature names (terms)
terms = tfidf_vectorizer.get_feature_names_out()

# Print the TF matrix
print("Term Frequency Matrix:")
print(tf_matrix.toarray())

# Print the corresponding terms
print("\nTerms:")
print(terms)


Term Frequency Matrix:
[[0.05688801 0.         0.         0.         0.         0.05688801
  0.34132807 0.         0.05688801 0.         0.         0.
  0.         0.         0.         0.05688801 0.68265615 0.
  0.         0.         0.         0.39821609 0.         0.
  0.         0.11377602 0.         0.         0.         0.34132807
  0.         0.34132807]
 [0.         0.35007002 0.         0.         0.         0.
  0.         0.35007002 0.         0.35007002 0.070014   0.35007002
  0.070014   0.070014   0.         0.         0.         0.35007002
  0.         0.         0.         0.         0.35007002 0.35007002
  0.070014   0.         0.         0.         0.         0.
  0.35007002 0.        ]
 [0.         0.         0.31822291 0.39777864 0.39777864 0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.39777864 0.         0.         0.
  0.07955573 0.31822291 0.07955573 0.         0.         0.
  0.         0.         0.31822291 0.07955573 0

IDF

In [12]:

tr_idf_model  = TfidfVectorizer()
tf_idf_vector = tr_idf_model.fit_transform(corpus)

tr_idf_model.idf_

array([1.69314718, 1.69314718, 1.69314718, 1.69314718, 1.69314718,
       1.69314718, 1.69314718, 1.69314718, 1.69314718, 1.69314718,
       1.69314718, 1.69314718, 1.69314718, 1.69314718, 1.69314718,
       1.69314718, 1.69314718, 1.69314718, 1.69314718, 1.69314718,
       1.69314718, 1.69314718, 1.69314718, 1.69314718, 1.69314718,
       1.69314718, 1.69314718, 1.69314718, 1.69314718, 1.69314718,
       1.69314718, 1.28768207])

TF-IDF

In [13]:
tf_idf_array = tf_idf_vector.toarray()

words_set = tr_idf_model.get_feature_names_out()

df_tf_idf = pd.DataFrame(tf_idf_array, columns = words_set)

df_tf_idf

Unnamed: 0,advancement,also,anyone,artificial,book,called,deep,disability,exploring,government,...,people,quality,research,technology,understand,unraveling,want,way,working,world
0,0.058339,0.0,0.0,0.0,0.0,0.058339,0.350033,0.0,0.058339,0.0,...,0.0,0.0,0.0,0.116678,0.0,0.0,0.0,0.350033,0.0,0.266209
1,0.0,0.35007,0.0,0.0,0.0,0.0,0.0,0.35007,0.0,0.35007,...,0.35007,0.35007,0.070014,0.0,0.0,0.0,0.0,0.0,0.35007,0.0
2,0.0,0.0,0.325242,0.406552,0.406552,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.325242,0.08131,0.325242,0.0,0.0,0.247355


normlization

In [14]:
tfidf_normalized = (df_tf_idf.T / df_tf_idf.sum(axis=1)).T
tfidf_normalized

Unnamed: 0,advancement,also,anyone,artificial,book,called,deep,disability,exploring,government,...,people,quality,research,technology,understand,unraveling,want,way,working,world
0,0.02406,0.0,0.0,0.0,0.0,0.02406,0.144359,0.0,0.02406,0.0,...,0.0,0.0,0.0,0.04812,0.0,0.0,0.0,0.144359,0.0,0.109788
1,0.0,0.113636,0.0,0.0,0.0,0.0,0.0,0.113636,0.0,0.113636,...,0.113636,0.113636,0.022727,0.0,0.0,0.0,0.0,0.0,0.113636,0.0
2,0.0,0.0,0.107985,0.134982,0.134982,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.107985,0.026996,0.107985,0.0,0.0,0.082126


**bouns**

TF = number of times the term appears in a document / total number of words in the document

In [15]:
n_docs = len(corpus)         #·Number of documents in the corpus
n_words_set = len(words_set) #·Number of unique words in the

df_tf = pd.DataFrame(np.zeros((n_docs, n_words_set)), columns=words_set)

# Compute Term Frequency (TF)
for i in range(n_docs):
    words = corpus[i].split(' ') # Words in the document
    for w in words:
        df_tf[w][i] = df_tf[w][i] + (1 / len(words))

df_tf

Unnamed: 0,advancement,also,anyone,artificial,book,called,deep,disability,exploring,government,...,people,quality,research,technology,understand,unraveling,want,way,working,world
0,0.023256,0.0,0.0,0.0,0.0,0.023256,0.139535,0.0,0.023256,0.0,...,0.0,0.0,0.0,0.046512,0.0,0.0,0.0,0.139535,0.0,0.139535
1,0.0,0.113636,0.0,0.0,0.0,0.0,0.0,0.113636,0.0,0.113636,...,0.113636,0.113636,0.022727,0.0,0.0,0.0,0.0,0.0,0.113636,0.0
2,0.0,0.0,0.105263,0.131579,0.131579,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.105263,0.026316,0.105263,0.0,0.0,0.105263


IDF = log(total number of doc / number of doc contain the term)

In [16]:
idf = {}

for w in words_set:
    k = 0    # number of documents in the corpus that contain this word

    for i in range(n_docs):
        if w in corpus[i].split():
            k += 1

    idf[w] =  np.log10(n_docs / k)

    print(f'{w}: {idf[w]}' )

advancement: 0.47712125471966244
also: 0.47712125471966244
anyone: 0.47712125471966244
artificial: 0.47712125471966244
book: 0.47712125471966244
called: 0.47712125471966244
deep: 0.47712125471966244
disability: 0.47712125471966244
exploring: 0.47712125471966244
government: 0.47712125471966244
healthcare: 0.47712125471966244
improve: 0.47712125471966244
improving: 0.47712125471966244
innovation: 0.47712125471966244
intelligence: 0.47712125471966244
latest: 0.47712125471966244
learning: 0.47712125471966244
life: 0.47712125471966244
must: 0.47712125471966244
mustread: 0.47712125471966244
mystery: 0.47712125471966244
new: 0.47712125471966244
people: 0.47712125471966244
quality: 0.47712125471966244
research: 0.47712125471966244
technology: 0.47712125471966244
understand: 0.47712125471966244
unraveling: 0.47712125471966244
want: 0.47712125471966244
way: 0.47712125471966244
working: 0.47712125471966244
world: 0.17609125905568124


TF-IDF

In [17]:
df_tf_idf = df_tf.copy()

for w in words_set:
    for i in range(n_docs):
        df_tf_idf[w][i] = df_tf[w][i] * idf[w]

df_tf_idf

Unnamed: 0,advancement,also,anyone,artificial,book,called,deep,disability,exploring,government,...,people,quality,research,technology,understand,unraveling,want,way,working,world
0,0.011096,0.0,0.0,0.0,0.0,0.011096,0.066575,0.0,0.011096,0.0,...,0.0,0.0,0.0,0.022192,0.0,0.0,0.0,0.066575,0.0,0.024571
1,0.0,0.054218,0.0,0.0,0.0,0.0,0.0,0.054218,0.0,0.054218,...,0.054218,0.054218,0.010844,0.0,0.0,0.0,0.0,0.0,0.054218,0.0
2,0.0,0.0,0.050223,0.062779,0.062779,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.050223,0.012556,0.050223,0.0,0.0,0.018536


normalization

In [18]:
tfidf_normalized = (df_tf_idf.T / df_tf_idf.sum(axis=1)).T
tfidf_normalized

Unnamed: 0,advancement,also,anyone,artificial,book,called,deep,disability,exploring,government,...,people,quality,research,technology,understand,unraveling,want,way,working,world
0,0.025501,0.0,0.0,0.0,0.0,0.025501,0.153005,0.0,0.025501,0.0,...,0.0,0.0,0.0,0.051002,0.0,0.0,0.0,0.153005,0.0,0.05647
1,0.0,0.113636,0.0,0.0,0.0,0.0,0.0,0.113636,0.0,0.113636,...,0.113636,0.113636,0.022727,0.0,0.0,0.0,0.0,0.0,0.113636,0.0
2,0.0,0.0,0.112751,0.140939,0.140939,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.112751,0.028188,0.112751,0.0,0.0,0.041613
