# Analyze word frequency and distributions.
#   • Use Markov models or GPT-based libraries to generate text.

In [53]:
import nltk
import random
from sklearn.feature_extraction.text import TfidfVectorizer

# Fetch the NLTK data packages 'punkt' and 'punkt_tab' (presumably the tokenizer
# data that nltk.word_tokenize relies on in later cells -- confirm against the NLTK docs).
nltk.download('punkt')
# As the last expression of the cell, this call's return value is what the notebook displays.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [48]:
# Toy corpus of three short documents, shared by the frequency and TF-IDF cells.
# The second document intentionally keeps its leading space.
string = [
    "This is a simple example, which is used in this lab",
    " This example shows term frequency",
    "This example also shows text generation",
]

# Individual documents stay available under their own names.
d0, d1, d2 = string

In [49]:
# For each document: lowercase it, tokenize it with NLTK, and print the per-token counts.
# Punctuation is tokenized as well, so ',' shows up as its own entry in the output.
for i in string:
  tokens = nltk.word_tokenize(i.lower())
  fdist = nltk.FreqDist(tokens)  # token -> number of occurrences in this document
  print(dict(fdist))

{'this': 2, 'is': 2, 'a': 1, 'simple': 1, 'example': 1, ',': 1, 'which': 1, 'used': 1, 'in': 1, 'lab': 1}
{'this': 1, 'example': 1, 'shows': 1, 'term': 1, 'frequency': 1}
{'this': 1, 'example': 1, 'also': 1, 'shows': 1, 'text': 1, 'generation': 1}


In [50]:
# Build a TF-IDF vectorizer with default settings and fit/transform the corpus in one step.
# The next cell reads `tfidf.vocabulary_` (term -> column index) and indexes `result`
# as result[document_index, term_index].
tfidf = TfidfVectorizer()
result = tfidf.fit_transform(string)

In [51]:
# Print one line per vocabulary term: the term followed by its TF-IDF weight in every
# document, in document order. The number of documents is taken from the matrix itself
# rather than hard-coded, so the cell keeps working if the corpus grows or shrinks.
n_docs = result.shape[0]
for word, index in tfidf.vocabulary_.items():
    weights = ", ".join(f"{result[doc, index]:.4f}" for doc in range(n_docs))
    print(f"{word}: {weights}")


this: 0.3604, 0.3263, 0.2856
is: 0.6102, 0.0000, 0.0000
simple: 0.3051, 0.0000, 0.0000
example: 0.1802, 0.3263, 0.2856
which: 0.3051, 0.0000, 0.0000
used: 0.3051, 0.0000, 0.0000
in: 0.3051, 0.0000, 0.0000
lab: 0.3051, 0.0000, 0.0000
shows: 0.0000, 0.4202, 0.3678
term: 0.0000, 0.5525, 0.0000
frequency: 0.0000, 0.5525, 0.0000
also: 0.0000, 0.0000, 0.4836
text: 0.0000, 0.0000, 0.4836
generation: 0.0000, 0.0000, 0.4836


#   Use Markov models or GPT-based libraries to generate text.

In [63]:

from nltk.util import ngrams


def generate_markov_text(text, size=20, seed=None):
    """Generate text by walking a first-order (bigram) Markov chain built from ``text``.

    Args:
        text: Source string; it is tokenized with ``nltk.word_tokenize``.
        size: Maximum number of tokens in the output. Values below 1 yield an
            empty string.
        seed: Optional seed. When given, a private ``random.Random(seed)`` drives
            every random choice so the output is reproducible; when ``None`` the
            module-level ``random`` state is used.

    Returns:
        The generated tokens joined by single spaces, or an explanatory message
        when ``text`` contains fewer than two tokens. The result can be shorter
        than ``size`` if the walk reaches a token with no recorded successor.
    """
    # Go through the already-imported ``nltk`` namespace: a bare ``word_tokenize``
    # is not imported by any visible cell and would raise NameError on a fresh kernel.
    words = nltk.word_tokenize(text)
    if len(words) < 2:
        return "Input text is too short to generate Markov text."
    if size < 1:
        return ""

    rng = random if seed is None else random.Random(seed)

    # Map each token to the list of tokens that follow it. Repeats are kept on
    # purpose so that a uniform choice from the list is weighted by frequency.
    markov_chain = {}
    for w1, w2 in ngrams(words, 2):
        markov_chain.setdefault(w1, []).append(w2)

    # Start from any token except the final one: every such token is the first
    # element of some bigram, so the walk never dead-ends on its very first step.
    word = rng.choice(words[:-1])
    result = [word]

    for _ in range(size - 1):
        next_words = markov_chain.get(word)
        if not next_words:
            break  # Stop if no next word is found
        word = rng.choice(next_words)
        result.append(word)

    return ' '.join(result)

# Demo: build a chain from a two-sentence sample and print one generated sequence.
# The generator draws random choices, so the printed text can differ between runs.
text = (
    "This is a simple example of a Markov chain. "
    "A Markov chain is a stochastic model describing a sequence of possible events."
)
print("\nGenerated Text:")
print(generate_markov_text(text))



Generated Text:
of a stochastic model describing a stochastic model describing a stochastic model describing a stochastic model describing a sequence of
