### **Text Generation with Markvo chains**

In [1]:
import numpy as np
import pandas as pd
import requests
import random
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

### **Getting data by hitting the req**

In [2]:
def get_url(url,name):
  !wget {url}
  response = requests.get(url)
  if response.status_code == 200:
    with open(name,'wb') as file:
      file.write(response.content)

In [3]:
get_url("http://www.textfiles.com/stories/timem.hac","Data1")
get_url("http://www.textfiles.com/stories/3gables.txt","Data2")
get_url("http://www.textfiles.com/stories/fgoose.txt", "Data3")
get_url("http://www.textfiles.com/stories/hitch3.txt", "Data4")
get_url("http://www.gutenberg.org/files/1342/1342-0.txt", "Data5")

--2025-05-22 11:35:32--  http://www.textfiles.com/stories/timem.hac
Resolving www.textfiles.com (www.textfiles.com)... 208.86.224.90
Connecting to www.textfiles.com (www.textfiles.com)|208.86.224.90|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 185134 (181K)
Saving to: ‘timem.hac’


2025-05-22 11:35:32 (610 KB/s) - ‘timem.hac’ saved [185134/185134]

--2025-05-22 11:35:33--  http://www.textfiles.com/stories/3gables.txt
Resolving www.textfiles.com (www.textfiles.com)... 208.86.224.90
Connecting to www.textfiles.com (www.textfiles.com)|208.86.224.90|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 33985 (33K) [text/plain]
Saving to: ‘3gables.txt’


2025-05-22 11:35:33 (336 KB/s) - ‘3gables.txt’ saved [33985/33985]

--2025-05-22 11:35:33--  http://www.textfiles.com/stories/fgoose.txt
Resolving www.textfiles.com (www.textfiles.com)... 208.86.224.90
Connecting to www.textfiles.com (www.textfiles.com)|208.86.224.90|:80... connected.
HTTP requ

In [4]:
def get_lines(path):
    txt = []
    with open(path, 'r') as file:
        for line in file:
            line = line.strip()
            if line != '':
                txt.append(line)
    return txt

# Combine all data files
corpus = []
for i in range(1, 6):  # for Data1 to Data4
    file_path = f"/content/Data{i}"
    lines = get_lines(file_path)
    corpus.extend(lines)  # appending the lines to the main corpus

print(f"Total lines in corpus: {len(corpus)}")

Total lines in corpus: 22355


In [5]:
import re

def clean_and_count_words(corpus):
    word_count = 0
    cleaned_text = []
    for line in corpus:
        line = line.strip()
        if line == '----------':
            break
        if line != '':
            # Tokenize using regex
            words = re.findall(r'\b\w+\b', line)
            word_count += len(words)
            cleaned_text.extend(words)

    return cleaned_text  # List of words

# Now pass the previously built corpus
cleaned_text = clean_and_count_words(corpus)
print("Number of words =", len(cleaned_text))

Number of words = 235391


### **Unigram**

In [14]:
from collections import defaultdict

def build_markov_chain(words):
    markov_chain = defaultdict(list)
    for i in range(len(words) - 1):
        curr_word = words[i]
        next_word = words[i + 1]
        markov_chain[curr_word].append(next_word)
    return markov_chain


def generate_unigram_text(chain, length=20):
    word = random.choice(list(chain.keys()))  # random start
    output = [word]

    for _ in range(length - 1):
        next_words = chain.get(word)
        if not next_words:
            break  # no known next word
        word = random.choice(next_words)
        output.append(word)

    return ' '.join(output)

### **Bigram**

In [15]:
def build_bigram_chain(words):
    markov_chain = defaultdict(list)
    for i in range(len(words) - 2):
        key = (words[i], words[i + 1])        # Bigram as key
        next_word = words[i + 2]              # Word that follows
        markov_chain[key].append(next_word)
    return markov_chain

def generate_bigram_text(chain, length=20):
    current_pair = random.choice(list(chain.keys()))  # random bigram (tuple)
    output = [current_pair[0], current_pair[1]]       # start with both words

    for _ in range(length - 2):  # already have 2 words
        next_words = chain.get(current_pair)
        if not next_words:
            break
        next_word = random.choice(next_words)
        output.append(next_word)
        # move the window: (second word of current pair, next word)
        current_pair = (current_pair[1], next_word)

    return ' '.join(output)

### **Trigram**

In [18]:
def build_trigram_chain(words):
    markov_chain = defaultdict(list)
    for i in range(len(words) - 3):
        key = (words[i], words[i + 1], words[i + 2])  # 3-word tuple as key
        next_word = words[i + 3]                       # word that follows
        markov_chain[key].append(next_word)
    return markov_chain

def generate_trigram_text(chain, length=20):
    current_triplet = random.choice(list(chain.keys()))  # random trigram key
    output = [current_triplet[0], current_triplet[1], current_triplet[2]]  # start words

    for _ in range(length - 3):
        next_words = chain.get(current_triplet)
        if not next_words:
            break
        next_word = random.choice(next_words)
        output.append(next_word)
        # Slide the window: drop first word, add next_word
        current_triplet = (current_triplet[1], current_triplet[2], next_word)

    return ' '.join(output)

In [19]:
n = input("Enter 1 for unigram or 2 for Bigram or 3 for Trigram:")

if n == '1':
  chain = build_markov_chain(cleaned_text)
  generated = generate_unigram_text(chain, length=100)
  with open("output.txt", "w", encoding="utf-8") as f:
    f.write(generated)
elif n == '2':
  chain = build_bigram_chain(cleaned_text)
  generated = generate_bigram_text(chain, length=100)
  with open("output.txt", "w", encoding="utf-8") as f:
    f.write(generated)
elif n == '3':
  chain = build_trigram_chain(cleaned_text)
  generated = generate_trigram_text(chain, length=100)
  with open("output.txt", "w", encoding="utf-8") as f:
    f.write(generated)



Enter 1 for unigram or 2 for Bigram or 3 for Trigram:3
