<a href="https://colab.research.google.com/github/Dharshini-22112004/GEN-AI-INTERN-TASK-2/blob/main/INTERN_TASKS_2ipynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**TEXT GENERATION WITH MARKOV CHAINS**


In [1]:
import random
import re
from collections import defaultdict, Counter

In [2]:
# Sample text corpus: a short four-sentence paragraph used as the training
# text for the Markov chain. The leading/trailing newlines inside the literal
# are harmless because preprocess_text() collapses and strips whitespace.
sample_text = """
In the realm of computer science, algorithms are the essence of problem-solving.
They provide a step-by-step procedure to perform calculations, data processing,
and automated reasoning tasks. Understanding algorithms is fundamental to the
study of computer science and programming.
"""


In [3]:
def preprocess_text(text):
    """
    Normalizes raw text into lowercase words separated by single spaces.

    Every character that is neither an ASCII letter nor whitespace is deleted
    outright (it is not replaced by a space), so punctuation and digits vanish
    and hyphenated words fuse together: "step-by-step" becomes "stepbystep".

    Parameters:
        text (str): Raw input text.

    Returns:
        str: Lowercased text with runs of whitespace collapsed to one space
             and no leading or trailing whitespace.
    """
    # Lowercase first, then drop everything except letters and whitespace
    letters_and_spaces = re.sub(r'[^a-zA-Z\s]', '', text.lower())
    # Collapse whitespace runs (including newlines) and trim both ends
    return re.sub(r'\s+', ' ', letters_and_spaces).strip()

In [4]:
# Normalize the sample corpus and print it to check the cleaning result
# (lowercase, punctuation removed, single spaces).
processed_text = preprocess_text(sample_text)
print(processed_text)

in the realm of computer science algorithms are the essence of problemsolving they provide a stepbystep procedure to perform calculations data processing and automated reasoning tasks understanding algorithms is fundamental to the study of computer science and programming


In [5]:
def build_markov_chain(text, n=2):
    """
    Builds an n-gram Markov chain from the given text.

    The text is split on whitespace. Every run of ``n`` consecutive words that
    is followed by another word becomes a state, and the word that follows is
    recorded as a transition. Counts are then normalized so that the
    transition values of each state sum to 1.

    Parameters:
        text (str): Preprocessed text string.
        n (int): The size of n-gram (number of words per state).
            Defaults to 2 (bigrams). Must be at least 1.

    Returns:
        defaultdict: Maps each state (a tuple of n words) to a Counter whose
            keys are the possible next words and whose values are their
            probabilities (floats). Empty when the text has n words or fewer.

    Raises:
        ValueError: If n is smaller than 1. Such values would otherwise build
            states from empty or wrongly sliced word windows without any error.
    """
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}.")

    words = text.split()
    markov_chain = defaultdict(Counter)

    # Slide a window of n words over the text; the word right after the
    # window is the observed transition. The final n words have no successor,
    # so they only become a state if the same n-gram occurred earlier.
    for i in range(len(words) - n):
        current_state = tuple(words[i:i + n])
        next_word = words[i + n]
        markov_chain[current_state][next_word] += 1

    # Convert counts to probabilities (values are overwritten in place; the
    # key set is unchanged, so iterating while assigning is safe)
    for transitions in markov_chain.values():
        total = sum(transitions.values())
        for word in transitions:
            transitions[word] /= total

    return markov_chain


In [6]:
# Build a bigram (n=2) Markov chain from the preprocessed sample text.
# Each state is a tuple of n words, so the same n must be passed to
# generate_text() when sampling from this chain.
n = 2
markov_chain = build_markov_chain(processed_text, n)

In [7]:
# Example of inspecting the chain: print every state together with its
# next-word probability table (converted to a plain dict for a compact repr).
for state, transitions in markov_chain.items():
    print(f"{state} -> {dict(transitions)}")

('in', 'the') -> {'realm': 1.0}
('the', 'realm') -> {'of': 1.0}
('realm', 'of') -> {'computer': 1.0}
('of', 'computer') -> {'science': 1.0}
('computer', 'science') -> {'algorithms': 0.5, 'and': 0.5}
('science', 'algorithms') -> {'are': 1.0}
('algorithms', 'are') -> {'the': 1.0}
('are', 'the') -> {'essence': 1.0}
('the', 'essence') -> {'of': 1.0}
('essence', 'of') -> {'problemsolving': 1.0}
('of', 'problemsolving') -> {'they': 1.0}
('problemsolving', 'they') -> {'provide': 1.0}
('they', 'provide') -> {'a': 1.0}
('provide', 'a') -> {'stepbystep': 1.0}
('a', 'stepbystep') -> {'procedure': 1.0}
('stepbystep', 'procedure') -> {'to': 1.0}
('procedure', 'to') -> {'perform': 1.0}
('to', 'perform') -> {'calculations': 1.0}
('perform', 'calculations') -> {'data': 1.0}
('calculations', 'data') -> {'processing': 1.0}
('data', 'processing') -> {'and': 1.0}
('processing', 'and') -> {'automated': 1.0}
('and', 'automated') -> {'reasoning': 1.0}
('automated', 'reasoning') -> {'tasks': 1.0}
('reasoning'

In [8]:
def generate_text(markov_chain, n=2, max_length=50, seed=None):
    """
    Generates text using the provided Markov chain.

    Starting from a seed state, the last n generated words are looked up in
    the chain and the next word is drawn at random, weighted by the stored
    transition probabilities. Generation stops when max_length words have been
    produced or when the current state has no recorded transitions.

    Parameters:
        markov_chain (dict): The Markov chain model, mapping n-word tuples to
            {next_word: probability} mappings.
        n (int): The size of n-gram used in the model. Must equal the length
            of the chain's states.
        max_length (int): Maximum number of words in the returned text,
            including the seed words.
        seed (tuple): Optional starting state. A list is accepted and
            converted to a tuple. When omitted, a state is chosen at random.

    Returns:
        str: Generated words joined by single spaces. Never longer than
            max_length words; empty when max_length is 0 or negative.

    Raises:
        ValueError: If the chain is empty and no seed is given, if the seed
            is not a state of the chain, or if the seed length differs from n.
    """
    if seed is None:
        if not markov_chain:
            # random.choice would fail with an opaque IndexError here
            raise ValueError("Cannot generate text from an empty Markov chain.")
        # Randomly choose a starting state
        seed = random.choice(list(markov_chain.keys()))
    else:
        if isinstance(seed, list):
            # Lists are unhashable; chain states are tuples
            seed = tuple(seed)
        if seed not in markov_chain:
            raise ValueError("The provided seed is not in the Markov chain.")

    if len(seed) != n:
        # With a mismatched n the lookup key below could never match a state
        # and generation would silently stop after the seed.
        raise ValueError(
            f"n={n!r} does not match the chain's state size ({len(seed)})."
        )

    # The seed counts toward max_length, so trim it when max_length < n
    output_words = list(seed)[:max(max_length, 0)]

    while len(output_words) < max_length:
        current_state = tuple(output_words[-n:])
        possible_next_words = markov_chain.get(current_state)

        if not possible_next_words:
            # If no possible transitions, end the generation
            break

        next_words = list(possible_next_words.keys())
        probabilities = list(possible_next_words.values())
        next_word = random.choices(next_words, weights=probabilities)[0]
        output_words.append(next_word)

    return ' '.join(output_words)


In [9]:
# Generate text without specifying a seed: generate_text() then picks the
# starting state at random from the chain, so the output can differ per run.
generated_text = generate_text(markov_chain, n=2, max_length=30)
print("Generated Text:\n", generated_text)

Generated Text:
 data processing and automated reasoning tasks understanding algorithms is fundamental to the study of computer science algorithms are the essence of problemsolving they provide a stepbystep procedure to perform calculations


In [10]:
# Specify a seed for generation. The seed must be a state (tuple of n words)
# that exists in the chain; otherwise generate_text() raises ValueError.
seed = ('understanding', 'algorithms')
generated_text_with_seed = generate_text(markov_chain, n=2, max_length=20, seed=seed)
print("Generated Text with Seed:\n", generated_text_with_seed)

Generated Text with Seed:
 understanding algorithms is fundamental to the study of computer science algorithms are the essence of problemsolving they provide a stepbystep


In [11]:
import random
import re
from collections import defaultdict, Counter

def preprocess_text(text):
    """
    Returns `text` lowercased, reduced to ASCII letters and single spaces.

    Characters other than ASCII letters and whitespace are removed without
    inserting a space, so "problem-solving" becomes "problemsolving".
    Whitespace runs are collapsed and the result is trimmed at both ends.
    """
    lowered = text.lower()
    # Remove special characters and numbers
    letters_only = re.sub(r'[^a-zA-Z\s]', '', lowered)
    # Remove extra spaces
    single_spaced = re.sub(r'\s+', ' ', letters_only)
    return single_spaced.strip()

def build_markov_chain(text, n=2):
    """
    Builds an n-gram Markov chain from whitespace-separated text.

    Parameters:
        text (str): Preprocessed text string.
        n (int): Number of words per state. Defaults to 2. Must be >= 1.

    Returns:
        defaultdict: Maps each state (tuple of n words) to a Counter of
            {next_word: probability}; the probabilities of a state sum to 1.
            Empty when the text contains n words or fewer.

    Raises:
        ValueError: If n is smaller than 1, which would otherwise produce
            empty or wrongly sliced states without any error.
    """
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}.")

    words = text.split()
    markov_chain = defaultdict(Counter)

    # Build the chain: each window of n words is a state, the word that
    # follows the window is the observed transition
    for i in range(len(words) - n):
        current_state = tuple(words[i:i + n])
        next_word = words[i + n]
        markov_chain[current_state][next_word] += 1

    # Convert counts to probabilities (existing keys are reassigned only,
    # so the Counter can be iterated while it is updated)
    for transitions in markov_chain.values():
        total = sum(transitions.values())
        for word in transitions:
            transitions[word] /= total

    return markov_chain

def generate_text(markov_chain, n=2, max_length=50, seed=None):
    """
    Generates text by walking the given Markov chain.

    The last n words produced so far form the lookup state; the next word is
    sampled with the state's transition probabilities as weights. The walk
    ends at max_length words or at a state without transitions.

    Parameters:
        markov_chain (dict): Maps n-word tuples to {next_word: probability}.
        n (int): State size of the chain; must equal the length of its keys.
        max_length (int): Maximum number of words returned, seed included.
        seed (tuple): Optional starting state (a list is converted to a
            tuple). A random state is used when omitted.

    Returns:
        str: Space-joined generated words, at most max_length of them.

    Raises:
        ValueError: If the chain is empty and no seed is given, if the seed
            is not in the chain, or if the seed length differs from n.
    """
    if seed is None:
        if not markov_chain:
            # Avoid the opaque IndexError random.choice raises on empty input
            raise ValueError("Cannot generate text from an empty Markov chain.")
        seed = random.choice(list(markov_chain.keys()))
    else:
        if isinstance(seed, list):
            # Chain states are tuples; a list would be unhashable
            seed = tuple(seed)
        if seed not in markov_chain:
            raise ValueError("The provided seed is not in the Markov chain.")

    if len(seed) != n:
        # A mismatched n would make every lookup miss and silently return
        # only the seed words.
        raise ValueError(
            f"n={n!r} does not match the chain's state size ({len(seed)})."
        )

    # Seed words count toward the limit, so trim when max_length < n
    output_words = list(seed)[:max(max_length, 0)]

    while len(output_words) < max_length:
        current_state = tuple(output_words[-n:])
        possible_next_words = markov_chain.get(current_state)

        if not possible_next_words:
            # Dead end: this state was never followed by a word
            break

        next_words = list(possible_next_words.keys())
        probabilities = list(possible_next_words.values())
        next_word = random.choices(next_words, weights=probabilities)[0]
        output_words.append(next_word)

    return ' '.join(output_words)

# Sample text corpus (same paragraph as used earlier in the notebook)
sample_text = """
In the realm of computer science, algorithms are the essence of problem-solving.
They provide a step-by-step procedure to perform calculations, data processing,
and automated reasoning tasks. Understanding algorithms is fundamental to the
study of computer science and programming.
"""

# Preprocess the text: lowercase, letters only, single spaces
processed_text = preprocess_text(sample_text)

# Build the Markov chain; n is the number of words per state and is reused
# below so the generator looks up states of the same size
n = 2  # You can change n for different n-grams
markov_chain = build_markov_chain(processed_text, n)

# Generate text from a randomly chosen starting state (no seed given);
# generation may end before max_length if a state has no recorded successor
generated_text = generate_text(markov_chain, n, max_length=50)
print("Generated Text:\n", generated_text)


Generated Text:
 fundamental to the study of computer science and programming


In [12]:
# Location of the plain-text corpus. The default is a Colab upload path;
# change it when running the notebook elsewhere.
corpus_path = '/content/jane-austen-pride-prejudice.txt'

# N-gram order for this run (3 = trigrams). It is defined once because the
# chain must be built and sampled with the same order: generate_text() looks
# up the last `n` generated words as a state, so a different value there
# would never match a state of the chain.
ngram_order = 3

# Read text from a file
with open(corpus_path, 'r', encoding='utf-8') as file:
    text_data = file.read()

# Preprocess and build the chain
processed_text = preprocess_text(text_data)
markov_chain = build_markov_chain(processed_text, n=ngram_order)

# Generate text
generated_text = generate_text(markov_chain, n=ngram_order, max_length=100)
print("Generated Text:\n", generated_text)


Generated Text:
 though i have still another to add i am no stranger to the particulars of your youngest sisters infamous elopement i know it to be bingley from believing him the kind of girl to do such a thing your mother must have been neglected compared with some families i believe we were but such another man for you if you believed it impossible to be true certainly my dear nobody said there were but such of us as wished to learn never wanted the means we were always encouraged to read and improve himself by such an attention would be
