## Assignment 1

In [9]:
import random
from collections import defaultdict
from scipy.sparse import dok_matrix

def build_markov_chain(text, k=2):
   
    words = text.split()
    index = {word: i for i, word in enumerate(set(words))}
    matrix = dok_matrix((len(index), len(index)), dtype=float)

    for i in range(len(words) - k + 1):
        word_sequence = words[i:i + k]
        if len(word_sequence) == k:
            matrix[index[word_sequence[0]], index[word_sequence[1]]] += 1

    # Normalizing the matrix
    for i, word in enumerate(index):
        sum_of_row = sum(matrix[i, j] for j in range(len(index)))
        if sum_of_row > 0:
            matrix[i, :] = matrix[i, :] / sum_of_row

    return matrix, index

def generate_text(chain, index, length=50):

    word_list = list(index.keys())
    current_word = random.choice(word_list)
    text = [current_word]

    for _ in range(length - 1):
        current_index = index[current_word]
        probabilities = chain[current_index, :].toarray().flatten()
        
        # Check if the sum of probabilities is zero
        if probabilities.sum() == 0:
            current_word = random.choice(word_list)
        else:
            next_word = random.choices(word_list, weights=probabilities)[0]
            text.append(next_word)
            current_word = next_word

    return ' '.join(text)

#### Short test

In [15]:
small_text = "It was a bright cold day in April,and the clocks were striking thirteen."
markov_chain, word_index = build_markov_chain(small_text)
generated_text = generate_text(markov_chain, word_index)
generated_text

'a bright cold day in April,and the clocks were striking thirteen. striking thirteen. striking thirteen. cold day in April,and the clocks were striking thirteen. was a bright cold day in April,and the clocks were striking thirteen. in April,and the clocks were striking thirteen. the'

#### Long test

In [None]:
def read_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()

file_path = '1984.txt'
text = read_file(file_path)

markov_chain, word_index = build_markov_chain(text)
generated_text = generate_text(markov_chain, word_index, length=100)
generated_text