<a href="https://colab.research.google.com/github/2203A51754/IRS/blob/main/IRS_1754_Lab06.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
###MONDAY
import nltk
from nltk.util import ngrams
from collections import Counter, defaultdict
import random

# Fetch the tokenizer data used by nltk.word_tokenize below.
nltk.download('punkt')
# 'punkt_tab' is downloaded as well; presumably newer NLTK releases read the
# tokenizer tables from this package instead of 'punkt' (TODO confirm).
nltk.download('punkt_tab')

def preprocess_text(text):
    """Return the NLTK word tokens of ``text`` after lowercasing it.

    Punctuation is kept; it is emitted as whatever tokens
    ``nltk.word_tokenize`` produces for it.
    """
    lowered = text.lower()
    return nltk.word_tokenize(lowered)
def build_trigram_model(tokens):
    """Count, for every pair of adjacent tokens, which token follows it.

    Returns a ``defaultdict`` mapping ``(w1, w2)`` to a ``Counter`` of the
    third words observed after that pair.
    """
    successors = defaultdict(Counter)
    for trigram in ngrams(tokens, 3):
        pair, following = trigram[:2], trigram[2]
        successors[pair][following] += 1
    return successors
def calculate_probability(model, sequence):
    """Return the trigram-chain probability of ``sequence`` under ``model``.

    ``sequence`` is lowercased and split on whitespace; the result is the
    product of ``count(w1, w2, w3) / count(w1, w2, *)`` over every consecutive
    word triple. A sequence with fewer than three words has no triples and
    yields the empty product ``1.0``.

    Args:
        model: mapping ``(w1, w2) -> {w3: count}`` (e.g. the result of
            ``build_trigram_model``). It is only read, never modified.
        sequence: the text to score.

    Returns:
        The probability as a float; ``0.0`` as soon as any triple is unseen.
    """
    words = sequence.lower().split()
    prob = 1.0
    for w1, w2, w3 in zip(words, words[1:], words[2:]):
        # .get() instead of indexing: indexing a defaultdict would insert an
        # empty entry for every unseen context and silently grow the model.
        followers = model.get((w1, w2))
        total_count = sum(followers.values()) if followers else 0
        count_w3 = followers.get(w3, 0) if followers else 0
        if total_count == 0 or count_w3 == 0:
            # One impossible transition makes the whole product zero.
            return 0.0
        prob *= count_w3 / total_count
    return prob
def generate_text(model, seed_words, length):
    """Extend ``seed_words`` by sampling up to ``length`` words from ``model``.

    The seed is lowercased and split on whitespace. At each step the last two
    words are looked up in ``model`` and the next word is drawn at random,
    weighted by its count. Generation stops early when the current word pair
    has no recorded continuation.

    Args:
        model: mapping ``(w1, w2) -> {w3: count}``.
        seed_words: starting text; needs at least two words to form a context.
        length: maximum number of words to append.

    Returns:
        The seed plus the generated words, joined by single spaces. A seed
        with fewer than two words is returned as-is because no context can be
        formed from it.
    """
    text = seed_words.lower().split()
    if len(text) < 2:
        # Previously this raised IndexError on text[-2]; treat it like any
        # other "cannot predict further" situation instead.
        return ' '.join(text)
    for _ in range(length):
        next_words = model.get((text[-2], text[-1]))
        if not next_words:
            break  # Cannot predict further
        candidates = list(next_words.keys())
        weights = [next_words[word] for word in candidates]
        text.append(random.choices(candidates, weights=weights)[0])
    return ' '.join(text)
# Sample text: a tiny two-sentence corpus used throughout the notebook.
text = "The quick brown fox jumps over the lazy dog. The quick brown fox is quick."

# Tokenize the corpus and count which word follows each pair of words.
tokens = preprocess_text(text)
model = build_trigram_model(tokens)

# Score the four-word phrase "<sequence> fox" with the trigram model.
sequence = "the quick brown"
probability = calculate_probability(model, sequence + " fox")
print(f"Probability of '{sequence} fox': {probability}")

# Sample up to 5 more words after the seed. generate_text draws with
# random.choices and no seed is set here, so the output can differ per run.
generated_text = generate_text(model, "the quick", 5)
print(f"Generated Text: {generated_text}")

Probability of 'the quick brown fox': 1.0
Generated Text: the quick brown fox jumps over the


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [None]:
###TUESDAY
def build_ngram_model(tokens, N):
    """Count continuations for every (N-1)-token context in ``tokens``.

    Returns a ``defaultdict`` mapping each context tuple (the first N-1 items
    of an n-gram) to a ``Counter`` of the words seen right after it. For
    ``N == 1`` the only context is the empty tuple.
    """
    continuation_counts = defaultdict(Counter)
    for gram in ngrams(tokens, N):
        history, target = gram[:-1], gram[-1]
        continuation_counts[history][target] += 1
    return continuation_counts
def calculate_perplexity(model, tokens, N):
    """Return the perplexity of ``tokens`` under an N-gram count ``model``.

    Perplexity is ``exp(-mean(log p))`` where ``p`` is the relative frequency
    ``count(context, word) / count(context, *)`` of each n-gram in ``tokens``.

    Args:
        model: mapping ``context tuple -> {word: count}`` (e.g. the result of
            ``build_ngram_model``). It is only read, never modified.
        tokens: the token sequence to score.
        N: n-gram order; values below 1 are treated as 1.

    Returns:
        The perplexity as a float. ``float('inf')`` when any n-gram has zero
        probability under the model, and ``float('nan')`` when ``tokens`` is
        too short to contain a single n-gram (perplexity is undefined there).
    """
    import math  # local import: the notebook does not import math at top level

    N = max(N, 1)
    n_grams = list(ngrams(tokens, N))
    if not n_grams:
        # Nothing to score; avoids the former division by zero.
        return float('nan')

    log_prob_sum = 0.0
    for grams in n_grams:
        context, word = grams[:-1], grams[-1]
        # .get() keeps a defaultdict model from growing on unseen contexts.
        followers = model.get(context)
        total_count = sum(followers.values()) if followers else 0
        word_count = followers.get(word, 0) if followers else 0
        if total_count == 0 or word_count == 0:
            # An impossible event has infinite surprise; skipping it (as if
            # its probability were 1) would make the model look better.
            return float('inf')
        log_prob_sum += math.log(word_count / total_count)

    # Summing logs avoids overflow of the raw product on long texts.
    return math.exp(-log_prob_sum / len(n_grams))
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

def compare_ngram_models(tokens, max_n=5):
    """Plot the perplexity of N-gram models of order 1..``max_n``.

    For each order a model is built with ``build_ngram_model`` and scored with
    ``calculate_perplexity`` on the very same ``tokens`` it was built from,
    then the values are drawn as a seaborn bar plot.

    Args:
        tokens: token sequence used both to build and to score each model.
        max_n: highest n-gram order to include (default 5, the range that was
            previously hard-coded).

    Raises:
        ValueError: if ``max_n`` is smaller than 1.
    """
    if max_n < 1:
        raise ValueError("max_n must be at least 1")

    Ns = range(1, max_n + 1)
    perplexities = [
        calculate_perplexity(build_ngram_model(tokens, N), tokens, N)
        for N in Ns
    ]

    df = pd.DataFrame({'N-gram': list(Ns), 'Perplexity': perplexities})
    sns.barplot(x='N-gram', y='Perplexity', data=df)
    plt.title('N-gram Model Perplexity Comparison')
    plt.show()


In [None]:
###WEDNESDAY
import string

def preprocess_text(text):
    """Lowercase ``text``, delete ASCII punctuation, and tokenize with NLTK.

    Note: this redefinition replaces the earlier ``preprocess_text`` in the
    notebook, which kept punctuation.
    """
    # Translation table that maps every string.punctuation character to nothing.
    drop_punctuation = str.maketrans('', '', string.punctuation)
    cleaned = text.lower().translate(drop_punctuation)
    return nltk.word_tokenize(cleaned)
def generate_ngrams(tokens, N):
    """Return every N-gram of ``tokens`` as a list of tuples."""
    return [gram for gram in ngrams(tokens, N)]

def get_ngram_frequencies(ngrams_list):
    """Return a ``Counter`` with the number of occurrences of each n-gram."""
    frequencies = Counter()
    frequencies.update(ngrams_list)
    return frequencies
# Re-tokenize the sample `text` with the preprocess_text defined in this cell;
# this overwrites the module-level `tokens` produced by the earlier cell.
tokens = preprocess_text(text)

# Print the frequency table for unigrams, bigrams and trigrams (N = 1, 2, 3).
for N in range(1, 4):
    ngrams_list = generate_ngrams(tokens, N)
    freq_dict = get_ngram_frequencies(ngrams_list)
    print(f"\n{N}-grams Frequencies:")
    for ngram, freq in freq_dict.items():
        print(f"{ngram}: {freq}")



1-grams Frequencies:
('the',): 3
('quick',): 3
('brown',): 2
('fox',): 2
('jumps',): 1
('over',): 1
('lazy',): 1
('dog',): 1
('is',): 1

2-grams Frequencies:
('the', 'quick'): 2
('quick', 'brown'): 2
('brown', 'fox'): 2
('fox', 'jumps'): 1
('jumps', 'over'): 1
('over', 'the'): 1
('the', 'lazy'): 1
('lazy', 'dog'): 1
('dog', 'the'): 1
('fox', 'is'): 1
('is', 'quick'): 1

3-grams Frequencies:
('the', 'quick', 'brown'): 2
('quick', 'brown', 'fox'): 2
('brown', 'fox', 'jumps'): 1
('fox', 'jumps', 'over'): 1
('jumps', 'over', 'the'): 1
('over', 'the', 'lazy'): 1
('the', 'lazy', 'dog'): 1
('lazy', 'dog', 'the'): 1
('dog', 'the', 'quick'): 1
('brown', 'fox', 'is'): 1
('fox', 'is', 'quick'): 1


In [None]:
###THURSDAY
import re

def preprocess_social_media_text(text):
    """Clean a social-media post and return its NLTK word tokens.

    Steps, in order: lowercase, delete URLs starting with ``http``, delete
    ``@mentions``, delete ``#hashtags`` (the whole tag, not just the ``#``),
    delete ASCII punctuation, tokenize. Emoji replacement and abbreviation
    expansion are not implemented.
    """
    cleaned = text.lower()
    # Applied one after another in this order: URLs, mentions, hashtags.
    for noise_pattern in (r'http\S+', r'@\w+', r'#\w+'):
        cleaned = re.sub(noise_pattern, '', cleaned)
    punctuation_table = str.maketrans('', '', string.punctuation)
    return nltk.word_tokenize(cleaned.translate(punctuation_table))
def build_ngram_model_with_smoothing(tokens, N):
    """Build an N-gram model with add-one (Laplace) smoothed probabilities.

    For every context observed in ``tokens`` the model stores
    ``(count(context, word) + 1) / (count(context, *) + V)`` for *every* word
    of the vocabulary, where ``V`` is the vocabulary size. Each observed
    context therefore holds a proper distribution that sums to 1 and gives
    unseen continuations a non-zero probability.

    Args:
        tokens: the training token sequence.
        N: n-gram order.

    Returns:
        A ``defaultdict(Counter)`` mapping context tuples to
        ``{word: probability}``. Contexts never observed in ``tokens`` still
        fall back to an empty ``Counter`` (probability 0 for every word).
        Storage grows with (number of contexts) x (vocabulary size).
    """
    tokens = list(tokens)  # iterated twice below; a one-shot iterator would be exhausted
    vocabulary = set(tokens)
    V = len(vocabulary)

    counts = defaultdict(Counter)
    for grams in ngrams(tokens, N):
        counts[grams[:-1]][grams[-1]] += 1

    model = defaultdict(Counter)
    for context, followers in counts.items():
        denominator = sum(followers.values()) + V
        # Cover the whole vocabulary, not just the seen continuations;
        # otherwise unseen words keep probability 0 and smoothing has no effect.
        model[context] = Counter({
            word: (followers[word] + 1) / denominator for word in vocabulary
        })
    return model
import re
import tokenize
from io import BytesIO
from collections import defaultdict, Counter
from nltk.util import ngrams
import keyword

# Sample code snippet passed to preprocess_code below. It contains a
# docstring and a trailing comment so their handling can be observed.
code_sample = '''
def factorial(n):
    """Compute the factorial of n"""
    if n == 0:
        return 1
    else:
        return n * factorial(n - 1)  # Recursive call
'''

def preprocess_code(code):
    # Remove comments and docstrings
    code_no_comments = re.sub(r'(\"\"\".*?\"\"\"|\'\'\'.*?\'\'\'|#.*$)', '', code, flags=re.MULTILINE | re.DOTALL)

    tokens = []
    try:
        g = tokenize.tokenize(BytesIO(code_no_comments.encode('utf-8')).readline)
        for toknum, tokval, _, _, _ in g:
            if toknum == tokenize.ENDMARKER:
                continue
            elif toknum == tokenize.NUMBER:
                tokens.append('<NUM>')
            elif toknum == tokenize.STRING:
                tokens.append('<STRING>')
            elif toknum == tokenize.NAME:
                if tokval in keyword.kwlist:
                    tokens.append(tokval)
                else:
                    tokens.append('<VAR>')
            else:
                tokens.append(tokval)
    except tokenize.TokenError as e:
        print(f"Tokenization error: {e}")

    return tokens
# Tokenize the sample snippet and show the result. Note that this rebinds the
# module-level `tokens`, which earlier cells used for the natural-language corpus.
tokens = preprocess_code(code_sample)
print(tokens)


['utf-8', '\n', 'def', '<VAR>', '(', '<VAR>', ')', ':', '\n', '\n', '    ', 'if', '<VAR>', '==', '<NUM>', ':', '\n', '        ', 'return', '<NUM>', '\n', '', 'else', ':', '\n', '        ', 'return', '<VAR>', '*', '<VAR>', '(', '<VAR>', '-', '<NUM>', ')', '', '', '']
