<h1>1. Using the bigram and unigram language models trained on the training data from
Assignment 1, compute the PMI scores for all the bigrams in the validation and testing
sets created in Assignment 1.</h1>

In [None]:
import pickle
import math
import csv
from tqdm import tqdm

def load_ngram_counts(n):
    """Load the pickled n-gram count table for n-gram order ``n``.

    The table is read from ``ngram_counts_{n}gram.pkl`` in the current
    working directory.

    Args:
        n: N-gram order used to build the file name (e.g. 1 or 2).

    Returns:
        The object stored in the pickle file (presumably a mapping from
        n-gram tuples to counts, which is how PMICalculator reads it).

    Raises:
        SystemExit: With status 1 when the file does not exist.
    """
    file_name = f"ngram_counts_{n}gram.pkl"
    try:
        with open(file_name, "rb") as f:
            # NOTE(review): unpickling can execute arbitrary code; only load
            # count files produced by a trusted pipeline.
            return pickle.load(f)
    except FileNotFoundError:
        print(f"Error: The file {file_name} was not found.")
        # The bare exit() helper comes from the `site` module and, called
        # without an argument, reports success. Raise SystemExit directly so
        # the failure is visible to the caller/shell as a non-zero status.
        raise SystemExit(1) from None

class PMICalculator:
    """Score word pairs with pointwise mutual information from n-gram counts.

    Both count tables are looked up with tuple keys: ``(w,)`` for unigrams and
    ``(w1, w2)`` for bigrams.
    """

    def __init__(self, unigram_counts, bigram_counts):
        """Store the count tables and pre-compute their totals.

        Raises:
            ValueError: If either table sums to zero.
        """
        print("Initializing PMI Calculator...")
        self.unigrams = unigram_counts
        self.bigrams = bigram_counts

        # The totals are the denominators of the probability estimates.
        self.total_unigram_tokens = sum(self.unigrams.values())
        self.total_bigram_tokens = sum(self.bigrams.values())

        if 0 in (self.total_unigram_tokens, self.total_bigram_tokens):
            raise ValueError("N-gram count files must not be empty.")

        print("Initialization complete.")

    def get_pmi(self, w1, w2):
        """Return PMI(w1, w2) = log2( P(w1, w2) / (P(w1) * P(w2)) ).

        Negative infinity is returned when either word or the pair itself has
        a zero count, because the ratio is undefined in that case.
        """
        first_count = self.unigrams.get((w1,), 0)
        second_count = self.unigrams.get((w2,), 0)
        pair_count = self.bigrams.get((w1, w2), 0)

        if 0 in (first_count, second_count, pair_count):
            return -math.inf

        unigram_total = self.total_unigram_tokens
        joint = pair_count / self.total_bigram_tokens
        independent = (first_count / unigram_total) * (second_count / unigram_total)
        return math.log2(joint / independent)


def process_file(pmi_calculator, input_filename, output_filename):
    """Write a CSV of PMI scores for every adjacent word pair in a text file.

    Args:
        pmi_calculator: Object exposing ``get_pmi(w1, w2)``.
        input_filename: UTF-8 text file, one whitespace-tokenized sentence per line.
        output_filename: CSV to create, with header ``word1,word2,pmi_score``.
    """
    print(f"\nProcessing {input_filename}...")
    with open(input_filename, 'r', encoding='utf-8') as infile:
        with open(output_filename, 'w', encoding='utf-8', newline='') as outfile:
            writer = csv.writer(outfile)
            writer.writerow(['word1', 'word2', 'pmi_score'])

            progress = tqdm(infile, desc=f"Calculating PMI for {input_filename}")
            for line in progress:
                tokens = line.strip().split()
                # Pairing the tokens with themselves shifted by one yields the
                # sentence's bigrams; lines with fewer than two tokens yield none.
                for left, right in zip(tokens, tokens[1:]):
                    writer.writerow([left, right, pmi_calculator.get_pmi(left, right)])

    print(f"PMI scores saved to {output_filename}")

if __name__ == "__main__":
    # Counts learned from the training data: order 1 = unigrams, order 2 = bigrams.
    unigrams = load_ngram_counts(1)
    bigrams = load_ngram_counts(2)
    pmi_calc = PMICalculator(unigrams, bigrams)

    # Score both held-out splits with the same calculator.
    for source_file, scores_file in (
        ("validation.txt", "validation_pmi_scores.csv"),
        ("test.txt", "test_pmi_scores.csv"),
    ):
        process_file(pmi_calc, source_file, scores_file)

Initializing PMI Calculator...
Initialization complete.

Processing validation.txt...


Calculating PMI for validation.txt: 1000it [00:00, 21809.78it/s]


PMI scores saved to validation_pmi_scores.csv

Processing test.txt...


Calculating PMI for test.txt: 1000it [00:00, 24627.03it/s]

PMI scores saved to test_pmi_scores.csv





<h1>Vectorize all the sentences in the training, validation, and testing data that you tokenized
in Assignment 1 using TF-IDF. For the validation and testing data, use the IDF scores
learned from the training data.</h1>

In [1]:
# Function to read sentences from a .txt file and tokenize
def read_and_tokenize(filename):
    """Read a UTF-8 text file and return one token list per non-empty line.

    Tokens are obtained by splitting each line on whitespace; lines that
    produce no tokens are dropped.
    """
    with open(filename, 'r', encoding='utf-8') as f:
        token_lists = (line.strip().split() for line in f)
        # An empty list is falsy, which filters out blank lines.
        return [tokens for tokens in token_lists if tokens]

# Tokenize the three data splits
train_sentences, validation_sentences, test_sentences = map(
    read_and_tokenize,
    (
        "/Users/adityakumar/Desktop/college_labs/NLP/assignment 7/train.txt",
        "/Users/adityakumar/Desktop/college_labs/NLP/assignment 7/validation.txt",
        "/Users/adityakumar/Desktop/college_labs/NLP/assignment 7/test.txt",
    ),
)

# Preview the first two sentences of each split
for label, sents in (
    ("Train:", train_sentences),
    ("Validation:", validation_sentences),
    ("Test:", test_sentences),
):
    print(label, sents[:2])

Train: [['તેની', 'વિગત', 'જાણવી', 'આજે', 'પુસ્તુત', 'પણ', 'છે', 'અને', 'અનુકરણીય', 'પણ', 'આ', 'સ્થળે', '૩૫૦૦', 'એકર', 'ઉજજડ', 'જમીનમાં', '૪૦', 'હજાર', 'જેટલાં', 'નિરાશ્રિતો', 'તંબુ', 'બાંધીને', 'રહેતાં', 'હતાં.'], ['જેમા', 'એક', 'ગુટ', 'તો', 'રૃબરૃ', 'રજૂઆત', 'કરવા', 'માટે', 'છેક', 'પાવર', 'પોઈન્ટ', 'ગાંધીનગર', 'સુધી', 'લાંબુ', 'થયુ', 'હતું.']]
Validation: [['વાસ્તવિક', 'અંકુશ', 'રેખા', 'નજીક', 'આવેલા', 'પેન્ગોગ', 'ત્સો', 'એટલે', 'કે', 'સરોવર', 'સુધી', 'પહોંચવું', 'અત્યારે', 'પણ', 'ઘણું', 'મુશ્કેલ', 'છે.'], ['#1.']]
Test: [['આગળ', 'રાજીવે', 'કહ્યું', 'કે', 'મને', 'આ', 'ફિલ્મ', 'કરવાનો', 'આનંદ', 'એટલે', 'થયો', 'કારણ', 'કે', 'હું', 'અમિતાભની', 'પુજા', 'કરૂ', 'છું', 'અને', 'એનાં', 'દાયકાનાં', 'બધા', 'કલાકારોનો', 'આદર', 'કરૂ', 'છું.'], ['અને', 'કરેલાં', 'વ્યવહારો', 'કઈ', 'રીતે', 'દર્શાવવા,', 'જેથી', 'કર', 'અને', 'પેનલ્ટીમાં', 'રાહત', 'મળી', 'શકે', '?']]


In [2]:
from collections import defaultdict
import math

# Build DF (document frequency) from training data
def compute_df(sentences):
    """Return a defaultdict mapping each token to the number of sentences containing it."""
    doc_freq = defaultdict(int)
    for sentence in sentences:
        # set() collapses repeats, so a sentence adds at most 1 per token.
        for tok in set(sentence):
            doc_freq[tok] += 1
    return doc_freq

# Document frequencies and corpus size come from the training split only
train_df = compute_df(train_sentences)
N_train = len(train_sentences)

# Smoothed IDF: the +1 terms keep the ratio finite and the weight at least 1
idf = {
    word: math.log((N_train + 1) / (freq + 1)) + 1
    for word, freq in train_df.items()
}

<h1>TF-IDF for sentences</h1>

In [3]:
def compute_tf_idf(sentences, idf_dict, default_idf=None):
    """Turn tokenized sentences into per-sentence TF-IDF dictionaries.

    Term frequency is the token's count divided by the sentence length; it is
    multiplied by the token's weight from ``idf_dict``.

    Args:
        sentences: Iterable of token lists.
        idf_dict: Mapping from token to IDF weight.
        default_idf: IDF applied to tokens missing from ``idf_dict``. When
            omitted it is ``log(N_train + 1) + 1``, computed from the
            module-level ``N_train``. That is the weight the smoothed IDF
            formula gives a token with document frequency 0, i.e. the
            largest possible IDF, not the smallest.

    Returns:
        A list with one ``{token: tf-idf}`` dict per sentence, in input order.
    """
    if default_idf is None:
        # Computed once instead of once per token lookup.
        default_idf = math.log((N_train + 1) / 1) + 1

    tfidf_vectors = []
    for sent in sentences:
        counts = defaultdict(int)
        for token in sent:
            counts[token] += 1
        length = len(sent)
        tfidf_vectors.append({
            word: (count / length) * idf_dict.get(word, default_idf)
            for word, count in counts.items()
        })
    return tfidf_vectors

In [4]:
# Every split is weighted with the same IDF table
train_tfidf, validation_tfidf, test_tfidf = (
    compute_tf_idf(split_sentences, idf)
    for split_sentences in (train_sentences, validation_sentences, test_sentences)
)

# Example: print TF-IDF of first training sentence
print(train_tfidf[0])

{'તેની': 0.2081278205151952, 'વિગત': 0.32838807423750926, 'જાણવી': 0.4600118240628153, 'આજે': 0.2128292066461221, 'પુસ્તુત': 0.7145329310436339, 'પણ': 0.2712827450057881, 'છે': 0.12962369091970113, 'અને': 0.1059952708832195, 'અનુકરણીય': 0.497093247790335, 'આ': 0.12048087663425744, 'સ્થળે': 0.2997364744699214, '૩૫૦૦': 0.4729883306653107, 'એકર': 0.3906506943568616, 'ઉજજડ': 0.60359177158424, 'જમીનમાં': 0.35845603796074227, '૪૦': 0.33521745364434646, 'હજાર': 0.26778155010653365, 'જેટલાં': 0.39435915068008837, 'નિરાશ્રિતો': 0.6016975059977085, 'તંબુ': 0.46555357572991507, 'બાંધીને': 0.39809338997822113, 'રહેતાં': 0.3475133977362111, 'હતાં.': 0.26507229153366646}


<h1>Convert TF-IDF dictionaries to aligned vectors</h1>

In [5]:
from scipy.sparse import lil_matrix, save_npz, load_npz

def tfidf_dicts_to_sparse(tfidf_list, vocab):
    """
    Build a sparse LIL matrix from a list of TF-IDF dictionaries.

    Row ``i`` holds the weights of ``tfidf_list[i]``. A word's column is its
    position when iterating ``vocab``; words absent from ``vocab`` are skipped.
    """
    shape = (len(tfidf_list), len(vocab))
    matrix = lil_matrix(shape, dtype=float)
    column_of = dict(zip(vocab, range(len(vocab))))

    for row, weights in enumerate(tfidf_list):
        for word, value in weights.items():
            col = column_of.get(word)
            if col is None:
                # Out-of-vocabulary word: no column to write to.
                continue
            matrix[row, col] = value
    return matrix

In [7]:
# ------------------------------
# Build the token -> column index vocabulary from the training TF-IDF data
# ------------------------------

vocab = {}

for doc in train_tfidf:
    for word in doc:
        # setdefault inserts only unseen tokens, so each new token receives
        # the next free index (the current vocabulary size).
        vocab.setdefault(word, len(vocab))

print(f"✅ Vocabulary built successfully with {len(vocab)} unique tokens.")

✅ Vocabulary built successfully with 6226458 unique tokens.


In [8]:
# Build each matrix in LIL form, then convert it to CSR
train_sparse, validation_sparse, test_sparse = (
    tfidf_dicts_to_sparse(split_tfidf, vocab).tocsr()
    for split_tfidf in (train_tfidf, validation_tfidf, test_tfidf)
)

In [9]:
# Persist the three sparse matrices to disk
for _path, _matrix in (
    ("train_tfidf.npz", train_sparse),
    ("validation_tfidf.npz", validation_sparse),
    ("test_tfidf.npz", test_sparse),
):
    save_npz(_path, _matrix)

# To load later (this rebinds the two names to the copies read back from disk):
validation_sparse = load_npz("validation_tfidf.npz")
test_sparse = load_npz("test_tfidf.npz")

In [10]:
from scipy.sparse import load_npz

# Locations of the saved sparse TF-IDF matrices
train_file, validation_file, test_file = (
    "train_tfidf.npz",
    "validation_tfidf.npz",
    "test_tfidf.npz",
)

try:
    # Read the sparse matrices back from the .npz files
    train_vectors = load_npz(train_file)
    validation_vectors = load_npz(validation_file)
    test_vectors = load_npz(test_file)
except FileNotFoundError as e:
    print("❌ Error: Could not find the file.")
    print(f"Details: {e}")
    print("Please make sure the .npz files are in the same directory as this script.")
else:
    # Shapes confirm that every split shares the same number of columns
    print("✅ Files loaded successfully.")
    print(f"Train vectors shape:      {train_vectors.shape}")
    print(f"Validation vectors shape: {validation_vectors.shape}")
    print(f"Test vectors shape:       {test_vectors.shape}")

    # Printing one row shows its sparse form (column indices and TF-IDF values)
    print("\n--- First Training Vector (Sparse Format) ---")
    print(train_vectors[0])

✅ Files loaded successfully.
Train vectors shape:      (30935118, 6226458)
Validation vectors shape: (1000, 6226458)
Test vectors shape:       (1000, 6226458)

--- First Training Vector (Sparse Format) ---
<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 23 stored elements and shape (1, 6226458)>
  Coords	Values
  (0, 0)	0.2081278205151952
  (0, 1)	0.32838807423750926
  (0, 2)	0.4600118240628153
  (0, 3)	0.2128292066461221
  (0, 4)	0.7145329310436339
  (0, 5)	0.2712827450057881
  (0, 6)	0.12962369091970113
  (0, 7)	0.1059952708832195
  (0, 8)	0.497093247790335
  (0, 9)	0.12048087663425744
  (0, 10)	0.2997364744699214
  (0, 11)	0.4729883306653107
  (0, 12)	0.3906506943568616
  (0, 13)	0.60359177158424
  (0, 14)	0.35845603796074227
  (0, 15)	0.33521745364434646
  (0, 16)	0.26778155010653365
  (0, 17)	0.39435915068008837
  (0, 18)	0.6016975059977085
  (0, 19)	0.46555357572991507
  (0, 20)	0.39809338997822113
  (0, 21)	0.3475133977362111
  (0, 22)	0.26507229153366646


<h1>For each sentence in the validation and testing sets, find its nearest neighbor within the
same set using TF-IDF vectors.</h1>

In [None]:
import pickle
from scipy.sparse import load_npz
from sklearn.neighbors import NearestNeighbors

def load_sentences(filepath):
    """Return the non-blank lines of a UTF-8 file, stripped of surrounding whitespace."""
    with open(filepath, 'r', encoding='utf-8') as f:
        stripped_lines = (line.strip() for line in f)
        # Empty strings are falsy, so blank lines are filtered out.
        return [text for text in stripped_lines if text]

def _print_neighbor_examples(vectors, sentences, set_name, num_examples=5):
    """Print the first few sentences of one set beside their nearest neighbor.

    Args:
        vectors: Sparse TF-IDF matrix; row i is assumed to belong to
            ``sentences[i]`` (TODO confirm against the vectorizer script).
        sentences: The sentences of the same set, as strings.
        set_name: Name used in the printed heading (e.g. "validation").
        num_examples: Maximum number of sentences to show.
    """
    # 'cosine' metric is used for TF-IDF. The algorithm will find vectors with the smallest angle.
    nn_model = NearestNeighbors(n_neighbors=1, metric='cosine', algorithm='brute')
    nn_model.fit(vectors)

    # Calling kneighbors() without a query returns the neighbors of the fitted
    # points and never counts a point as its own neighbor. Querying with the
    # same matrix and taking column 1 is wrong whenever the point itself is not
    # ranked first (duplicate sentences, all-zero vectors).
    distances, indices = nn_model.kneighbors()

    # Never index past the end of a short sentence list.
    shown = min(num_examples, len(sentences))
    print(f"\nShowing {shown} example sentences and their nearest neighbors from the {set_name} set:\n")
    for i in range(shown):
        neighbor_index = indices[i][0]
        # The model returns cosine distance. Similarity = 1 - distance.
        similarity_score = 1 - distances[i][0]

        print(f"Original Sentence ({i}):\n  '{sentences[i]}'")
        print(f"Nearest Neighbor (sentence {neighbor_index}):\n  '{sentences[neighbor_index]}'")
        print(f"Cosine Similarity: {similarity_score:.4f}\n" + "-"*50)


if __name__ == "__main__":
    print("--- Finding Nearest Neighbors using TF-IDF Vectors ---")

    try:
        # Load everything up front so a missing file is reported before any work is done.
        print("\nLoading TF-IDF vectors and original sentences...")
        validation_vectors = load_npz("tfidf_validation_vectors.npz")
        test_vectors = load_npz("tfidf_test_vectors.npz")

        validation_sentences = load_sentences("validation.txt")
        test_sentences = load_sentences("test.txt")

        print("\n--- Processing Validation Set ---")
        _print_neighbor_examples(validation_vectors, validation_sentences, "validation")

        print("\n\n--- Processing Test Set ---")
        _print_neighbor_examples(test_vectors, test_sentences, "test")

    except FileNotFoundError as e:
        print(f"\nError: {e}.")
        print("Please make sure you have successfully run the 'tfidf_vectorizer.py' script first,")
        print("and that the .npz and .txt files are in the same directory.")


--- Finding Nearest Neighbors using TF-IDF Vectors ---

Loading TF-IDF vectors and original sentences...

--- Processing Validation Set ---

Showing 5 example sentences and their nearest neighbors from the validation set:

Original Sentence (0):
  'વાસ્તવિક અંકુશ રેખા નજીક આવેલા પેન્ગોગ ત્સો એટલે કે સરોવર સુધી પહોંચવું અત્યારે પણ ઘણું મુશ્કેલ છે.'
Nearest Neighbor (sentence 750):
  'અરુણાચલ પ્રદેશમાં ભારત અને ચીન વાસ્તવિક નિયંત્રણ રેખા પર સેનાના જવાનો સામસામે આવી ગયા હતા.'
Cosine Similarity: 0.1514
--------------------------------------------------
Original Sentence (1):
  '#1.'
Nearest Neighbor (sentence 0):
  'વાસ્તવિક અંકુશ રેખા નજીક આવેલા પેન્ગોગ ત્સો એટલે કે સરોવર સુધી પહોંચવું અત્યારે પણ ઘણું મુશ્કેલ છે.'
Cosine Similarity: 0.0000
--------------------------------------------------
Original Sentence (2):
  'અને બીજાં નવા નિશાળિયા.'
Nearest Neighbor (sentence 239):
  'આઈપીએલ-14 અગાઉ ટીમો નવા ખેલાડીઓ ખરીદવા માટે આતુર છે.'
Cosine Similarity: 0.1090
-----------------------------------

In [14]:
import pickle
import csv
from scipy.sparse import load_npz
from sklearn.neighbors import NearestNeighbors
from tqdm import tqdm

# ----------------- Helper Functions -----------------

def load_sentences(filepath, limit=None):
    """Read the non-blank lines of a UTF-8 file, optionally capped at ``limit``.

    Args:
        filepath: Path of the text file, one sentence per line.
        limit: Maximum number of sentences to return; ``None`` means no cap.
            The cap counts returned sentences, not raw lines, so blank lines
            never reduce the number of sentences that come back.

    Returns:
        A list of the stripped, non-empty lines in file order.
    """
    sentences = []
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            # Compare against None explicitly so that limit=0 means "none".
            if limit is not None and len(sentences) >= limit:
                break
            text = line.strip()
            if text:
                sentences.append(text)
    return sentences

# ----------------- Main Execution -----------------

def _save_nearest_neighbors(vectors, sentences, output_filename, what, progress_label):
    """Write every sentence, its nearest other sentence and their similarity to a CSV.

    Args:
        vectors: Sparse TF-IDF matrix; row i is assumed to belong to
            ``sentences[i]`` (TODO confirm against the vectorizer script).
        sentences: The sentences of the same set, as strings.
        output_filename: CSV file to create.
        what: Short description used in the "Saving all ..." status line.
        progress_label: Caption of the progress bar.

    Raises:
        ValueError: If the number of vector rows and sentences differ, since
            the neighbor indices could then not be mapped back to text.
    """
    if vectors.shape[0] != len(sentences):
        raise ValueError(
            f"{vectors.shape[0]} vectors but {len(sentences)} sentences for {output_filename}"
        )

    nn_model = NearestNeighbors(n_neighbors=1, metric='cosine', algorithm='brute')
    nn_model.fit(vectors)
    # Calling kneighbors() without a query returns the neighbors of the fitted
    # points and never counts a point as its own neighbor. Querying with the
    # same matrix and taking column 1 is wrong whenever the point itself is not
    # ranked first (duplicate sentences, all-zero vectors).
    distances, indices = nn_model.kneighbors()

    with open(output_filename, 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['original_sentence', 'nearest_neighbor', 'cosine_similarity'])

        print(f"Saving all {what} neighbors to {output_filename}...")
        rows = zip(sentences, indices[:, 0], distances[:, 0])
        for sentence, neighbor_index, distance in tqdm(rows, total=len(sentences), desc=progress_label):
            # The model returns cosine distance. Similarity = 1 - distance.
            writer.writerow([sentence, sentences[neighbor_index], f"{1 - distance:.4f}"])


if __name__ == "__main__":
    print("--- Finding Nearest Neighbors using TF-IDF Vectors ---")

    try:
        # --- 1. Load the TF-IDF vectors and the original text sentences ---
        print("\nLoading TF-IDF vectors and original sentences...")
        train_vectors_full = load_npz("tfidf_train_vectors.npz")
        validation_vectors = load_npz("tfidf_validation_vectors.npz")
        test_vectors = load_npz("tfidf_test_vectors.npz")

        # We will process a subset of the training data of up to 10,000 lines
        NUM_TRAIN_SAMPLES = 10000
        train_sentences_subset = load_sentences("train.txt", limit=NUM_TRAIN_SAMPLES)
        # Take exactly one vector per loaded sentence. load_sentences drops
        # blank lines, so slicing by NUM_TRAIN_SAMPLES could leave more rows
        # than sentences and produce neighbor indices with no matching text.
        train_vectors_subset = train_vectors_full[:len(train_sentences_subset)]

        validation_sentences = load_sentences("validation.txt")
        test_sentences = load_sentences("test.txt")

        # --- 2. Find Nearest Neighbors in the Training Subset ---
        print("\n--- Processing Training Set Subset ---")
        _save_nearest_neighbors(
            train_vectors_subset,
            train_sentences_subset,
            "train_subset_nearest_neighbors.csv",
            f"{len(train_sentences_subset)} training subset",
            "Training Subset Neighbors",
        )

        # --- 3. Find Nearest Neighbors in the Validation Set ---
        print("\n--- Processing Validation Set ---")
        _save_nearest_neighbors(
            validation_vectors,
            validation_sentences,
            "validation_nearest_neighbors.csv",
            "validation",
            "Validation Neighbors",
        )

        # --- 4. Find Nearest Neighbors in the Test Set ---
        print("\n\n--- Processing Test Set ---")
        _save_nearest_neighbors(
            test_vectors,
            test_sentences,
            "test_nearest_neighbors.csv",
            "test",
            "Test Neighbors",
        )

    except FileNotFoundError as e:
        print(f"\nError: {e}.")
        print("Please make sure you have successfully run the 'tfidf_vectorizer.py' script first,")
        print("and that the .npz and .txt files are in the same directory.")

--- Finding Nearest Neighbors using TF-IDF Vectors ---

Loading TF-IDF vectors and original sentences...



--- Processing Training Set Subset ---
Saving all 10000 training subset neighbors to train_subset_nearest_neighbors.csv...


Training Subset Neighbors: 100%|██████████| 10000/10000 [00:00<00:00, 230351.21it/s]



--- Processing Validation Set ---
Saving all validation neighbors to validation_nearest_neighbors.csv...


Validation Neighbors: 100%|██████████| 1000/1000 [00:00<00:00, 214740.12it/s]




--- Processing Test Set ---
Saving all test neighbors to test_nearest_neighbors.csv...


Test Neighbors: 100%|██████████| 1000/1000 [00:00<00:00, 175839.68it/s]
