In [9]:
import pandas as pd
import math
from collections import Counter

# --------------------------
# 1. Load unigram and bigram CSVs
# --------------------------

def load_ngrams_csv(path):
    """Load an n-gram count table from a CSV file.

    The file is expected to contain an ``Ngram`` column (text) and a
    ``Count`` column (integer-convertible).

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file; handed directly to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed table with ``Ngram`` cast to ``str`` and ``Count`` cast
        to ``int``.

    Raises
    ------
    KeyError
        If the ``Ngram`` or ``Count`` column is missing.
    ValueError
        If a ``Count`` value cannot be converted to an integer.
    """
    # N-grams are arbitrary corpus tokens, so they must be read verbatim:
    #  * keep_default_na=False stops pandas from turning tokens such as
    #    "NA", "null", "nan" or "None" into NaN (which astype(str) would then
    #    collapse into the single string "nan").
    #  * dtype=str stops digit-only tokens from being parsed as numbers
    #    (e.g. "007" silently becoming "7").
    df = pd.read_csv(path, dtype={"Ngram": str}, keep_default_na=False)
    df["Ngram"] = df["Ngram"].astype(str)
    df["Count"] = df["Count"].astype(int)
    return df

# Input tables for the PMI computation, read through load_ngrams_csv.
# NOTE(review): both paths are absolute and tied to one user's machine; the
# notebook will not run elsewhere until they are pointed at local copies.
UNIGRAM_CSV = "C:/Users/ashis/OneDrive/Desktop/NLP/Lab4/unigram.csv"
BIGRAM_CSV = "C:/Users/ashis/OneDrive/Desktop/NLP/Lab4/bigram.csv"

unigram_df = load_ngrams_csv(UNIGRAM_CSV)
bigram_df = load_ngrams_csv(BIGRAM_CSV)


In [10]:

# --------------------------
# 2. Build unigram and bigram counters
# --------------------------
# Counts are accumulated with `+=` so that rows whose n-grams become identical
# after whitespace normalisation are summed; building the Counter from a dict
# comprehension would silently keep only the last such row.
# zip() over the two columns avoids the per-row Series that iterrows() builds.
unigrams = Counter()
for ngram, count in zip(unigram_df["Ngram"], unigram_df["Count"]):
    unigrams[ngram.strip()] += int(count)

bigrams = Counter()
skipped_bigram_rows = 0
for ngram, count in zip(bigram_df["Ngram"], bigram_df["Count"]):
    # split() with no argument already ignores leading/trailing whitespace.
    tokens = tuple(ngram.split())
    if len(tokens) != 2:
        # The PMI loop unpacks every key as (w1, w2); a key with any other
        # number of tokens would raise there, so such rows are left out.
        skipped_bigram_rows += 1
        continue
    bigrams[tokens] += int(count)

if skipped_bigram_rows:
    print(f"Skipped {skipped_bigram_rows} bigram rows that did not split into exactly two tokens")

# --------------------------
# 3. Compute PMI
# --------------------------
# PMI(w1, w2) = log2( P(w1, w2) / (P(w1) * P(w2)) )
# P(w) is count(w) / total unigram count; P(w1, w2) is count(w1, w2) / total
# bigram count.
total_unigrams = sum(unigrams.values())
total_bigrams  = sum(bigrams.values())

# A non-positive total (e.g. every count is zero) would make the division
# fail; in that case there are simply no probabilities to work with.
p_unigram = {w: c / total_unigrams for w, c in unigrams.items()} if total_unigrams > 0 else {}
p_bigram  = {bg: c / total_bigrams for bg, c in bigrams.items()} if total_bigrams > 0 else {}

pmi_scores = {}
for bg, p_xy in p_bigram.items():
    # Only well-formed two-token keys can be scored.
    if len(bg) != 2:
        continue
    w1, w2 = bg
    # A word missing from the unigram table gets probability 0 and is skipped.
    p_x = p_unigram.get(w1, 0.0)
    p_y = p_unigram.get(w2, 0.0)
    # log2 is undefined at 0, so all three probabilities must be positive
    # (a bigram row with Count == 0 would otherwise raise a math domain error).
    if p_xy > 0 and p_x > 0 and p_y > 0:
        pmi_scores[(w1, w2)] = math.log2(p_xy / (p_x * p_y))

# --------------------------
# 4. Export and display
# --------------------------
# Order the scored bigrams from highest to lowest PMI. sorted() is stable, so
# bigrams with equal scores keep the order in which they were scored.
ranked_scores = sorted(pmi_scores.items(), key=lambda entry: entry[1], reverse=True)

# One record per bigram; the two words are joined back into a single string.
pmi_records = []
for (w1, w2), pmi in ranked_scores:
    pmi_records.append({"Bigram": f"{w1} {w2}", "PMI": pmi})
pmi_df = pd.DataFrame(pmi_records)

print(f"\n‚úÖ Computed {len(pmi_df)} PMI scores")
# Show both ends of the ranking: strongest and weakest associations.
for heading, preview in (
    ("\nTop 10 Bigram PMI Scores:", pmi_df.head(10)),
    ("\nBottom 10 Bigram PMI Scores:", pmi_df.tail(10)),
):
    print(heading)
    print(preview)

# Written to the kernel's current working directory, without the index column.
pmi_df.to_csv("bigram_PMI_scores.csv", index=False)
print("\nüíæ Saved to bigram_PMI_scores.csv")



‚úÖ Computed 200000 PMI scores

Top 10 Bigram PMI Scores:
                Bigram        PMI
0  ‡§Ø‡•Ç‡§ï‡•ç‡§∞‡•á‡§®‡§ø‡§Ø‡§® ‡§∞‡§ø‡§µ‡•ç‡§®‡§ø‡§Ø‡§æ  21.321390
1       Sourav Ganguly  21.321390
2                  üè£ üè£  21.195859
3                  üé† üé†  21.195859
4                  üî• üî•  21.195859
5                  ü´ê ü´ê  21.195859
6       Ajinkya Rahane  21.098997
7         Saina Nehwal  21.098997
8      Ujjawal Prabhat  21.098997
9             Bird Flu  21.080381

Bottom 10 Bigram PMI Scores:
          Bigram        PMI
199990     ‡§ï‡•Ä ‡§∏‡•á  -8.692190
199991     ‡§ï‡•ã ‡§ï‡•á  -8.705414
199992    <s> ‡§π‡•à  -9.065262
199993    ‡§ï‡•á ‡§Æ‡•á‡§Ç  -9.075555
199994  <s> ‡§ï‡§∞‡§®‡•á  -9.180919
199995   <s> ‡§π‡•à‡•§  -9.193039
199996    <s> ‡§≠‡•Ä  -9.237039
199997    ‡§ï‡•Ä ‡§Æ‡•á‡§Ç  -9.237284
199998     <s> ,  -9.959042
199999   <s> ‡§≤‡§ø‡§è -10.096298

üíæ Saved to bigram_PMI_scores.csv
