In [1]:
import os
import itertools
import re
import time

import pandas as pd
import numpy as np
import networkx as nx

import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# -------------------------------------------------------------
# 0. Download VADER lexicon (once per session)
# -------------------------------------------------------------
nltk.download("vader_lexicon")

sia = SentimentIntensityAnalyzer()

# -------------------------------------------------------------
# 1. Load dataset
# -------------------------------------------------------------
df = pd.read_csv("df_ungrouped_with_sqldate_clean_2025-11-12_1546 1.csv")

# Group by TEXT_FILE
groups = list(df.groupby("TEXT_FILE"))
groups = sorted(groups, key=lambda r: r[1].shape[0], reverse=True)

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [5]:
# -------------------------------------------------------------
# 2. Path to the real text files
# -------------------------------------------------------------
TEXTS_FOLDER = "/content/drive/MyDrive/NLPproject/output_processed/texts"

# -------------------------------------------------------------
# 3. Dictionary to store final results
# -------------------------------------------------------------
entity_pair_stats = {}

In [6]:
# -------------------------------------------------------------
# Helper: fast sentence splitting using regex
# -------------------------------------------------------------
def fast_sentence_split(text):
    sentences = re.split(r"(?<=[.!?])\s+", text)
    return [s.strip() for s in sentences if s.strip()]

In [10]:
# -------------------------------------------------------------
# 4. Main loop (test first with only a few files)
# -------------------------------------------------------------

start_all = time.time()

for idx, (text_file, rows) in enumerate(groups):
    print(f"\nProcessing file {idx+1}/{len(groups)}: {text_file}")
    t_file_start = time.time()

    file_path_full = os.path.join(TEXTS_FOLDER, text_file)

    if not os.path.exists(file_path_full):
        print("  File not found:", file_path_full)
        continue

    # ----- Read file -----
    with open(file_path_full, "r", encoding="utf-8") as f:
        text = f.read()

    # ----- Fast sentence split -----
    sentences = fast_sentence_split(text)

    # ----- Compute sentiment once per sentence -----
    sentence_sentiment = []
    for s in sentences:
        score = sia.polarity_scores(s)["compound"]
        sentence_sentiment.append(score)

    # ----- Extract entity/mention pairs for this article -----
    labels = list(rows["entity"].values)
    mentions = list(rows["entity_original"].values)

    article_entities = [
        {"entity": e, "mention": m}
        for e, m in zip(labels, mentions)
    ]

    # OPTIONAL: prevent huge O(n^2) blow-up
    MAX_ENTITIES_PER_FILE = 40
    if len(article_entities) > MAX_ENTITIES_PER_FILE:
        article_entities = article_entities[:MAX_ENTITIES_PER_FILE]

    # ----- For each sentence, store which entities appear -----
    entity_to_sent_idxs = {e["entity"]: set() for e in article_entities}

    for i, s in enumerate(sentences):
        for e in article_entities:
            if e["mention"] in s:
                entity_to_sent_idxs[e["entity"]].add(i)

    # Precompute counts for co-occurrence using ONLY this file
    counts_per_entity = rows.set_index("entity")["count"].to_dict()

    # ----- For each entity pair, find common sentences -----
    for e1, e2 in itertools.combinations(article_entities, 2):
        ent1 = e1["entity"]
        ent2 = e2["entity"]

        sent_idxs1 = entity_to_sent_idxs.get(ent1, set())
        sent_idxs2 = entity_to_sent_idxs.get(ent2, set())
        common_sent_idxs = sent_idxs1 & sent_idxs2

        if not common_sent_idxs:
            continue

        # co-occurrence = min count in this document
        c1 = counts_per_entity.get(ent1, 0)
        c2 = counts_per_entity.get(ent2, 0)
        co_occurrence = min(c1, c2)

        # collect sentiments
        sentiments = [sentence_sentiment[i] for i in common_sent_idxs]

        entity_key = tuple(sorted([ent1, ent2]))

        if entity_key not in entity_pair_stats:
            entity_pair_stats[entity_key] = {
                "sentiment": [],
                "co_occurrence": co_occurrence,
            }

        entity_pair_stats[entity_key]["sentiment"].extend(sentiments)

    t_file_end = time.time()
    print("  Time for this file:", round(t_file_end - t_file_start, 2), "seconds")

end_all = time.time()
print("\nTotal time:", round(end_all - start_all, 2), "seconds")
print("Avg per file:", round((end_all - start_all) / max(1, idx+1), 2), "seconds")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Processing file 22550/24215: 021554_www_startribune_com_tesla-reports-11-sales-drop-for-2024-first-annual-decline-in-a-decade_601201182.txt
  Time for this file: 0.29 seconds

Processing file 22551/24215: 021555_www_startribune_com_tolkkinen-minnesota-benefactor-runs-out-of-funds-and-energy-in-quest-to-help-afghan-families-who-onc.txt
  Time for this file: 0.29 seconds

Processing file 22552/24215: 021557_www_statecollege_com_articles_political-news_what-trumps-solar-crackdown-could-mean-for-pennsylvania-energy-bills.txt
  Time for this file: 0.26 seconds

Processing file 22553/24215: 021560_www_statehouse_gov_sc_news_6338_president-ramkalawan-shares-seychelles-perspective-on-harnessing-blue-economy--marine-co.txt
  Time for this file: 0.26 seconds

Processing file 22554/24215: 021561_www_statesman_com_business_article_starbase-elon-musk-spacex-texas-beach-closures-21079306_php.txt
  Time for this file: 0.27 seconds

Proc

In [11]:
# -------------------------------------------------------------
# 5. Build entity graph
# -------------------------------------------------------------
G = nx.Graph()

for entity_key, entry in entity_pair_stats.items():
    ent1, ent2 = entity_key
    sentiments = entry["sentiment"]
    co_occurrence = entry["co_occurrence"]

    G.add_edge(
        ent1,
        ent2,
        avg_sentiment=float(np.mean(sentiments)),
        std_sentiment=float(np.std(sentiments)),
        co_occurrence=co_occurrence,
        n_sentences=len(sentiments),
    )

In [12]:
# -------------------------------------------------------------
# 6. Convert results to DataFrame
# -------------------------------------------------------------
rows_out = []

for (ent1, ent2), entry in entity_pair_stats.items():
    sentiments = entry["sentiment"]
    rows_out.append({
        "entity_1": ent1,
        "entity_2": ent2,
        "avg_sentiment": float(np.mean(sentiments)),
        "std_sentiment": float(np.std(sentiments)),
        "co_occurrence": entry["co_occurrence"],
        "n_sentences": len(sentiments),
    })

entity_pairs_df = pd.DataFrame(rows_out)

print("\nPreview:")
entity_pairs_df.head()


Preview:


Unnamed: 0,entity_1,entity_2,avg_sentiment,std_sentiment,co_occurrence,n_sentences
0,South African Government,the US,0.994793,0.001567476,21,54
1,Ministry for Economic Affairs,the US,0.9951,1.110223e-16,45,25
2,the People's Republic of China,the US,0.9951,1.110223e-16,20,23
3,the Russian Federation's,the US,0.9951,1.110223e-16,7,23
4,The Embassy of,the US,0.9951,0.0,15,38


In [13]:
output_path = "/content/drive/MyDrive/NLPproject//output_processed/entity_pairs_results.csv"
entity_pairs_df.to_csv(output_path, index=False)

print("Saved to:", output_path)


Saved to: /content/drive/MyDrive/NLPproject//output_processed/entity_pairs_results.csv
