# Sentiment Analysis

Monthly publisher sentiment, scored with the positive and negative word lists of the Loughran–McDonald Master Dictionary.

In [None]:
import pandas as pd
# Load the publishers table from the processed-data folder.
# Its 'publication' column supplies the publisher names iterated over below.
publishers = pd.read_csv("../data/processed/publishers.csv") 

In [None]:
# --- Monthly Composite Sentiment for all publishers using spaCy ---

import pandas as pd
import spacy
import re

# 1) spaCy pipeline, loaded with the parser and NER components disabled
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

# 2) Loughran–McDonald master dictionary: collect the lower-cased words whose
#    Positive / Negative column is greater than zero
lmd = pd.read_csv("../data/raw/Loughran-McDonald_MasterDictionary_1993-2024.csv")
pos_words = set(lmd["Word"][lmd["Positive"] > 0].str.lower())
neg_words = set(lmd["Word"][lmd["Negative"] > 0].str.lower())

# 3) helper to count pos/neg tokens
def count_sentiment(text):
    """Count positive and negative dictionary hits in *text*.

    The input is coerced with ``str`` and tokenised by the module-level
    spaCy pipeline ``nlp``. Only alphabetic tokens (``is_alpha``) are
    considered; each one is lower-cased and looked up in ``pos_words`` and
    ``neg_words``. A word present in both sets is counted in both totals.

    Returns a ``pd.Series`` with integer entries ``"pos"`` and ``"neg"``.
    """
    words = [token.text.lower() for token in nlp(str(text)) if token.is_alpha]
    n_pos = sum(1 for word in words if word in pos_words)
    n_neg = sum(1 for word in words if word in neg_words)
    return pd.Series({"pos": n_pos, "neg": n_neg})


In [None]:
# 4) loop over publisher samples and compute monthly sentiment
#
# For every publisher: read its sample CSV, count dictionary-positive and
# dictionary-negative tokens per article, sum the counts per calendar month
# and turn them into a net-tone score (pos - neg) / (pos + neg).
monthly_records = []
for pub in publishers['publication']:
    # file-name slug: lower-case, runs of non-word characters collapsed to '_'
    safe = re.sub(r'\W+', '_', pub.lower()).strip('_')
    sample = pd.read_csv(f"../data/processed/newspapers/sample_{safe}.csv")

    # Parse dates explicitly instead of via read_csv(parse_dates=...): when a
    # value cannot be parsed, read_csv returns the column unaltered as object
    # dtype, so the dropna below would keep the bad rows and resample would
    # fail. errors='coerce' turns unparsable values into NaT so they are dropped.
    sample['date'] = pd.to_datetime(sample['date'], errors='coerce')
    sample = sample.dropna(subset=['date'])
    if sample.empty:
        # nothing to aggregate for this publisher (and an empty count table
        # could not be assigned to the two count columns)
        continue
    texts = sample['article'].astype(str).tolist()

    # batch-process spaCy; one pass over each document's alphabetic tokens
    sent_counts = []
    for doc in nlp.pipe(texts, batch_size=500):
        words = [tok.text.lower() for tok in doc if tok.is_alpha]
        sent_counts.append((
            sum(1 for w in words if w in pos_words),
            sum(1 for w in words if w in neg_words),
        ))

    # keep the sample's index so the counts line up row by row
    sample[['pos', 'neg']] = pd.DataFrame(
        sent_counts, index=sample.index, columns=['pos', 'neg']
    )

    agg = (
        sample
          .set_index('date')
          # use 'ME' (month end) instead of deprecated 'M'
          .resample('ME')
          .agg({'pos': 'sum', 'neg': 'sum'})
          .assign(
            # the 1e-9 term avoids division by zero; a month without any
            # matched word therefore scores 0
            sentiment=lambda df: (df.pos - df.neg)/(df.pos + df.neg + 1e-9),
            publication=pub
          )
          .reset_index()[['date', 'publication', 'sentiment']]
    )
    monthly_records.append(agg)

df_monthly_all = pd.concat(monthly_records, ignore_index=True)
# save df_monthly_all as csv
df_monthly_all.to_csv("../data/processed/monthly_sentiment_all.csv", index=False)
display(df_monthly_all)

In [None]:
# Plot monthly sentiment for all publishers in one figure
import matplotlib.pyplot as plt

# One line per publisher on a shared axis; the legend sits outside the plot area
fig, ax = plt.subplots(figsize=(12, 6))
for pub, grp in df_monthly_all.groupby('publication'):
    ax.plot(grp['date'], grp['sentiment'], marker='o', linestyle='-', label=pub)

ax.set_title('Monthly Composite Sentiment for All Publishers')
ax.set_xlabel('Month')
ax.set_ylabel('Sentiment Score')
ax.legend(title='Publisher', bbox_to_anchor=(1.05, 1), loc='upper left')
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# Plot monthly sentiment for all publishers: grey lines + black average line

fig, ax = plt.subplots(figsize=(12, 6))

# individual publishers as thin, semi-transparent grey lines (no legend entry)
for _, grp in df_monthly_all.groupby('publication'):
    ax.plot(grp['date'], grp['sentiment'], color='grey', alpha=0.5, linewidth=1)

# mean sentiment across publishers for each month, drawn on top in black
avg = df_monthly_all.groupby('date')['sentiment'].mean().reset_index()
ax.plot(
    avg['date'],
    avg['sentiment'],
    color='black',
    marker='o',
    linestyle='-',
    linewidth=2,
    label='Average',
)

ax.set_title('Monthly Composite Sentiment for All Publishers')
ax.set_xlabel('Month')
ax.set_ylabel('Sentiment Score')
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()