In [None]:
import nltk
# Lowercase every document, then split it into word tokens with NLTK.
df["tokens"] = df["full_text"].map(lambda doc: nltk.word_tokenize(doc.lower()))

In [None]:
import re

def contains_keyword(text, keyword_list):
    """Return True when any keyword occurs in ``text`` as a whole word/phrase.

    Matching is case-insensitive (both sides are lowercased) and each keyword
    is treated literally, never as a regular expression.

    Args:
        text: The document to search (a string).
        keyword_list: Iterable of keyword strings; multi-word phrases are allowed.

    Returns:
        bool: True if at least one non-empty keyword is found, else False.
    """
    haystack = text.lower()
    for keyword in keyword_list:
        needle = keyword.lower()
        if not needle:
            # An empty keyword carries no information; without this guard it
            # would "match" practically every document.
            continue
        # Use lookarounds instead of \b: \b needs a word character on one side,
        # so keywords that start or end with punctuation ("c++", "u.s.") would
        # never match when followed by a space. For keywords made of word
        # characters at both ends the two forms are equivalent.
        pattern = r'(?<!\w)' + re.escape(needle) + r'(?!\w)'
        if re.search(pattern, haystack):
            return True
    return False

# Build a 0/1 indicator column for every frame: 1 when at least one of the
# frame's keywords occurs in the document's full text, otherwise 0.
for frame_name, keyword_list in frames.items():
    df[frame_name] = df['full_text'].map(
        lambda doc: int(contains_keyword(doc, keyword_list))
    )

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a dev set. The split is stratified on the
# "frame" label and uses a fixed seed so it is repeatable.
train_df, dev_df = train_test_split(
    df,
    stratify=df["frame"],
    test_size=0.2,
    random_state=42,
)

# Gold "frame" labels for each split.
train_y, dev_y = train_df["frame"], dev_df["frame"]

In [None]:
# Build unigram+bigram features (at most 3000) from the "tokens" column.
# The vectorizer is fitted on the training split and then reused, unchanged,
# to transform the dev split.
# NOTE(review): in the visible part of this notebook `ngram_feats` is only
# defined in a later cell — on a fresh kernel this cell appears to run before
# that definition; confirm the cell order.
# NOTE(review): "tokens" holds nltk.word_tokenize output (token lists), not raw
# strings — confirm that ngram_feats accepts that kind of input.
vectorizer, train_ngrams = ngram_feats(
    train_df,
    field_name="tokens",
    return_vectorizer=True,
    ngram_range=(1,2),
    max_features=3000
)

# Transform only: no re-fitting on dev data.
dev_ngrams = ngram_feats(
    dev_df,
    field_name="tokens",
    vectorizer=vectorizer,
    transform_only=True
)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Per-frame binary classifiers: for every frame, fit a logistic regression on
# the training n-gram features to predict that frame's 0/1 keyword flag, then
# score it on the dev split.
frame_models = {}    # frame name -> fitted LogisticRegression
frame_results = {}   # frame name -> {"accuracy": ..., "report": ...}

for frame_name in frames.keys():

    # LogisticRegression.fit raises ValueError when the training labels hold a
    # single class (e.g. none of the frame's keywords occurs in the training
    # split). Skip such a frame instead of aborting the loop for all frames.
    if train_df[frame_name].nunique() < 2:
        print("="*80)
        print(f"FRAME: {frame_name}")
        print("Skipped: training labels contain only one class.")
        continue

    clf = LogisticRegression(max_iter=500)
    clf.fit(train_ngrams, train_df[frame_name])

    preds = clf.predict(dev_ngrams)

    acc = accuracy_score(dev_df[frame_name], preds)
    report = classification_report(dev_df[frame_name], preds)

    frame_models[frame_name] = clf
    frame_results[frame_name] = {
        "accuracy": acc,
        "report": report
    }

    print("="*80)
    print(f"FRAME: {frame_name}")
    print("Accuracy:", acc)
    print(report)


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

def ngram_feats(df,
                field_name="full_text",
                ngram_range=(1,2),
                max_features=3000,
                return_vectorizer=False,
                vectorizer=None,
                transform_only=False):
    """Turn one column of documents into an n-gram count matrix.

    Args:
        df: Table-like object; ``df[field_name]`` must yield the documents.
        field_name: Column holding the documents. Each document may be a
            string or a list/tuple of tokens (joined with single spaces).
        ngram_range: ``(min_n, max_n)`` passed to ``CountVectorizer`` when fitting.
        max_features: Vocabulary size cap passed to ``CountVectorizer`` when fitting.
        return_vectorizer: If True, return ``(vectorizer, X)`` instead of ``X``.
        vectorizer: Already-fitted vectorizer. Only used when
            ``transform_only`` is True; otherwise a new one is fitted.
        transform_only: If True, reuse ``vectorizer`` without re-fitting.

    Returns:
        The feature matrix ``X``, or ``(vectorizer, X)`` when
        ``return_vectorizer`` is True.

    Raises:
        ValueError: If ``transform_only`` is True but no ``vectorizer`` is given.
    """
    if transform_only and vectorizer is None:
        raise ValueError("transform_only=True requires a fitted `vectorizer`")

    # CountVectorizer expects every document to be a string, so pre-tokenized
    # documents are joined back into whitespace-separated text first.
    docs = [
        " ".join(doc) if isinstance(doc, (list, tuple)) else doc
        for doc in df[field_name]
    ]

    if transform_only:
        X = vectorizer.transform(docs)
    else:
        # Fit a new vectorizer (any `vectorizer` argument is replaced here).
        vectorizer = CountVectorizer(
            ngram_range=ngram_range,
            max_features=max_features,
            stop_words="english"
        )
        X = vectorizer.fit_transform(docs)

    if return_vectorizer:
        return vectorizer, X
    return X


In [None]:
# Unigram + bigram features over the raw "full_text" column, capped at 4000.
# The vectorizer is fitted on the training split and reused as-is for dev.
# Note: this rebinds the module-level name `vectorizer`.
vectorizer, train_X = ngram_feats(
    train_df,
    return_vectorizer=True,
    field_name="full_text",
    max_features=4000,
    ngram_range=(1,2),
)

dev_X = ngram_feats(dev_df, field_name="full_text", transform_only=True, vectorizer=vectorizer)


In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression over the full-text n-gram features, predicting the
# "frame" label. Note: this rebinds the module-level name `clf`.
clf = LogisticRegression(class_weight="balanced", max_iter=2000, n_jobs=-1)
clf = clf.fit(train_X, train_y)  # fit() returns the estimator itself
clf


In [None]:
# Predicted labels for the dev split, from the classifier currently bound to `clf`.
yhat = clf.predict(dev_X)


In [None]:
from sklearn.metrics import accuracy_score

# Accuracy of the dev predictions against the dev labels.
acc = accuracy_score(y_true=dev_y, y_pred=yhat)
print("Accuracy:", acc)


In [None]:
from sklearn.metrics import classification_report

# Classification report for the dev predictions.
dev_report = classification_report(y_true=dev_y, y_pred=yhat)
print(dev_report)


In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Confusion matrix on the dev split; rows and columns follow clf.classes_.
class_labels = clf.classes_
cm = confusion_matrix(dev_y, yhat, labels=class_labels)

# Annotated heatmap with integer cell counts.
ax = sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    xticklabels=class_labels,
    yticklabels=class_labels,
)
ax.set_xlabel("Predicted")
ax.set_ylabel("True")
plt.show()
