## Imports + GPU Setup

In [2]:
import os

os.environ["WANDB_DISABLED"] = "true"
os.environ["CUDA_VISIBLE_DEVICES"] = "2"
os.environ["TOKENIZERS_PARALLELISM"] = "false"


import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import logging
import re
import nltk


from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from copy import deepcopy
from urllib import request
from dont_patronize_me import DontPatronizeMe # data manager module
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from torch.optim import lr_scheduler
import torch.nn.functional as F
import torch.optim as optim

from transformers import RobertaModel, RobertaTokenizer
from simpletransformers.classification import ClassificationModel, ClassificationArgs

from preprocessing import load_data, preprocess_data, DPMDataset

logging.basicConfig(level=logging.ERROR)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'device: {device}')



device: cuda


In [1]:
import torch
print(torch.cuda.is_available())
print(torch.cuda.device_count())
print(torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU detected")

True
1
NVIDIA A100 80GB PCIe MIG 1g.10gb


## Data Setup

In [3]:
train_df, dev_df, test_df = load_data()

# downsample negative instances
pcldf = train_df[train_df.label==1]
npos = len(pcldf)
balanced_train_df = pd.concat([pcldf, train_df[train_df.label==0][:int(2.5*npos)]])
balanced_train_df = balanced_train_df[['text', 'community', 'label', 'country']]

In [16]:
processed_train_df = preprocess_data(balanced_train_df, clean_data=False, augment_data=False, add_country=False, add_community=False)
processed_dev_df = preprocess_data(dev_df, clean_data=False, add_country=False, add_community=False)
processed_test_df = preprocess_data(test_df, clean_data=False, add_country=False, add_community=False)

## TF-IDF with logistic Regression

In [18]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, f1_score

# Data
X_train, y_train = processed_train_df["text"], processed_train_df["label"]
X_dev, y_dev = processed_dev_df["text"], processed_dev_df["label"]

# Transform text in BoW
vectorizer = TfidfVectorizer(ngram_range=(1,3), max_features=10000)  
X_train_tfidf = vectorizer.fit_transform(X_train)
X_dev_tfidf = vectorizer.transform(X_dev)

# Logistic regression
logreg = LogisticRegression(max_iter=1000, class_weight="balanced", random_state=42)
logreg.fit(X_train_tfidf, y_train)

# Predictions and evaluation
y_pred = logreg.predict(X_dev_tfidf)
f1_tfidf = f1_score(y_dev, y_pred)
print(f"F1 Score with TF-IDF: {f1_tfidf:.4f}")

# Find misclassified examples
misclassified = processed_dev_df.iloc[(y_pred != y_dev).to_numpy()]

# Print one example
example = misclassified.sample(1, random_state=42)
print("\nExample of misclassified text:")
print(example[["text", "label"]])


F1 Score with TF-IDF: 0.3755

Example of misclassified text:
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               text  \
1249  The casting was gut-punchingly faithful and , with very few exceptions , matched up with the pictures in my head , the story stayed true , and somehow managed to unfold in a way that entirely satisfied the vast majority of those who loved the books without leaving those who had n't ( yet ) done so hopelessly lost . It was a Hollywood miracle -- and finally , I had the ultimate challenge for thos

## TF-IDF with SVM

In [21]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.svm import LinearSVC

# Data
X_train, y_train = processed_train_df["text"], processed_train_df["label"]
X_dev, y_dev = processed_dev_df["text"], processed_dev_df["label"]

# Transform text in BoW
vectorizer = TfidfVectorizer(ngram_range=(1,3), max_features=10000)  
X_train_tfidf = vectorizer.fit_transform(X_train)
X_dev_tfidf = vectorizer.transform(X_dev)

# SVM model
svm_model = LinearSVC(class_weight="balanced", random_state=42)
svm_model.fit(X_train_tfidf, y_train)

# Predictions and evaluation
y_pred_svm = svm_model.predict(X_dev_tfidf)
f1_svm = f1_score(y_dev, y_pred_svm)
print(f"F1 Score with SVM: {f1_svm:.4f}")

# Find misclassified examples
misclassified = processed_dev_df.iloc[(y_pred != y_dev).to_numpy()]

# Print one example
example = misclassified.sample(1, random_state=42)
print("\nExample of misclassified text:")
print(example[["text", "label"]])

F1 Score with SVM: 0.3871

Example of misclassified text:
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               text  \
1249  The casting was gut-punchingly faithful and , with very few exceptions , matched up with the pictures in my head , the story stayed true , and somehow managed to unfold in a way that entirely satisfied the vast majority of those who loved the books without leaving those who had n't ( yet ) done so hopelessly lost . It was a Hollywood miracle -- and finally , I had the ultimate challenge for those s

In [23]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter

# Identify false positives (FPs) where the model predicted PCL but the true label is 0
false_positives = processed_dev_df[(y_pred_svm == 1) & (y_dev == 0)]["text"]

# Vectorizer setup for bigrams
vectorizer = TfidfVectorizer(ngram_range=(2,2), stop_words="english", max_features=5000)
X_fp_tfidf = vectorizer.fit_transform(false_positives)

# Get bigram feature names
bigram_features = vectorizer.get_feature_names_out()

# Convert TF-IDF matrix to a frequency count
word_counts = np.asarray(X_fp_tfidf.sum(axis=0)).flatten()

# Create a dictionary of bigram -> count
bigram_freq = dict(zip(bigram_features, word_counts))

# Sort bigrams by frequency
sorted_bigrams = sorted(bigram_freq.items(), key=lambda x: x[1], reverse=True)

# Display the top 20 bigrams
print("\n**Top 20 Bigrams in False Positives:**")
for bigram, count in sorted_bigrams[:20]:
    print(f"{bigram}: {count:.2f}")



**Top 20 Bigrams in False Positives:**
poor families: 6.58
homeless people: 2.29
children poor: 1.28
young people: 1.25
vulnerable children: 1.12
homeless man: 1.12
year old: 1.11
come poor: 1.06
homeless person: 1.03
disabled children: 0.99
women children: 0.96
people need: 0.94
refugee camps: 0.89
men women: 0.87
asylum seekers: 0.85
years ago: 0.84
hopeless situation: 0.82
illegal immigrants: 0.79
help need: 0.79
disabled people: 0.78
