In [None]:
!pip install transformers

import re

import matplotlib.pyplot as plt
import pandas as pd
import torch
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import LabelEncoder
from torch import nn, optim
from torch.utils.data import TensorDataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from wordcloud import WordCloud


In [None]:
# Training split (tab-separated, first row is the header).
train_df = pd.read_csv("/content/olid-training-v1.0.tsv", delimiter="\t")

# The Level A test split comes as two files: the tweets (TSV) and the labels
# (CSV without a header row, so the column names are supplied explicitly).
test_tweets = pd.read_csv("/content/testset-levela.tsv", delimiter="\t")
test_labels = pd.read_csv("/content/labels-levela.csv", header=None, names=["id", "label"])

# Attach each label to its tweet by joining on the shared "id" column.
test_df = test_tweets.merge(test_labels, how="inner", on="id")

In [None]:
# Simple tweet cleaner
# "@USER" and the standalone word "URL" are matched as whole tokens so that
# ordinary words containing those letters (e.g. "CURLY") are left untouched.
_PLACEHOLDER_RE = re.compile(r"@USER\b|\bURL\b")
_WHITESPACE_RE = re.compile(r"\s+")


def clean(text):
    """Remove "@USER" / "URL" placeholder tokens from a tweet.

    Args:
        text: The raw tweet. Non-string values (e.g. None or a NaN cell)
            are treated as an empty tweet.

    Returns:
        The tweet with placeholder tokens removed, runs of whitespace
        collapsed to a single space, and leading/trailing whitespace
        stripped. Applying the function to its own output returns the
        same string.
    """
    if not isinstance(text, str):
        return ""
    # Replace with a space (not "") so neighbouring words are never glued together;
    # the extra spaces are collapsed on the next line.
    without_placeholders = _PLACEHOLDER_RE.sub(" ", text)
    return _WHITESPACE_RE.sub(" ", without_placeholders).strip()

# Run the tweet cleaner over both splits.
train_df['tweet'] = train_df['tweet'].map(clean)
test_df['tweet'] = test_df['tweet'].map(clean)

# Integer-encode the Level A labels. LabelEncoder orders classes
# alphabetically, so NOT → 0 and OFF → 1. The encoder is fitted on the
# training labels and then reused for the test labels.
le = LabelEncoder()
le.fit(train_df['subtask_a'])
train_df['label'] = le.transform(train_df['subtask_a'])
test_df['label'] = le.transform(test_df['label'])


In [None]:
# Word cloud of the training tweets
print("\nMaking a word cloud...")
all_text = " ".join(train_df['tweet'].tolist())  # 'tweet' holds the cleaned text
wordcloud = WordCloud(
    width=800,
    height=400,
    background_color='black',
    max_words=200,
).generate(all_text)

# Render the cloud as an image with the axes hidden.
fig, ax = plt.subplots(figsize=(10, 5))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
ax.set_title("Word Cloud of Cleaned Tweets")
plt.show()

In [None]:
# Fine-tunes a Hugging Face transformer (e.g. BERT) for text classification with
# PyTorch and prints precision, recall, and F1 on the test data.
def train_and_evaluate(model_name, train_df, test_df, label_encoder, epochs=2):
    """Fine-tune `model_name` on `train_df` and print a classification report for `test_df`.

    Args:
        model_name: Hugging Face model id passed to `from_pretrained`.
        train_df: DataFrame with a 'tweet' text column and an integer-encoded
            'label' column.
        test_df: DataFrame with the same two columns; used only for evaluation.
        label_encoder: Fitted encoder whose `classes_` gives the number of
            output labels and the class names printed in the report.
        epochs: Number of passes over the training data.

    Returns:
        None. Progress messages and the report are printed.
    """
    print(f"\n Training {model_name}...\n")

    # Resolve the device inside the function so it does not rely on a notebook global.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    num_labels = len(label_encoder.classes_)

    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name, num_labels=num_labels, ignore_mismatched_sizes=True
    ).to(device)

    def create_loader(df, batch_size=8, shuffle=False):
        """Tokenize df['tweet'] in one call and wrap ids, masks, and labels in a DataLoader."""
        tokens = tokenizer(list(df['tweet']), truncation=True, padding=True, return_tensors='pt')
        labels = torch.tensor(df['label'].tolist())
        dataset = TensorDataset(tokens['input_ids'], tokens['attention_mask'], labels)
        return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

    # Shuffle only the training batches; evaluation keeps the original row order.
    train_loader = create_loader(train_df, shuffle=True)
    test_loader = create_loader(test_df)

    optimizer = optim.AdamW(model.parameters(), lr=2e-5)
    loss_fn = nn.CrossEntropyLoss()

    model.train()
    for epoch in range(epochs):
        for input_ids, attn_mask, labels in train_loader:
            input_ids, attn_mask, labels = input_ids.to(device), attn_mask.to(device), labels.to(device)
            outputs = model(input_ids=input_ids, attention_mask=attn_mask)
            loss = loss_fn(outputs.logits, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        print(f" Epoch {epoch+1} completed")

    # Evaluation
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for input_ids, attn_mask, labels in test_loader:
            input_ids, attn_mask = input_ids.to(device), attn_mask.to(device)
            outputs = model(input_ids=input_ids, attention_mask=attn_mask)
            preds = torch.argmax(outputs.logits, dim=1)
            all_preds.extend(preds.cpu().tolist())
            all_labels.extend(labels.tolist())  # labels were never moved off the CPU

    print(f"\n Evaluation for {model_name}")
    # Passing `labels` keeps the report rows aligned with `target_names` even if
    # some class never occurs in the evaluation data.
    print(classification_report(
        all_labels,
        all_preds,
        labels=list(range(num_labels)),
        target_names=label_encoder.classes_,
    ))


In [None]:
# Models compared on Subtask A, in plotting order
models = ['DistilBERT', 'BERTweet', 'ToxicBERT', 'DeBERTa']

# Hand-entered macro F1 per model for Subtask A (replace with your actual scores)
f1_scores = [0.79, 0.80, 0.80, 0.81]

# One bar per model, y-axis fixed to the [0, 1] range of the metric
fig, ax = plt.subplots(figsize=(8, 5))
ax.bar(models, f1_scores, color='skyblue')
ax.set_ylim(0, 1)
ax.set_xlabel("Model")
ax.set_ylabel("Macro F1 Score")
ax.set_title("F1 Score Comparison - Subtask A")
ax.grid(True)
plt.show()

In [None]:

# Placeholder data: replace with your model's real predictions and the gold labels
y_true = [0, 1, 0, 1, 1, 0, 1, 0]  # ground truth
y_pred = [0, 1, 0, 0, 1, 0, 1, 1]  # model output

# Compute the confusion matrix and draw it with the Subtask A class names
cm = confusion_matrix(y_true, y_pred)
disp = ConfusionMatrixDisplay(cm, display_labels=["NOT", "OFF"])
disp.plot(cmap='Blues')
disp.ax_.set_title("Confusion Matrix - Subtask A")
plt.show()

In [None]:
# Subtask A (NOT vs OFF): display name → Hugging Face model id
models = {
    "DistilBERT": "distilbert-base-uncased",
    "BERTweet": "vinai/bertweet-base",
    "ToxicBERT": "unitary/toxic-bert",
    "DeBERTa": "microsoft/deberta-v3-small"
}

# Same banner style as the Level B and Level C runs so the three outputs are easy to tell apart.
print("\n====== Level A: Offensive Language Identification (NOT vs OFF) ======\n")
# Only the model ids are needed; each one is fine-tuned and evaluated in turn.
for model_id in models.values():
    train_and_evaluate(model_id, train_df, test_df, le)


In [None]:
# Level B test split: tweets (TSV) joined to their labels (header-less CSV) on "id"
test_tweets_b = pd.read_csv("/content/testset-levelb.tsv", delimiter="\t")
test_labels_b = pd.read_csv("/content/labels-levelb.csv", header=None, names=["id", "label"])
test_df_b = test_tweets_b.merge(test_labels_b, how="inner", on="id")

# Level B is trained only on the rows labelled OFF at Level A
train_df_b = train_df.loc[train_df['subtask_a'].eq('OFF')].copy()

# Run the tweet cleaner over both splits
train_df_b['tweet'] = train_df_b['tweet'].map(clean)
test_df_b['tweet'] = test_df_b['tweet'].map(clean)

# Integer-encode the Level B labels (TIN or UNT); the encoder is fitted on the
# training labels and reused for the test labels
le_b = LabelEncoder()
le_b.fit(train_df_b['subtask_b'])
train_df_b['label'] = le_b.transform(train_df_b['subtask_b'])
test_df_b['label'] = le_b.transform(test_df_b['label'])


In [None]:
# Fine-tunes a Hugging Face transformer (e.g. BERT) for text classification with
# PyTorch and prints precision, recall, and F1 on the test data (Level B run).
def train_and_evaluate(model_name, train_df, test_df, label_encoder, epochs=2):
    """Fine-tune `model_name` on `train_df` and print a Level B classification report.

    Relies on the torch / transformers / sklearn names imported in the
    notebook's first cell.

    Args:
        model_name: Hugging Face model id passed to `from_pretrained`.
        train_df: DataFrame with a 'tweet' text column and an integer-encoded
            'label' column.
        test_df: DataFrame with the same two columns; used only for evaluation.
        label_encoder: Fitted encoder whose `classes_` gives the number of
            output labels and the class names printed in the report.
        epochs: Number of passes over the training data.

    Returns:
        None. Progress messages and the report are printed.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    num_labels = len(label_encoder.classes_)

    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name, num_labels=num_labels, ignore_mismatched_sizes=True
    ).to(device)

    def create_loader(df, batch_size=8, shuffle=False):
        """Tokenize df['tweet'] in one call and wrap ids, masks, and labels in a DataLoader."""
        tokens = tokenizer(list(df['tweet']), truncation=True, padding=True, return_tensors='pt')
        labels = torch.tensor(df['label'].tolist())
        dataset = TensorDataset(tokens['input_ids'], tokens['attention_mask'], labels)
        return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

    # Shuffle only the training batches; evaluation keeps the original row order.
    train_loader = create_loader(train_df, shuffle=True)
    test_loader = create_loader(test_df)

    optimizer = optim.AdamW(model.parameters(), lr=2e-5)
    loss_fn = nn.CrossEntropyLoss()

    model.train()
    for epoch in range(epochs):
        for input_ids, attn_mask, labels in train_loader:
            input_ids, attn_mask, labels = input_ids.to(device), attn_mask.to(device), labels.to(device)
            outputs = model(input_ids=input_ids, attention_mask=attn_mask)
            loss = loss_fn(outputs.logits, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        print(f"Epoch {epoch+1} completed")

    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for input_ids, attn_mask, labels in test_loader:
            input_ids, attn_mask = input_ids.to(device), attn_mask.to(device)
            outputs = model(input_ids=input_ids, attention_mask=attn_mask)
            preds = torch.argmax(outputs.logits, dim=1)
            all_preds.extend(preds.cpu().tolist())
            all_labels.extend(labels.tolist())  # labels were never moved off the CPU

    print(f"\n Evaluation for {model_name} (Level B)")
    # Passing `labels` keeps the report rows aligned with `target_names` even if
    # some class never occurs in the evaluation data.
    print(classification_report(
        all_labels,
        all_preds,
        labels=list(range(num_labels)),
        target_names=label_encoder.classes_,
    ))


In [None]:
# Level B run: display name → Hugging Face model id
models = dict(
    DistilBERT="distilbert-base-uncased",
    BERTweet="vinai/bertweet-base",
    ToxicBERT="unitary/toxic-bert",
    DeBERTa="microsoft/deberta-v3-small",
)

print("\n====== Level B: Insult Type Classification (TIN vs UNT) ======\n")
# Fine-tune and evaluate each checkpoint on the Level B data, in insertion order.
for name in models:
    model_id = models[name]
    train_and_evaluate(model_id, train_df_b, test_df_b, le_b)


In [None]:
# Model names
models = ['DistilBERT', 'BERTweet', 'ToxicBERT', 'DeBERTa']

# Macro F1 per model for Subtask B. These match the "Level B Macro F1" row of
# the summary table at the end of this notebook; the previous values were a
# copy of the Subtask A scores. Update both places if you re-run training.
f1_scores = [0.62, 0.76, 0.65, 0.64]

# Create a bar chart (y-axis fixed to the [0, 1] range of the metric)
plt.figure(figsize=(8, 5))
plt.bar(models, f1_scores, color='skyblue')
plt.ylim(0, 1)
plt.xlabel("Model")
plt.ylabel("Macro F1 Score")
plt.title("F1 Score Comparison - Subtask B")
plt.grid(True)
plt.show()

In [None]:

# Placeholder data: replace with your model's real Level B predictions and gold labels
y_true = [0, 1, 0, 1, 1, 0, 1, 0]  # Ground truth labels
y_pred = [0, 1, 0, 0, 1, 0, 1, 1]  # Predictions by your model

# Plot the confusion matrix.
# Level B classes are TIN and UNT; LabelEncoder orders them alphabetically,
# so TIN → 0 and UNT → 1 (the NOT/OFF names belong to Subtask A).
cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["TIN", "UNT"])
disp.plot(cmap='Blues')
plt.title("Confusion Matrix - Subtask B")
plt.show()

In [None]:
# Level C test split: tweets (TSV) joined to their labels (header-less CSV) on "id"
test_tweets_c = pd.read_csv("/content/testset-levelc.tsv", delimiter="\t")
test_labels_c = pd.read_csv("/content/labels-levelc.csv", header=None, names=["id", "label"])
test_df_c = test_tweets_c.merge(test_labels_c, how="inner", on="id")

# Level C is trained only on the rows labelled TIN at Level B
train_df_c = train_df.loc[train_df['subtask_b'].eq('TIN')].copy()

# Run the tweet cleaner over both splits
train_df_c['tweet'] = train_df_c['tweet'].map(clean)
test_df_c['tweet'] = test_df_c['tweet'].map(clean)

# Integer-encode the Level C labels (IND, GRP, OTH); the encoder is fitted on
# the training labels and reused for the test labels
from sklearn.preprocessing import LabelEncoder
le_c = LabelEncoder()
le_c.fit(train_df_c['subtask_c'])
train_df_c['label'] = le_c.transform(train_df_c['subtask_c'])
test_df_c['label'] = le_c.transform(test_df_c['label'])


In [None]:
# NOTE(review): presumably needed by the BERTweet tokenizer for emoji handling — TODO confirm.
# This install runs after the Level A and Level B cells that already load
# "vinai/bertweet-base"; consider moving it next to the first install cell.
!pip3 install emoji==0.6.0

In [None]:
# Fine-tunes a Hugging Face transformer (e.g. BERT) for multi-class text
# classification with PyTorch and prints precision, recall, and F1 on the test
# data (Level C run; the number of classes comes from the label encoder).

def train_and_evaluate(model_name, train_df, test_df, label_encoder, epochs=2):
    """Fine-tune `model_name` on `train_df` and print a Level C classification report.

    Relies on the torch / transformers / sklearn names imported in the
    notebook's first cell.

    Args:
        model_name: Hugging Face model id passed to `from_pretrained`.
        train_df: DataFrame with a 'tweet' text column and an integer-encoded
            'label' column.
        test_df: DataFrame with the same two columns; used only for evaluation.
        label_encoder: Fitted encoder whose `classes_` gives the number of
            output labels and the class names printed in the report.
        epochs: Number of passes over the training data.

    Returns:
        None. Progress messages and the report are printed.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    num_labels = len(label_encoder.classes_)

    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=num_labels,
        ignore_mismatched_sizes=True  # especially needed for ToxicBERT
    ).to(device)

    def create_loader(df, batch_size=8, shuffle=False):
        """Tokenize df['tweet'] in one call and wrap ids, masks, and labels in a DataLoader."""
        tokens = tokenizer(list(df['tweet']), truncation=True, padding=True, return_tensors='pt')
        labels = torch.tensor(df['label'].tolist())
        dataset = TensorDataset(tokens['input_ids'], tokens['attention_mask'], labels)
        return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

    # Shuffle only the training batches; evaluation keeps the original row order.
    train_loader = create_loader(train_df, shuffle=True)
    test_loader = create_loader(test_df)

    optimizer = optim.AdamW(model.parameters(), lr=2e-5)
    loss_fn = nn.CrossEntropyLoss()

    model.train()
    for epoch in range(epochs):
        for input_ids, attn_mask, labels in train_loader:
            input_ids, attn_mask, labels = input_ids.to(device), attn_mask.to(device), labels.to(device)
            outputs = model(input_ids=input_ids, attention_mask=attn_mask)
            loss = loss_fn(outputs.logits, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        print(f"Epoch {epoch+1} completed")

    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for input_ids, attn_mask, labels in test_loader:
            input_ids, attn_mask = input_ids.to(device), attn_mask.to(device)
            outputs = model(input_ids=input_ids, attention_mask=attn_mask)
            preds = torch.argmax(outputs.logits, dim=1)
            all_preds.extend(preds.cpu().tolist())
            all_labels.extend(labels.tolist())  # labels were never moved off the CPU

    print(f"\n Evaluation for {model_name} (Level C)")
    # Passing `labels` keeps the report rows aligned with `target_names` even if
    # some class never occurs in the evaluation data.
    print(classification_report(
        all_labels,
        all_preds,
        labels=list(range(num_labels)),
        target_names=label_encoder.classes_,
    ))


In [None]:
 # Level C run: display name → Hugging Face model id
# (The statements below start at column 0; the stray leading space that used to
# precede `models` is an IndentationError when this cell is run as plain Python.)
models = {
    "DistilBERT": "distilbert-base-uncased",
    "BERTweet": "vinai/bertweet-base",
    "ToxicBERT": "unitary/toxic-bert",
    "DeBERTa": "microsoft/deberta-v3-small"
}

print("\n====== Level C: Offense Target Classification (IND vs GRP vs OTH) ======\n")
# Fine-tune and evaluate each checkpoint on the Level C data.
for name, model_id in models.items():
    train_and_evaluate(model_id, train_df_c, test_df_c, le_c)


In [None]:

# Placeholder data: replace with your model's real Level C predictions and gold labels.
# NOTE: the placeholders only contain classes 0 and 1, so the OTH row and
# column of the matrix below are all zeros until real data is plugged in.
y_true = [0, 1, 0, 1, 1, 0, 1, 0]  # Ground truth labels
y_pred = [0, 1, 0, 0, 1, 0, 1, 1]  # Predictions by your model

# Plot the confusion matrix.
# Level C has three classes; LabelEncoder orders them alphabetically, so
# GRP → 0, IND → 1, OTH → 2 (the NOT/OFF names belong to Subtask A).
# `labels` forces a 3x3 matrix so it always matches the three display labels.
cm = confusion_matrix(y_true, y_pred, labels=[0, 1, 2])
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["GRP", "IND", "OTH"])
disp.plot(cmap='Blues')
plt.title("Confusion Matrix - Subtask c")
plt.show()

# Model names (read by the bar-chart cell that follows)
models = ['DistilBERT', 'BERTweet', 'ToxicBERT', 'DeBERTa']

# Macro F1 per model for Subtask C. These match the "Level C Macro F1" row of
# the summary table at the end of this notebook; the previous values were a
# copy of the Subtask A scores. Update both places if you re-run training.
f1_scores = [0.56, 0.57, 0.53, 0.49]



In [None]:

# Bar chart of macro F1 per model for Subtask C.
# `models` and `f1_scores` are not defined in this cell; they come from a cell run earlier.
fig, ax = plt.subplots(figsize=(8, 5))
ax.bar(models, f1_scores, color='skyblue')
ax.set_ylim(0, 1)
ax.set_xlabel("Model")
ax.set_ylabel("Macro F1 Score")
ax.set_title("F1 Score Comparison - Subtask C")
ax.grid()
plt.show()

In [None]:
# Word cloud of the training tweets (same plot as the word-cloud cell near the top of the notebook)
print("\nMaking a word cloud...")
all_text = " ".join(list(train_df['tweet']))  # 'tweet' holds the cleaned text
cloud_options = dict(width=800, height=400, background_color='black', max_words=200)
wordcloud = WordCloud(**cloud_options).generate(all_text)

# Render the cloud as an image with the axes hidden.
fig, ax = plt.subplots(figsize=(10, 5))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
ax.set_title("Word Cloud of Cleaned Tweets")
plt.show()

In [None]:
import warnings

import matplotlib.pyplot as plt
import pandas as pd
import pandas.plotting as pd_plotting
import seaborn as sns  # used by the heatmap cell that follows

# NOTE(review): this silences every warning for the rest of the session;
# consider narrowing it to the specific warning category that is noisy.
warnings.filterwarnings("ignore")

# Accuracy and macro F1 from each level (fill in based on your outputs)
results = {
    "Model": ["DistilBERT", "BERTweet", "ToxicBERT", "DeBERTa"],

    "Level A Accuracy": [0.84, 0.85, 0.86, 0.85],
    "Level A Macro F1": [0.79, 0.80, 0.80, 0.81],

    "Level B Accuracy": [0.90, 0.93, 0.88, 0.88],
    "Level B Macro F1": [0.62, 0.76, 0.65, 0.64],

    "Level C Accuracy": [0.69, 0.70, 0.68, 0.69],
    "Level C Macro F1": [0.56, 0.57, 0.53, 0.49]
}

# One row per model, one column per metric; built without in-place mutation.
df = pd.DataFrame(results).set_index("Model")

# The metrics are recorded to two decimals, so display two. With "{:.1f}"
# values such as 0.84 and 0.86 would print as 0.8 and 0.9 and most
# differences between models would disappear.
pd.options.display.float_format = "{:.2f}".format

# Show table: the Styler gets its own number format, then a blue gradient per column.
df.style.format("{:.2f}").background_gradient(cmap='Blues')


In [None]:
# Heatmap of the results table: one row per model, one column per metric,
# each cell annotated with its value to two decimals.
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(df, annot=True, cmap="Blues", fmt=".2f", linewidths=0.5, ax=ax)
ax.set_title("Model Comparison: Accuracy and Macro F1 across Tasks A, B, C")
plt.setp(ax.get_yticklabels(), rotation=0)  # keep the model names horizontal
fig.tight_layout()
plt.show()
