================================================================================
### Prepare tokenized dataset for BERT / RoBERTa fine-tuning
================================================================================

In [11]:
import os
import re
import random
import numpy as np
import pandas as pd
import torch

from datasets import Dataset
from transformers import AutoTokenizer
from sklearn.utils.class_weight import compute_class_weight
import matplotlib.pyplot as plt
import seaborn as sns


# Seed
def set_seed(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's legacy global RNG and PyTorch;
    when CUDA is available, the generators of all CUDA devices are seeded too.

    Args:
        seed: Integer seed passed unchanged to each library. Defaults to 42.

    Returns:
        None.
    """
    # The three CPU-side seeders share the same one-argument calling convention.
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)

    # GPU generators are only touched when a CUDA device is actually present.
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)


set_seed(42)

# path
# Project root directory. It can be overridden per machine through the
# MULTIMODAL_ARGMINING_ROOT environment variable, so the notebook does not have
# to be edited to run elsewhere; the original location stays as the fallback.
path = os.environ.get(
    "MULTIMODAL_ARGMINING_ROOT",
    "C:/Users/diego/Desktop/multimodal-argmining",
)


# Model
# Checkpoint identifier handed to AutoTokenizer.from_pretrained further down.
MODEL_NAME = "roberta-base"

In [12]:

# Load Dataset
# Locations of the dev and train CSV splits under the project's data folder.
dev_path   = f"{path}/data/gun_control_dev.csv"
train_path = f"{path}/data/gun_control_train.csv"

# Only the training split is read in this cell.
train_df = pd.read_csv(filepath_or_buffer=train_path)
print(f"Train dataset loaded: {train_df.shape}")

# Preview the first five rows as the cell's displayed value.
train_df.head(n=5)

Train dataset loaded: (923, 6)


Unnamed: 0,tweet_id,tweet_url,tweet_text,stance,persuasiveness,split
0,1372936384034447366,https://t.co/FpkVZ8ESy0,More Asian-Americans Are Buying Guns For Prote...,oppose,no,train
1,1327310308260667393,https://t.co/KrJTpJR3Ke,"""I will protect your Bill of Rights. Gun contr...",oppose,no,train
2,1334523148692312065,https://t.co/hBAV1pPCY9,#guns #2A 6-Time Olympic Shooting Medalist Say...,oppose,no,train
3,1324087921641721856,https://t.co/LfIzR6iPA3,Congratulations @ForHD65 on your victory! \n\n...,support,no,train
4,1313162243035607040,https://t.co/MZyeIP6Mtx,Dr. Cindy Banyai supports common sense gun saf...,support,no,train


In [14]:
#Label Mapping & Basic Filtering

# Lookup tables: stance name -> class id, and the inverse.
label2id = {"oppose": 0, "support": 1}
id2label = dict(zip(label2id.values(), label2id.keys()))

# Keep only rows whose stance is a key of label2id, work on an independent
# copy, and attach the integer class id as the "label" column.
train_df = (
    train_df.loc[train_df["stance"].isin(list(label2id))]
    .copy()
    .assign(label=lambda frame: frame["stance"].map(label2id))
)

# Class balance after filtering.
print(train_df["label"].value_counts())

label
1    475
0    448
Name: count, dtype: int64


In [15]:
# Minimal Text Cleaning

# We apply only minimal preprocessing
# Heavy cleaning like stemming or stopword removal is not advised before tokenization.

def minimal_clean(text):
    """Lightly normalise a tweet before tokenization.

    Removes URLs (tokens starting with ``http`` or ``www``), removes
    ``@mentions``, collapses every run of whitespace into a single space and
    trims both ends.

    Args:
        text: The raw tweet. Non-string values are converted with ``str``;
            missing scalars (``None``, ``NaN``, ``pd.NA``) are treated as an
            empty tweet.

    Returns:
        The cleaned text as a ``str``; ``""`` for missing input.
    """
    if not isinstance(text, str):
        # str(NaN) would yield the literal text "nan" (and None -> "None"),
        # which would then be treated as real tweet content. Map missing
        # values to an empty string instead. The is_scalar guard keeps
        # pd.isna from returning an array for list-like input.
        if pd.api.types.is_scalar(text) and pd.isna(text):
            return ""
        text = str(text)

    # "http\S+" already covers https links, so no separate alternative is needed.
    text = re.sub(r"http\S+|www\S+", "", text)
    text = re.sub(r"@\w+", "", text)
    # Collapse the gaps left behind by the removals above.
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Run the cleaner over every tweet, overwrite the column with the result,
# then show the first five cleaned tweets as the cell's output.
train_df["tweet_text"] = train_df["tweet_text"].map(minimal_clean)
train_df["tweet_text"].head(5)


0    More Asian-Americans Are Buying Guns For Prote...
1    "I will protect your Bill of Rights. Gun contr...
2    #guns #2A 6-Time Olympic Shooting Medalist Say...
3    Congratulations on your victory! We’re proud t...
4    Dr. Cindy Banyai supports common sense gun saf...
Name: tweet_text, dtype: object

In [16]:
# Load Tokenizer
# Instantiate the tokenizer for the checkpoint named in MODEL_NAME, explicitly
# requesting the fast implementation.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=MODEL_NAME,
    use_fast=True,
)
print(f"Tokenizer loaded: {MODEL_NAME}")

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Tokenizer loaded: roberta-base
