# Imports + GPU Setup

In [14]:
import os

# Process-wide switches, applied ahead of the torch / transformers imports below
# so those libraries see them when they initialise:
#   WANDB_DISABLED          - opt out of Weights & Biases logging
#   CUDA_VISIBLE_DEVICES    - expose only GPU index 2 to this process
#   TOKENIZERS_PARALLELISM  - turn off tokenizer-level parallelism
os.environ.update(
    {
        "WANDB_DISABLED": "true",
        "CUDA_VISIBLE_DEVICES": "2",
        "TOKENIZERS_PARALLELISM": "false",
    }
)


import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import logging
import re
import nltk


from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from copy import deepcopy
from urllib import request
from dont_patronize_me import DontPatronizeMe # data manager module
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from torch.optim import lr_scheduler
import torch.nn.functional as F
import torch.optim as optim

from transformers import RobertaModel, RobertaTokenizer
from simpletransformers.classification import ClassificationModel, ClassificationArgs

from preprocessing import load_data, preprocess_data, DPMDataset

# Only surface ERROR-level (and above) log records.
logging.basicConfig(level=logging.ERROR)

# Pick the compute device: GPU when PyTorch can see one, CPU otherwise.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f'device: {device}')



device: cuda


In [3]:
import torch

# Sanity check of the CUDA setup, one value per line:
# availability flag, number of visible devices, name of device 0 (if any).
print(
    torch.cuda.is_available(),
    torch.cuda.device_count(),
    torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU detected",
    sep="\n",
)

True
1
Tesla V100S-PCIE-32GB


# Data Setup

Retrieves the data and organises it into **train_df**, **dev_df** and **test_df**, then downsamples the negative class of the training set into **balanced_train_df**.

In [4]:
# Train / dev / test frames as returned by load_data().
train_df, dev_df, test_df = load_data()

# Downsample the negatives: keep every label == 1 row and only the first
# int(2.5 * npos) label == 0 rows (taken in their existing order, not sampled).
pcldf = train_df.loc[train_df["label"] == 1]
npos = pcldf.shape[0]
negative_cap = int(2.5 * npos)
balanced_train_df = pd.concat(
    [pcldf, train_df.loc[train_df["label"] == 0][:negative_cap]]
)
# Keep only the columns used downstream.
balanced_train_df = balanced_train_df[['text', 'community', 'label', 'country']]

# Dataset

In [5]:
# Options shared by all three splits: no text cleaning, no country feature,
# community feature enabled.
shared_preprocess_opts = dict(clean_data=False, add_country=False, add_community=True)

# Augmentation is requested for the (balanced) training split only.
processed_train_df = preprocess_data(balanced_train_df, augment_data=True, **shared_preprocess_opts)
processed_dev_df = preprocess_data(dev_df, **shared_preprocess_opts)
processed_test_df = preprocess_data(test_df, **shared_preprocess_opts)

# TRAINING

In [9]:
def set_seed(i):
    """Seed the Python, NumPy and PyTorch random number generators.

    Calling this with the same value before a run makes draws from
    ``random``, ``np.random`` and ``torch`` repeatable.

    Args:
        i: Integer seed applied to all three generators.
    """
    # Function-scope import so this cell does not depend on `random`
    # having been imported elsewhere in the notebook.
    import random

    random.seed(i)
    np.random.seed(i)
    torch.manual_seed(i)

In [None]:

# Fine-tune roberta-base once per seed and collect the dev-set F1 of every run,
# then report the individual scores with their mean and standard deviation.
scores = []

for i in range(5):
    set_seed(i)

    task1_model_args = ClassificationArgs(
        num_train_epochs=5,
        no_save=False,          # Allows saving
        no_cache=True,
        overwrite_output_dir=True,   # every seed reuses the same output_dir
        train_batch_size=16,
        learning_rate=2e-5,
        output_dir='saved_model',
        save_model_every_epoch=False,
        save_steps=-1,          # Disable intermediate saves
        use_multiprocessing=False,
        use_multiprocessing_for_evaluation=False,
        process_count=1,
    )

    # Create the model
    model = ClassificationModel(
        "roberta",
        "roberta-base",
        args=task1_model_args,
        num_labels=2,  # For binary classification
        use_cuda=True,
    )

    # NOTE: an earlier revision assigned a hand-built AdamW / CosineAnnealingLR
    # to `model.optimizer` and `model.lr_scheduler`. simpletransformers creates
    # its own optimizer and scheduler from ClassificationArgs inside
    # train_model(), so those attributes were never read and the run always
    # used learning_rate=2e-5 from the args above. To change the optimiser
    # settings, set them on ClassificationArgs (e.g. learning_rate,
    # weight_decay, scheduler) instead.
    # NOTE(review): the saved output under this cell is an fp16 overflow from a
    # DeBERTa run; if the model is swapped again, consider fp16=False.

    # Train on the augmented, downsampled training split (no class weights are passed).
    model.train_model(
        processed_train_df[["text", "label"]]
    )

    # Predict on the dev split
    preds_task1, _ = model.predict(processed_dev_df["text"].tolist())

    # F1 of the predictions against the dev labels
    f1 = f1_score(processed_dev_df["label"], preds_task1)
    scores.append(f1)
    print("F1 score:", f1)

print(scores)
print(np.mean(scores))
print(np.std(scores))

Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

  scaler = amp.GradScaler()


Running Epoch 1 of 5:   0%|          | 0/447 [00:00<?, ?it/s]

  with amp.autocast():


RuntimeError: value cannot be converted to type at::Half without overflow