In [None]:
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.nn.functional as F
import torch
import random

from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from torch import optim


import torch

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from utils import HATEDataset, train, preprocess
from models import SentimentBaselineModel, VanillaLSTM

In [None]:
%load_ext autoreload
%autoreload 2


seed = 366767
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

# Loading, cleaning and splitting the dataset

In [None]:
# This cell already needs a device for `train_count`, so it is resolved here:
# on a fresh kernel no earlier cell defines `device`. Falls back to the CPU
# when CUDA is unavailable.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Data loading: read the labelled tweets from disk.
raw_data = pd.read_csv("labeled_data.csv")
tweet = list(raw_data['tweet'])

# Data cleaning: `preprocess` comes from the project-local `utils` module.
clean_tweet = preprocess(tweet)

# Keep only the label and text columns and swap in the cleaned text.
# `.assign` builds a new frame instead of writing into a column subset,
# which avoids pandas' SettingWithCopyWarning.
data = raw_data[['class', 'tweet']].assign(tweet=clean_tweet)

# Split data: first 80% of the rows for training, the remainder for testing.
# NOTE(review): the split is positional (no shuffling) — confirm the CSV rows
# are not ordered by class or time.
split_idx = int(len(data) * 0.8)
train_data = data.iloc[:split_idx]
test_data = data.iloc[split_idx:]

# Create new CSVs; these files are read back by HATEDataset in a later cell.
train_data.to_csv("train.csv", index=False)
test_data.to_csv("test.csv", index=False)

# Data imbalance: relative frequency of each class in the training split,
# ordered by class label.
series = train_data['class'].value_counts().sort_index() / len(train_data)
# Convert through NumPy so the tensor is built from the values only and does
# not depend on how the label-indexed Series behaves as a sequence.
train_count = torch.tensor(series.to_numpy(), dtype=torch.float32).to(device)

# Load model and dataset

In [None]:
# Device selection: use the first GPU when CUDA is available, otherwise fall
# back to the CPU so the notebook still runs on machines without a GPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# We use the following pretrained tokenizer and model.
model_name = "distilbert-base-uncased"
tokenizer = DistilBertTokenizer.from_pretrained(model_name)
# Classification head with 3 output labels, moved onto the selected device.
model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=3)
model = model.to(device)

# Datasets are built from the CSV files written by the data-preparation cell.
train_dataset = HATEDataset("train.csv", tokenizer)
test_dataset = HATEDataset("test.csv", tokenizer)

In [None]:
# Hyperparameters handed to train() below.
epochs = 10
batch_size = 32
# NOTE(review): 5e-3 is far above the 2e-5..5e-5 range usually used when
# fine-tuning BERT-style models — confirm this value is intended.
learning_rate = 5e-3
warmup_percent = 0.1
max_grad_norm = 1.0

# Arguments are passed positionally, so their order must match the signature
# of `train` in the project-local `utils` module.
train(
    train_dataset, test_dataset, model, device,
    batch_size, epochs, learning_rate, warmup_percent, max_grad_norm,
    train_count,
)