# Libraries

In [12]:
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import nltk

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.model_selection import train_test_split
from transformers import GPT2Config, GPT2Tokenizer, GPT2ForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data import Dataset as TorchDataset
from sklearn.metrics import accuracy_score

# Preprocessing

In [13]:
# count: number of CrowdFlower users who coded each tweet (min is 3, sometimes more users coded a tweet when 
# hate_speech: number of CF users who judged the tweet to be hate speech
# offensive_language: number of CF users who judged the tweet to be offensive
# neither: number of CF users who judged the tweet to be neither offensive nor non-offensive
# class: class label for majority of CF users. 0 - hate speech 1 - offensive language 2 - neither
# tweet: text tweet

def parse_tweets_until_colon(data):
    parsed_tweets = []
    for tweet in data['tweet']:
        colon_index = tweet.find(':')
        if colon_index != -1:
            parsed_tweets.append(tweet[colon_index + 1:])
        else:
            parsed_tweets.append(tweet)
    return parsed_tweets

data = pd.read_csv('labeled_data.csv')
data = data.drop(columns='Unnamed: 0')
data['tweet'] = parse_tweets_until_colon(data)

data.head()

Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,3,0,0,3,2,As a woman you shouldn't complain about clean...
1,3,0,3,0,1,boy dats cold...tyga dwn bad for cuffin dat h...
2,3,0,3,0,1,You ever fuck a bitch and she start to cry? Y...
3,3,0,2,1,1,@viva_based she look like a tranny
4,6,0,6,0,1,The shit you hear about me might be true or i...


# Text cleaning

In [14]:
# Text Cleaning:
# Lowercasing: Convert all text to lowercase to ensure uniformity.
# Removing Punctuation: Eliminate punctuation marks as they often don't carry much meaning in NLP tasks.
# Removing Special Characters: Remove special characters, emojis, URLs, etc., which may not contribute to the task at hand.
# Removing Stopwords: Stopwords are common words (e.g., "the", "is", "and") that occur frequently but often carry little information. Removing them can reduce noise in the data.

nltk.download("stopwords")
nltk.download("punkt")
nltk.download("wordnet")

def clean_text(data):
    cleaned_tweets = []
    for tweet in data["tweet"]:
        tweet = tweet.lower()  # Convert text to lowercase
        tweet = re.sub(r"[^\w\s]", "", tweet)  # Remove punctuation
        tweet = re.sub(r"\d+", "", tweet)  # Remove numbers
        tweet = re.sub(r"\s+", " ", tweet).strip()  # Remove extra whitespaces
        cleaned_tweets.append(tweet)
    return cleaned_tweets

def remove_stopwords(data):
    nostopwords_tweets = []
    for tweet in data["tweet"]:
        tokens = word_tokenize(tweet)  # Tokenize text
        stop_words = set(stopwords.words("english"))
        filtered_tokens = [word for word in tokens if word not in stop_words]  # Remove stopwords
        filtered_text = " ".join(filtered_tokens)  # Join tokens back into a string
        nostopwords_tweets.append(filtered_text)
    return nostopwords_tweets

def apply_stemming(data):
    stemmer = PorterStemmer()
    stemmed_tweets = []
    for tweet in data["tweet"]:
        tokens = word_tokenize(tweet)  # Tokenize text
        stemmed_tokens = [stemmer.stem(token) for token in tokens]  # Apply stemming
        stemmed_text = " ".join(stemmed_tokens)  # Join tokens back into a string
        stemmed_tweets.append(stemmed_text)
    return stemmed_tweets

def apply_lemmatization(data):
    lemmatizer = WordNetLemmatizer()
    lemmatized_tweets = []
    for tweet in data["tweet"]:
        tokens = word_tokenize(tweet)  # Tokenize text
        lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]  # Apply lemmatization
        lemmatized_text = " ".join(lemmatized_tokens)  # Join tokens back into a string
        lemmatized_tweets.append(lemmatized_text)
    return lemmatized_tweets

data["tweet"] = clean_text(data)
data["tweet"] = remove_stopwords(data)
data["tweet"] = apply_stemming(data)
data["tweet"] = apply_lemmatization(data)

data.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Eddy\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Eddy\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Eddy\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,3,0,0,3,2,woman shouldnt complain clean hous amp man alw...
1,3,0,3,0,1,boy dat coldtyga dwn bad cuffin dat hoe st place
2,3,0,3,0,1,ever fuck bitch start cri confus shit
3,3,0,2,1,1,viva_bas look like tranni
4,6,0,6,0,1,shit hear might true might faker bitch told ya


## Split training/test set

In [15]:
X_train, X_test, y_train, y_test = train_test_split(data["tweet"], data["class"], test_size=0.2, random_state=1)

# Creating the model

In [17]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using {device} for training")

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token  # Set padding token to eos_token

max_length = 128
tokenized_texts_train = tokenizer(list(X_train), padding=True, truncation=True, max_length=max_length, return_tensors='pt')
tokenized_texts_test = tokenizer(list(X_test), padding=True, truncation=True, max_length=max_length, return_tensors='pt')

labels_train = torch.tensor(list(y_train)).to(device)
labels_test = torch.tensor(list(y_test)).to(device)

train_dataset = TensorDataset(tokenized_texts_train.input_ids, tokenized_texts_train.attention_mask, labels_train)
test_dataset = TensorDataset(tokenized_texts_test.input_ids, tokenized_texts_test.attention_mask, labels_test)

batch_size = 16

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

model_config = GPT2Config.from_pretrained('gpt2', num_labels=3, pad_token_id=tokenizer.pad_token_id)
model = GPT2ForSequenceClassification(model_config).to(device)

optimizer = AdamW(model.parameters(), lr=5e-5, no_deprecation_warning=True)

epochs = 5

for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    print(f'Epoch {epoch + 1}: Average Loss: {total_loss / len(train_loader)}')

model.eval()
predicted_labels = []
true_labels = []

with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predicted_labels.extend(torch.argmax(logits, axis=1).tolist())
        true_labels.extend(labels.tolist())

accuracy = accuracy_score(true_labels, predicted_labels)
print(f'Accuracy: {accuracy}')

Using cuda for training
Epoch 1: Average Loss: 0.4117244767881329
Epoch 2: Average Loss: 0.28059977674255926
Epoch 3: Average Loss: 0.20514537520639056
Epoch 4: Average Loss: 0.14681136520924948
Epoch 5: Average Loss: 0.09591399780505115
Accuracy: 0.8694775065563849
