# Data Collection

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('../data/labeled_data.csv')
df.dropna(inplace=True)
 
df.head(2)

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...


## Pre-Processing

#### Preprocess for BERT

In [3]:
from transformers import BertTokenizer
import re
from html import unescape

In [4]:
def preprocess_for_bert(tweet):
    tweet = unescape(tweet)  # Unescape HTML entities
    tweet = re.sub(r'@\w+', '', tweet)  # Remove mentions
    tweet = re.sub(r'http\S+', '', tweet)  # Remove URLs
    tweet = re.sub(r'(RT|rt)', '', tweet)  # Remove "RT" marker (case-insensitive)
    tweet = tweet.lower()  # Lowercasing if using uncased model
    tweet = re.sub(r'[^A-Za-z0-9\s]', '', tweet)  # Remove special characters
    tweet = re.sub(r'\s+', ' ', tweet).strip()  # Remove additional whitespace
    return tweet

print(df['tweet'][0])
preprocess_for_bert(df['tweet'][0])

!!! RT @mayasolovely: As a woman you shouldn't complain about cleaning up your house. &amp; as a man you should always take the trash out...


'as a woman you shouldnt complain about cleaning up your house as a man you should always take the trash out'

In [5]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
df['label'] = df['class'].apply(lambda x: 1 if x == 0 else 0) ## create label column
df['tweet'] = df['tweet'].apply(preprocess_for_bert)
# Tokenize using BERT tokenizer
df['tokens'] = df['tweet'].apply(lambda x: tokenizer.encode(x, add_special_tokens=True))

In [6]:
df = df[['tweet', 'label', 'tokens']]
df.head()

Unnamed: 0,tweet,label,tokens
0,as a woman you shouldnt complain about cleanin...,0,"[101, 2004, 1037, 2450, 2017, 5807, 2102, 1761..."
1,boy dats coldtyga dwn bad for cuffin dat hoe i...,0,"[101, 2879, 23755, 2015, 3147, 3723, 3654, 104..."
2,dawg you ever fuck a bitch and she sta to cry ...,0,"[101, 4830, 27767, 2017, 2412, 6616, 1037, 774..."
3,she look like a tranny,0,"[101, 2016, 2298, 2066, 1037, 25283, 4890, 102]"
4,the shit you hear about me might be true or it...,0,"[101, 1996, 4485, 2017, 2963, 2055, 2033, 2453..."


In [11]:
# Write the data frame into txt file
with open('../preprocessed/tweets_bert.txt', 'w', encoding='utf-8') as f:
    for title, body, tokens in zip(df.label.values, df.tweet.values, df.tokens.values):
        f.write(str(title) + '\n')
        f.write(body + '\n')
        f.write(str(tokens) + '\n')
    

In [9]:
## pickled format
import pickle

with open('../preprocessed/tweets_bert.pkl', 'wb') as f:
    pickle.dump(df, f)

In [7]:
# Write the data frame into txt file
with open('../preprocessed/tweets_as_text.txt', 'w', encoding='utf-8') as f:
    for label, body, _ in zip(df.label.values, df.tweet.values, df.tokens.values):
        f.write(body + '\n')