# Data Collection

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('../data/labeled_data.csv')
df.dropna(inplace=True)
 
df.head(2)

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...


In [3]:
df = df[['tweet', 'class']]

## Pre-Processing

In [4]:
import re
import nltk
from html import unescape
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize


In [5]:
# Initialize necessary tools
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bemne\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\bemne\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\bemne\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
def preprocess_tweet(tweet):
    tweet = unescape(tweet)  # Unescape HTML entities
    tweet = re.sub(r'\bRT\b', '', tweet)  # Remove "RT" markers
    tweet = re.sub(r'@\w+', '', tweet)  # Remove mentions
    tweet = re.sub(r'http\S+', '', tweet)  # Remove URLs
    tweet = re.sub(r'[^A-Za-z\s]', '', tweet)  # Remove special characters
    tweet = re.sub(r'\d+', '', tweet)  # Remove numeric digits
    tweet = tweet.lower()  # Lowercase
    
    tokens = word_tokenize(tweet)  # Tokenize
    tokens = [word for word in tokens if word not in stop_words]  # Remove stopwords
    tokens = [lemmatizer.lemmatize(word) for word in tokens]  # Lemmatize
    tweet = ' '.join(tokens)  # Join tokens back into a single string
    return tweet, tokens


In [7]:
df['tweet'], df['tokens'] = zip(*df['tweet'].apply(preprocess_tweet))
df.loc[:, 'label'] = df['class'].apply(lambda x: 1 if x == 0 else 0)  # Create label column
df = df.drop(columns=['class'])
df.head()

Unnamed: 0,tweet,tokens,label
0,woman shouldnt complain cleaning house man alw...,"[woman, shouldnt, complain, cleaning, house, m...",0
1,boy dat coldtyga dwn bad cuffin dat hoe st place,"[boy, dat, coldtyga, dwn, bad, cuffin, dat, ho...",0
2,dawg ever fuck bitch start cry confused shit,"[dawg, ever, fuck, bitch, start, cry, confused...",0
3,look like tranny,"[look, like, tranny]",0
4,shit hear might true might faker bitch told ya,"[shit, hear, might, true, might, faker, bitch,...",0


In [8]:
## save to csv
df.to_csv('../preprocessed/tweets_general.csv', index=False)