# **Machine Learning Techniques Project**


*   Nicolas Bedoya Figueroa
*   Daniel Escalante Perez
*   Marilyn Stephany Joven Fonseca
*   Eder Leandro Carbonero Baquero

## **Utils**

In [None]:
!pip install nltk pyspellchecker tqdm emoji nlpaug transformers

Collecting pyspellchecker
  Downloading pyspellchecker-0.8.2-py3-none-any.whl.metadata (9.4 kB)
Collecting emoji
  Downloading emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Collecting nlpaug
  Downloading nlpaug-1.1.11-py3-none-any.whl.metadata (14 kB)
Downloading pyspellchecker-0.8.2-py3-none-any.whl (7.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m46.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading emoji-2.14.1-py3-none-any.whl (590 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m590.6/590.6 kB[0m [31m28.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading nlpaug-1.1.11-py3-none-any.whl (410 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.5/410.5 kB[0m [31m23.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyspellchecker, emoji, nlpaug
Successfully installed emoji-2.14.1 nlpaug-1.1.11 pyspellchecker-0.8.2


In [None]:
#Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from spellchecker import SpellChecker
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from tqdm import tqdm
import nlpaug.augmenter.word as naw
from nlpaug.util import Action
import emoji
import random
import math
from nltk.corpus import wordnet
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

## **Data preprocessing**

### **Dataset 1: Davidson et al. 2017**

In [None]:
# Load the dataset
davidson = pd.read_csv("./data/davidson_2017.csv")[["class", "tweet"]]
davidson.head()

Unnamed: 0,class,tweet
0,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [None]:
# Original class labels: 0 - hate speech, 1 - offensive language, 2 - neither
# Transform the label in 1: toxic and 0: non toxic
davidson["class"] = davidson["class"].replace({0: 1, 2: 0})
davidson["class"].value_counts()

Unnamed: 0_level_0,count
class,Unnamed: 1_level_1
1,20620
0,4163


In [None]:
print("Rows:", davidson.shape[0])
print("Columns:", davidson.shape[1])

Rows: 24783
Columns: 2


### **Dataset 2: HASOC (2019) English**

In [None]:
hasoc = pd.read_csv("./data/HASOC_EN.tsv",sep = '\t')[["text","task_1"]]
hasoc.head()

Unnamed: 0,text,task_1
0,#DhoniKeepsTheGlove | WATCH: Sports Minister K...,NOT
1,@politico No. We should remember very clearly ...,HOF
2,@cricketworldcup Guess who would be the winner...,NOT
3,Corbyn is too politically intellectual for #Bo...,NOT
4,All the best to #TeamIndia for another swimmin...,NOT


In [None]:
# Transform the label to 1: toxic and 0: non toxic
hasoc["task_1"] = hasoc["task_1"].replace({"HOF": 1, "NOT": 0})
hasoc["task_1"].value_counts()

  hasoc["task_1"] = hasoc["task_1"].replace({"HOF": 1, "NOT": 0})


Unnamed: 0_level_0,count
task_1,Unnamed: 1_level_1
0,3591
1,2261


In [None]:
# Change column names to match the other datasets
hasoc = hasoc.rename(columns={'task_1': 'class', 'text': 'tweet'})
hasoc.head()

Unnamed: 0,tweet,class
0,#DhoniKeepsTheGlove | WATCH: Sports Minister K...,0
1,@politico No. We should remember very clearly ...,1
2,@cricketworldcup Guess who would be the winner...,0
3,Corbyn is too politically intellectual for #Bo...,0
4,All the best to #TeamIndia for another swimmin...,0


In [None]:
print("Rows:", hasoc.shape[0])
print("Columns:", hasoc.shape[1])

Rows: 5852
Columns: 2


### **Dataset 3: Zeerak Talat’s Hate Speech Dataset**

In [None]:
# Load the dataset
zeerak = pd.read_csv("./data/NAACL_SRW_2016_fixed.csv")[["class", "text"]]
zeerak.head()

Unnamed: 0,class,text
0,racism,So Drasko just said he was impressed the girls...
1,racism,Drasko they didn't cook half a bird you idiot ...
2,racism,Hopefully someone cooks Drasko in the next ep ...
3,racism,of course you were born in serbia...you're as ...
4,racism,RT @YesYoureRacist: At least you're only a tin...


In [None]:
# Original class labels
zeerak["class"].value_counts()

Unnamed: 0_level_0,count
class,Unnamed: 1_level_1
none,7060
sexism,2577
racism,11


In [None]:
# Transform the label to 1: toxic and 0: non toxic
zeerak["class"] = zeerak["class"].replace({"sexism": 1, "racism": 1, "none": 0})
zeerak["class"].value_counts()

  zeerak["class"] = zeerak["class"].replace({"sexism": 1, "racism": 1, "none": 0})


Unnamed: 0_level_0,count
class,Unnamed: 1_level_1
0,7060
1,2588


In [None]:
# Rename the columns to match the other datasets
zeerak = zeerak.rename(columns={'text': 'tweet'})
zeerak.head()

Unnamed: 0,class,tweet
0,1,So Drasko just said he was impressed the girls...
1,1,Drasko they didn't cook half a bird you idiot ...
2,1,Hopefully someone cooks Drasko in the next ep ...
3,1,of course you were born in serbia...you're as ...
4,1,RT @YesYoureRacist: At least you're only a tin...


In [None]:
print("Rows:", zeerak.shape[0])
print("Columns:", zeerak.shape[1])

Rows: 9648
Columns: 2


### **Concatenation**

In [None]:
data = pd.concat([davidson, hasoc, zeerak], axis=0, ignore_index=True)
data.head()

Unnamed: 0,class,tweet
0,0,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [None]:
data["class"].value_counts()

Unnamed: 0_level_0,count
class,Unnamed: 1_level_1
1,25469
0,14814


In [None]:
print("Rows:", data.shape[0])
print("Columns:", data.shape[1])

Rows: 40283
Columns: 2


### **Cleaning**

In [None]:
#Check point
data_cleaning = data.copy()

In [None]:
# Transform emojis into words

def emoji_to_words(text):
  return emoji.demojize(text, language='en')

data_cleaning['tweet'] = data_cleaning['tweet'].apply(emoji_to_words)

In [None]:
# Remove URLs from tweets

def remove_urls(text):
  return re.sub(r'http\S+', '', text)

data_cleaning['tweet'] = data_cleaning['tweet'].apply(remove_urls)

In [None]:
# Remove mentions from tweets
def remove_mentions(text):
  return re.sub(r'@\w+', '', text)

data_cleaning['tweet'] = data_cleaning['tweet'].apply(remove_mentions)


In [None]:
# Remove symbols from tweets

def leave_letters(text):
  return re.sub(r'[^a-zA-Z]', ' ', text)

data_cleaning['tweet'] = data_cleaning['tweet'].apply(leave_letters)

In [None]:
# Remove symbols from tweets

def lowercase(text):
  return text.lower()

data_cleaning['tweet'] = data_cleaning['tweet'].apply(lowercase)

In [None]:
# Correct spelling
spell = SpellChecker()

def correct_spelling(text):
  words = text.split()
  corrected_words = [spell.correction(word) or word for word in words]
  return ' '.join(corrected_words)

data_cleaning['tweet'] = [correct_spelling(text) for text in tqdm(data_cleaning['tweet'])]

In [None]:
# Remove stopwords

def remove_stopwords(text):
  stop_words = set(stopwords.words('english'))  # Use English stop words
  words = text.split()
  filtered_words = [word for word in words if word not in stop_words]
  return " ".join(filtered_words)

data_cleaning['tweet'] = data_cleaning['tweet'].apply(remove_stopwords)

In [None]:
# Stemming the words

stemmer = PorterStemmer()

def stem_text(text):
  words = text.split()
  stemmed_words = [stemmer.stem(word) for word in words]
  return " ".join(stemmed_words)

  data_cleaning['tweet'] = data_cleaning['tweet'].apply(stem_text)


In [None]:
data_cleaning['tweet'].head(200)

Unnamed: 0,tweet
0,rt woman complain cleaning house amp man alway...
1,rt boy dats cold tyga dwn bad cuffin dat hoe s...
2,rt dawg rt ever fuck bitch start cry confused ...
3,rt look like tranny
4,rt shit hear might true might faker bitch told ya
...,...
195,tired bitches saying look mean nigga big af we...
196,birds grandkids may never see thanks climate c...
197,stay beautiful bitch
198,wutkinda r purple ceeeleee man gurl jus playin...


In [None]:
# Check and remove duplicates
print(f'Duplicates: {data_cleaning["tweet"].duplicated().sum()}')
duplicated_tweets = data_cleaning["tweet"].duplicated()
data_cleaning = data_cleaning[~duplicated_tweets]

Duplicates: 1294


In [None]:
# Check for null or empty again if they appeared due to augmentation

keep = ~((data_cleaning["tweet"].isnull()) | (data_cleaning["tweet"] == ""))

print(f'Number of nulls or empty: {(~keep).sum()}')

data_cleaning = data_cleaning[keep]

print(f'Cleaned data shape: {data_cleaning.shape}')

Number of nulls or empty: 0
Cleaned data shape: (38988, 2)


In [None]:
# Checking the dataset's balance

data_cleaning["class"].value_counts()

Unnamed: 0_level_0,count
class,Unnamed: 1_level_1
1,24667
0,14321


In [None]:
# Function to get a random synonym of a word

def get_synonym(word):
  synonyms = []
  for syn in wordnet.synsets(word):
    for lemma in syn.lemmas():
      synonyms.append(lemma.name())
  if len(synonyms) > 0:
    synonyms = list(set(synonyms))
    return synonyms[random.randint(0, len(synonyms) - 1)]
  else:
    return ""

In [None]:
# Custom random insertion function

def random_synonym_insert_augment(text, alpha):

  words = text.split()
  new_text = words.copy()
  for word in words:
    if random.random() < alpha:
      synonym = get_synonym(word)
      if synonym != "":
        position = random.randint(0, len(new_text) - 1)
        new_text.insert(position, synonym)

  return [" ".join(new_text)]

In [None]:
# Balance the data set using easy data augmentation

#Choose an alpha parameter (Percentage of words in a sentence that are changed)
alpha = 0.25

aug_synonym = naw.SynonymAug(aug_src='wordnet', aug_p = alpha)
aug_swap = naw.RandomWordAug(action="swap", aug_p = alpha)
aug_delete = naw.RandomWordAug(action="delete", aug_p = alpha)


# Store the new rows
new_rows = []

# Size to balance the classes
desired_size = len(data_cleaning[data_cleaning['class'] == 1]) - len(data_cleaning[data_cleaning['class'] == 0])

non_toxic = data_cleaning[data_cleaning['class'] == 0]

# Until balanced
while len(new_rows) < desired_size:
  # Get a random sample from the minority class
  random_row = non_toxic.sample(1)

  # Pick a random EDA technique and apply it
  random_num = random.randint(1, 4)
  augmented_text = []

  if random_num == 1:
    augmented_text = aug_synonym.augment(random_row['tweet'].values[0])
  elif random_num == 2:
    augmented_text = aug_swap.augment(random_row['tweet'].values[0])
  elif random_num == 3:
    augmented_text = random_synonym_insert_augment(random_row['tweet'].values[0], alpha)
  else:
    augmented_text = aug_delete.augment(random_row['tweet'].values[0])

  if len(augmented_text) > 0:
    new_rows.append({ "tweet": augmented_text[0], "class": random_row['class'].values[0] })


# New rows dataframe
new_rows_df = pd.DataFrame(new_rows)

# Concatenate the datasets
balanced_data = pd.concat([data_cleaning, new_rows_df], ignore_index = True, axis = 0)

print(f'Balanced data shape: {balanced_data.shape}')
print()
balanced_data["class"].value_counts()


Balanced data shape: (49334, 2)



Unnamed: 0_level_0,count
class,Unnamed: 1_level_1
0,24667
1,24667


In [None]:
# Check and remove duplicates again that could have appeared due to augmentation
print(f'Duplicates: {balanced_data["tweet"].duplicated().sum()}')
duplicated_tweets = balanced_data["tweet"].duplicated()
balanced_data = balanced_data[~duplicated_tweets]

Duplicates: 1204


In [None]:
# Check for null or empty again if they appeared due to augmentation

keep = ~((balanced_data["tweet"].isnull()) | (balanced_data["tweet"] == ""))

print(f'Number of nulls or empty: {(~keep).sum()}')

balanced_data = balanced_data[keep]

print(f'Balanced data shape: {balanced_data.shape}')

Number of nulls or empty: 0
Balanced data shape: (48130, 2)


In [None]:
# Final distribution

balanced_data["class"].value_counts()

Unnamed: 0_level_0,count
class,Unnamed: 1_level_1
1,24667
0,23463


In [None]:
balanced_data.tail(100)

Unnamed: 0,class,tweet
49221,0,apparently walking catwalk involves putting on...
49222,0,another week let bojo canvass borisjohnson lov...
49223,0,birds lol
49224,0,straight sociopath twitter acct liquidator gui...
49226,0,still still unacceptable
...,...,...
49329,0,time nevertheless uranium wake realize time si...
49330,0,used tech evangelist idea press works thanks e...
49331,0,cat kunt mkr
49332,0,fire racists hire qualify someone qualified em...


In [None]:
balanced_data.to_csv('balanced_data.csv', index=False)