In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import nltk
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from imblearn.over_sampling import SMOTE

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Kaggle Dataset - https://www.kaggle.com/datasets/mrmorj/hate-speech-and-offensive-language-dataset/data

df = pd.read_csv('/content/labeled_data.csv')

In [None]:
df.head(15)

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...
5,5,3,1,2,0,1,"!!!!!!!!!!!!!!!!!!""@T_Madison_x: The shit just..."
6,6,3,0,3,0,1,"!!!!!!""@__BrighterDays: I can not just sit up ..."
7,7,3,0,3,0,1,!!!!&#8220;@selfiequeenbri: cause I'm tired of...
8,8,3,0,3,0,1,""" &amp; you might not get ya bitch back &amp; ..."
9,9,3,1,2,0,1,""" @rhythmixx_ :hobbies include: fighting Maria..."


In [None]:
display(df['class'].unique())
df.shape

array([2, 1, 0])

(24783, 7)

In [None]:
df.columns

Index(['Unnamed: 0', 'count', 'hate_speech', 'offensive_language', 'neither',
       'class', 'tweet'],
      dtype='object')

In [None]:
df.drop(columns = ['Unnamed: 0', 'count', 'hate_speech', 'offensive_language', 'neither'], inplace = True)

In [None]:
df

Unnamed: 0,class,tweet
0,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...
...,...,...
24778,1,you's a muthaf***in lie &#8220;@LifeAsKing: @2...
24779,2,"you've gone and broke the wrong heart baby, an..."
24780,1,young buck wanna eat!!.. dat nigguh like I ain...
24781,1,youu got wild bitches tellin you lies


In [None]:
df.isnull().sum()

Unnamed: 0,0
class,0
tweet,0


In [None]:
df['tweet'].iloc[0]

"!!! RT @mayasolovely: As a woman you shouldn't complain about cleaning up your house. &amp; as a man you should always take the trash out..."

In [None]:
df['tweet'].iloc[1000]

'&#128514;&#128514;&#128514;&#128514;&#128514;&#128514;&#128514;"@betysweetcocker: That pussy is just....&#128561; imma assume she just had a baby like..the day before"'

In [None]:
df['Processed_tweet'] = df['tweet'].str.replace(r'[^a-zA-Z]', ' ', regex = True)

In [None]:
df['Processed_tweet'].iloc[0]

'    RT  mayasolovely  As a woman you shouldn t complain about cleaning up your house   amp  as a man you should always take the trash out   '

In [None]:
df['Processed_tweet'].iloc[1000]

'                                                                 betysweetcocker  That pussy is just              imma assume she just had a baby like  the day before '

In [None]:
df['Processed_tweet_latest'] = df['Processed_tweet'].str.replace(r'[\s]+', ' ', regex = True)

In [None]:
df['Processed_tweet_latest'].iloc[0]

' RT mayasolovely As a woman you shouldn t complain about cleaning up your house amp as a man you should always take the trash out '

In [None]:
df['Processed_tweet_latest'].iloc[1000]

' betysweetcocker That pussy is just imma assume she just had a baby like the day before '

In [None]:
df

Unnamed: 0,class,tweet,Processed_tweet,Processed_tweet_latest
0,2,!!! RT @mayasolovely: As a woman you shouldn't...,RT mayasolovely As a woman you shouldn t...,RT mayasolovely As a woman you shouldn t comp...
1,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,RT mleew boy dats cold tyga dwn ba...,RT mleew boy dats cold tyga dwn bad for cuffi...
2,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,RT UrKindOfBrand Dawg RT sbaby...,RT UrKindOfBrand Dawg RT sbaby life You ever ...
3,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,RT C G Anderson viva based she lo...,RT C G Anderson viva based she look like a tr...
4,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,RT ShenikaRoberts The shit you...,RT ShenikaRoberts The shit you hear about me ...
...,...,...,...,...
24778,1,you's a muthaf***in lie &#8220;@LifeAsKing: @2...,you s a muthaf in lie LifeAsKing ...,you s a muthaf in lie LifeAsKing Pearls corey ...
24779,2,"you've gone and broke the wrong heart baby, an...",you ve gone and broke the wrong heart baby an...,you ve gone and broke the wrong heart baby and...
24780,1,young buck wanna eat!!.. dat nigguh like I ain...,young buck wanna eat dat nigguh like I ain...,young buck wanna eat dat nigguh like I aint fu...
24781,1,youu got wild bitches tellin you lies,youu got wild bitches tellin you lies,youu got wild bitches tellin you lies


In [None]:
df.drop(columns = ['tweet', 'Processed_tweet'], inplace = True)

In [None]:
df

Unnamed: 0,class,Processed_tweet_latest
0,2,RT mayasolovely As a woman you shouldn t comp...
1,1,RT mleew boy dats cold tyga dwn bad for cuffi...
2,1,RT UrKindOfBrand Dawg RT sbaby life You ever ...
3,1,RT C G Anderson viva based she look like a tr...
4,1,RT ShenikaRoberts The shit you hear about me ...
...,...,...
24778,1,you s a muthaf in lie LifeAsKing Pearls corey ...
24779,2,you ve gone and broke the wrong heart baby and...
24780,1,young buck wanna eat dat nigguh like I aint fu...
24781,1,youu got wild bitches tellin you lies


In [None]:
def lemmatize(text):
  lemmatizer = WordNetLemmatizer()
  tokens = word_tokenize(text)
  lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
  return ' '.join(lemmatized_tokens)

In [None]:
df['Lemmatized_tweet'] = df['Processed_tweet_latest'].apply(lemmatize)

In [None]:
df

Unnamed: 0,class,Processed_tweet_latest,Lemmatized_tweet
0,2,RT mayasolovely As a woman you shouldn t comp...,RT mayasolovely As a woman you shouldn t compl...
1,1,RT mleew boy dats cold tyga dwn bad for cuffi...,RT mleew boy dat cold tyga dwn bad for cuffin ...
2,1,RT UrKindOfBrand Dawg RT sbaby life You ever ...,RT UrKindOfBrand Dawg RT sbaby life You ever f...
3,1,RT C G Anderson viva based she look like a tr...,RT C G Anderson viva based she look like a tranny
4,1,RT ShenikaRoberts The shit you hear about me ...,RT ShenikaRoberts The shit you hear about me m...
...,...,...,...
24778,1,you s a muthaf in lie LifeAsKing Pearls corey ...,you s a muthaf in lie LifeAsKing Pearls corey ...
24779,2,you ve gone and broke the wrong heart baby and...,you ve gone and broke the wrong heart baby and...
24780,1,young buck wanna eat dat nigguh like I aint fu...,young buck wan na eat dat nigguh like I aint f...
24781,1,youu got wild bitches tellin you lies,youu got wild bitch tellin you lie


In [None]:
stop_words = set(stopwords.words('english'))

In [None]:
def remove_stopwords(text):
  tokens = word_tokenize(text)
  filtered_tokens = []
  for token in tokens:
    if token.lower() not in stop_words:
      filtered_tokens.append(token)
  return ' '.join(filtered_tokens)

In [None]:
df['Final_tweet'] = df['Lemmatized_tweet'].apply(remove_stopwords)

In [None]:
df

Unnamed: 0,class,Processed_tweet_latest,Lemmatized_tweet,Final_tweet
0,2,RT mayasolovely As a woman you shouldn t comp...,RT mayasolovely As a woman you shouldn t compl...,RT mayasolovely woman complain cleaning house ...
1,1,RT mleew boy dats cold tyga dwn bad for cuffi...,RT mleew boy dat cold tyga dwn bad for cuffin ...,RT mleew boy dat cold tyga dwn bad cuffin dat ...
2,1,RT UrKindOfBrand Dawg RT sbaby life You ever ...,RT UrKindOfBrand Dawg RT sbaby life You ever f...,RT UrKindOfBrand Dawg RT sbaby life ever fuck ...
3,1,RT C G Anderson viva based she look like a tr...,RT C G Anderson viva based she look like a tranny,RT C G Anderson viva based look like tranny
4,1,RT ShenikaRoberts The shit you hear about me ...,RT ShenikaRoberts The shit you hear about me m...,RT ShenikaRoberts shit hear might true might f...
...,...,...,...,...
24778,1,you s a muthaf in lie LifeAsKing Pearls corey ...,you s a muthaf in lie LifeAsKing Pearls corey ...,muthaf lie LifeAsKing Pearls corey emanuel rig...
24779,2,you ve gone and broke the wrong heart baby and...,you ve gone and broke the wrong heart baby and...,gone broke wrong heart baby drove redneck crazy
24780,1,young buck wanna eat dat nigguh like I aint fu...,young buck wan na eat dat nigguh like I aint f...,young buck wan na eat dat nigguh like aint fuc...
24781,1,youu got wild bitches tellin you lies,youu got wild bitch tellin you lie,youu got wild bitch tellin lie


In [None]:
vocab_size = 10000
onehot_tweet = []
for text in df['Final_tweet']:
    one_hot_representation = one_hot(text, vocab_size)
    onehot_tweet.append(one_hot_representation)

In [None]:
onehot_tweet[0]

[1167, 3798, 8684, 1710, 7215, 3532, 844, 9849, 230, 1683, 3915]

In [None]:
for i in range(0, 5):
  print(onehot_tweet[i])

[1167, 3798, 8684, 1710, 7215, 3532, 844, 9849, 230, 1683, 3915]
[1167, 5982, 6994, 4264, 1291, 8154, 6920, 8848, 4790, 4264, 713, 256, 8164]
[1167, 4968, 1803, 1167, 2619, 6140, 1789, 9126, 4138, 5853, 98, 3553, 1477]
[1167, 251, 5463, 1722, 1304, 7895, 1706, 8531, 6088]
[1167, 9261, 1477, 4981, 3436, 696, 3436, 6202, 4138, 3485, 5441]


In [None]:
sentence_length = 20
padded_tweet = pad_sequences(onehot_tweet, padding = 'pre', maxlen = sentence_length)

In [None]:
for i in range(0, 5):
  print(padded_tweet[i])

[   0    0    0    0    0    0    0    0    0 1167 3798 8684 1710 7215
 3532  844 9849  230 1683 3915]
[   0    0    0    0    0    0    0 1167 5982 6994 4264 1291 8154 6920
 8848 4790 4264  713  256 8164]
[   0    0    0    0    0    0    0 1167 4968 1803 1167 2619 6140 1789
 9126 4138 5853   98 3553 1477]
[   0    0    0    0    0    0    0    0    0    0    0 1167  251 5463
 1722 1304 7895 1706 8531 6088]
[   0    0    0    0    0    0    0    0    0 1167 9261 1477 4981 3436
  696 3436 6202 4138 3485 5441]


In [None]:
df['class'].value_counts()

Unnamed: 0_level_0,count
class,Unnamed: 1_level_1
1,19190
2,4163
0,1430


In [None]:
X = padded_tweet
y = df['class']

In [None]:
smote = SMOTE(sampling_strategy = 'minority')
X, y = smote.fit_resample(X, y)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 123)

In [None]:
model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(vocab_size, 50, input_length = sentence_length),
    tf.keras.layers.LSTM(128, return_sequences = True),
    tf.keras.layers.LSTM(50, return_sequences = True),
    tf.keras.layers.LSTM(128),
    tf.keras.layers.Dense(3, activation = 'softmax')
])



In [None]:
model.compile(optimizer = 'adam', loss = 'SparseCategoricalCrossentropy', metrics = ['accuracy'])

In [None]:
model.fit(X_train, y_train, epochs = 5, batch_size = 32)

Epoch 1/5
[1m1064/1064[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 73ms/step - accuracy: 0.7715 - loss: 0.5177
Epoch 2/5
[1m1064/1064[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 77ms/step - accuracy: 0.9445 - loss: 0.1720
Epoch 3/5
[1m1064/1064[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 76ms/step - accuracy: 0.9636 - loss: 0.1183
Epoch 4/5
[1m1064/1064[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 75ms/step - accuracy: 0.9736 - loss: 0.0868
Epoch 5/5
[1m1064/1064[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 75ms/step - accuracy: 0.9800 - loss: 0.0678


<keras.src.callbacks.history.History at 0x7b9419be9ed0>

In [None]:
loss, accuracy = model.evaluate(X_test, y_test)
print('Test Loss:', loss)
print('Test Accuracy:', accuracy)

[1m266/266[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 38ms/step - accuracy: 0.9076 - loss: 0.3746
Test Loss: 0.3800584375858307
Test Accuracy: 0.9051592350006104


In [None]:
y_pred = np.argmax(model.predict(X_test), axis = 1)

[1m266/266[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 28ms/step


In [None]:
print('Accuracy:', accuracy_score(y_test, y_pred))

Accuracy: 0.9051592431543072
