In [10]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras import backend as K


In [11]:
# Load the dataset
data = pd.read_csv('./labeled_data.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [12]:
def recall(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [13]:
# Extract the text and label columns
text = data['tweet']
labels = data['class']

# Tokenize
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text)
sequences = tokenizer.texts_to_sequences(text)
vocab_size = len(tokenizer.word_index) + 1

# Padding
max_length = max(len(seq) for seq in sequences)
padded_sequences = pad_sequences(sequences, maxlen=max_length)
max_sequence_length = max(len(seq) for seq in sequences)

# this for num of labels (harusnya ada 3)
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)
num_classes = len(label_encoder.classes_)
one_hot_labels = to_categorical(encoded_labels)

X_train, X_test, y_train, y_test = train_test_split(padded_sequences, one_hot_labels, test_size=0.2, random_state=42)

# Define the LSTM model
model = Sequential()
model.add(Embedding(vocab_size, 100, input_length=max_sequence_length))
model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))
model.add(Dropout(0.5))
model.add(Dense(128, activation="relu"))
model.add(Dropout(0.5))
model.add(Dense(num_classes, activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy',f1,precision, recall])
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
108/620 [====>.........................] - ETA: 31s - loss: 0.0916 - accuracy: 0.9523 - f1: 0.9514 - precision: 0.9553 - recall: 0.9476

KeyboardInterrupt: 

In [7]:
import nltk
from nltk.corpus import stopwords
import re

### Preprocessing
it consist of:
- removing unique char (emoji, flags, non-alphabet char)
- url links
- stopwords
- changing @xxx into user
- null tweets

In [8]:
import demoji
import html

In [15]:
stop_words = set(stopwords.words('english'))

def remove_emojis(raw_text):
    text = html.unescape(raw_text)
    text = demoji.replace(text, '')
    return text

def remove_stopwords(raw_text):
    tokenize = nltk.word_tokenize(raw_text)
    text = [word for word in tokenize if not word.lower() in stop_words]
    text = " ".join(text)

    return text
# Preprocessing
def remove_url(raw_text):
    url_regex = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
    text = re.sub(url_regex, '', raw_text)

    return text

def change_user(raw_text):
    regex = r"@([^ ]+)"
    text = re.sub(regex, "user", raw_text)
    return text
        

In [17]:
test_list = [
    'got ya bitch tip toeing on my hardwood floors "" &#128514; http://t.co/cOU2WQ5L4q"',
    "pussy is a powerful drug @juanwmv "" &#128517; #HappyHumpDay http://t.co/R8jsymiB5b",
    "...Son of a bitch took my Tic Tacs.",
    '"@2015seniorprobs: I probably wouldn&#8217;t mind school as much if we didn&#8217;t have to deal with bitch ass teachers"". Retweet',
    '"""..All I wanna do is get money and fuck model bitches!"" - Russell Simmons"',
    "@AutoWorId: Hennessey Venom GT &#128584; http://t.co/i8eGMnKaJ9"" that's one sexy bitch"
]

for x in test_list:
    print(x)

got ya bitch tip toeing on my hardwood floors "" &#128514; http://t.co/cOU2WQ5L4q"
pussy is a powerful drug @juanwmv  &#128517; #HappyHumpDay http://t.co/R8jsymiB5b
...Son of a bitch took my Tic Tacs.
"@2015seniorprobs: I probably wouldn&#8217;t mind school as much if we didn&#8217;t have to deal with bitch ass teachers"". Retweet
"""..All I wanna do is get money and fuck model bitches!"" - Russell Simmons"
@AutoWorId: Hennessey Venom GT &#128584; http://t.co/i8eGMnKaJ9 that's one sexy bitch


In [18]:
def remove_noise(datas):
    clean = []
    # change the @xxx into "user"
    clean = [change_user(text) for text in datas]
    # remove emojis (specifically unicode emojis)
    clean = [remove_emojis(text) for text in clean]
    # remove urls
    clean = [remove_url(text) for text in clean]
    # remove stopwords
    # clean = [remove_stopwords(text) for text in clean]
    return clean

test_list = remove_noise(test_list)
for x in test_list:
    print(x)

got ya bitch tip toeing on my hardwood floors ""  "
pussy is a powerful drug user   #HappyHumpDay 
...Son of a bitch took my Tic Tacs.
"user I probably wouldn’t mind school as much if we didn’t have to deal with bitch ass teachers"". Retweet
"""..All I wanna do is get money and fuck model bitches!"" - Russell Simmons"
user Hennessey Venom GT   that's one sexy bitch


In [19]:
for x in stop_words:
    print(x)

no
ll
we
hers
will
d
re
a
between
are
couldn't
until
above
these
both
she
ours
once
on
with
themselves
that'll
doesn
am
it
shouldn
wasn't
mustn't
the
if
mightn
my
for
don't
again
then
most
this
having
where
her
s
won
shan't
aren
haven
against
below
it's
needn
theirs
yourself
hasn
you'll
very
ve
so
when
whom
aren't
because
yourselves
what
in
than
your
and
myself
needn't
after
those
do
such
under
can
shan
isn't
i
or
ourselves
t
why
while
me
as
nor
she's
isn
o
off
only
just
about
does
shouldn't
m
hadn
wouldn't
was
before
our
there
who
how
didn
doesn't
some
ain
weren't
couldn
yours
an
weren
y
you
each
which
to
out
here
be
through
himself
too
hasn't
herself
over
itself
down
more
own
its
during
same
further
by
should
ma
hadn't
didn't
don
them
their
at
now
you're
you'd
all
you've
doing
won't
they
has
he
into
haven't
that
from
did
other
is
been
not
wasn
his
were
being
few
wouldn
him
should've
mustn
mightn't
up
have
had
but
any
of


- got ya bitch tip toeing hardwood floors `` '' ``
- pussy powerful drug user # HappyHumpDay
- ... Son bitch took Tic Tacs .
- `` user probably ’ mind school much ’ deal bitch ass teachers '' '' . Retweet
- `` `` '' .. wan na get money fuck model bitches ! '' '' - Russell Simmons ''
- user Hennessey Venom GT 's one sexy bitch

got ya bitch tip toeing hardwood floors `` '' & # 128514 ; ``
pussy powerful drug user & # 128517 ; # HappyHumpDay
... Son bitch took Tic Tacs .
`` user probably & # 8217 ; mind school much & # 8217 ; deal bitch ass teachers '' '' . Retweet
`` `` '' .. wan na get money fuck model bitches ! '' '' - Russell Simmons ''
user Hennessey Venom GT & # 128584 ; 's one sexy bitch


In [None]:
# Split into training and testing sets
train_text, test_text, train_labels, test_labels = train_test_split(text, labels, test_size=0.2, random_state=42)

# Tokenize the text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_text)
vocab_size = len(tokenizer.word_index) + 1

# Convert text to sequences
train_sequences = tokenizer.texts_to_sequences(train_text)
test_sequences = tokenizer.texts_to_sequences(test_text)

# Pad sequences to ensure uniform length
max_sequence_length = max(len(seq) for seq in train_sequences)
train_sequences = pad_sequences(train_sequences, maxlen=max_sequence_length, padding='post')
test_sequences = pad_sequences(test_sequences, maxlen=max_sequence_length, padding='post')

# Define the LSTM model
model = Sequential()
model.add(Embedding(vocab_size, 100, input_length=max_sequence_length))
model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='softmax'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(train_sequences, train_labels, epochs=10, batch_size=32, validation_split=0.2)

# Evaluate the model
test_loss, test_acc = model.evaluate(test_sequences, test_labels)

# Make predictions
predictions = model.predict(test_sequences)