In [1]:
import pandas as pd
import numpy as np
import re
import nltk
nltk.download('stopwords')
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
%matplotlib inline
from matplotlib import pyplot as plt
import warnings
warnings.filterwarnings("ignore")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Hu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
dataset = pd.read_csv('dataset.csv', index_col = 'Unnamed: 0')
print(dataset.shape)

(24783, 6)


In [3]:
dataset.index=[np.arange(24783)]

In [4]:
dataset.head()

Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [5]:
y=dataset['class'].values
y

array([2, 1, 1, ..., 1, 1, 2], dtype=int64)

## Preprocess

In [6]:
# data cleaning
stemmer = nltk.PorterStemmer()
stopword = set(stopwords.words('english'))
new_tweet=[]
import string

def data_preprocess(review):
    review = re.sub(r'^\s+|\s+?$','',review) # remove leading and trailing whitespace
    review = re.sub(r'@[\w\-]+', '', review) # remove mentions
    review = re.sub(r'RT[\s]+', '', review) #remove retweet text 'RT'
    review = re.sub('[^a-zA-Z]',' ',review)#replace non-character with space
    review = review.lower() #lower the text
    review = re.sub('\[.*?\]', '', review)
    review = re.sub('https?://\S+|www\.\S+', '', review) # remove hyperlinks
    review = re.sub(r'#', '', review) # remove hashtags
    review = re.sub('[%s]' % re.escape(string.punctuation), '', review) # remove escape characters
    review = re.sub('\n', '', review)
    review = re.sub('\w*\d\w*', '', review)
    review = [word for word in review.split(' ') if word not in stopword and word not in string.punctuation] #remove stopwords and punctuations and tokenize
    review=" ".join(review)
    review = [stemmer.stem(word) for word in review.split(' ')] #use PorterStemmer
    
    review=" ".join(review)
    new_tweet.append(review)
    return review

In [7]:
dataset['processed_tweet'] = dataset['tweet'].apply(data_preprocess)

In [8]:
dataset[['tweet','processed_tweet']].head()

Unnamed: 0,tweet,processed_tweet
0,!!! RT @mayasolovely: As a woman you shouldn't...,woman complain clean hous amp man alway take t...
1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,boy dat cold tyga dwn bad cuffin dat hoe st place
2,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,dawg ever fuck bitch start cri confus shit
3,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,look like tranni
4,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,shit hear might true might faker bitch told ya


## Embedding

In [9]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(dataset["processed_tweet"].apply(lambda x: x.split(" ")))]

# training of the model
doc2vec_model = Doc2Vec(documents,vector_size=200, window=2, min_count=1, workers=4)

# transform each document (tweet) into a vector data
doc2vec_features = dataset["processed_tweet"].apply(lambda x: doc2vec_model.infer_vector(x.split(" "))).apply(pd.Series)
doc2vec_features.columns = ["doc2vec_vector_" + str(x) for x in doc2vec_features.columns]

In [10]:
doc2vec_features

Unnamed: 0,doc2vec_vector_0,doc2vec_vector_1,doc2vec_vector_2,doc2vec_vector_3,doc2vec_vector_4,doc2vec_vector_5,doc2vec_vector_6,doc2vec_vector_7,doc2vec_vector_8,doc2vec_vector_9,...,doc2vec_vector_190,doc2vec_vector_191,doc2vec_vector_192,doc2vec_vector_193,doc2vec_vector_194,doc2vec_vector_195,doc2vec_vector_196,doc2vec_vector_197,doc2vec_vector_198,doc2vec_vector_199
0,0.002370,0.001721,-0.005792,-0.000094,0.015952,-0.015845,-0.021174,0.022643,-0.004967,0.006594,...,-0.003230,-0.007095,0.034785,-0.013830,-0.010418,0.014935,0.010801,0.010530,-0.000150,0.004209
1,-0.014424,0.005881,0.005284,0.018182,0.021418,-0.021799,-0.005721,0.055685,-0.010746,0.007668,...,0.032443,-0.008155,0.000367,-0.022808,0.017549,0.017183,0.020983,-0.028478,-0.006098,0.007463
2,0.021725,0.006636,-0.006318,-0.006288,0.000538,-0.010310,0.005514,0.005221,0.014855,-0.034314,...,-0.013837,0.009467,-0.008689,-0.005960,-0.024160,-0.016575,-0.001938,0.016695,0.007768,0.009809
3,-0.007255,0.004057,0.001803,0.015086,0.009713,-0.001777,0.006810,0.007078,-0.006692,0.007976,...,0.005074,0.010969,-0.002338,-0.006479,0.009347,-0.001338,-0.000663,-0.009548,0.011847,-0.002020
4,0.022911,-0.002314,-0.006026,0.003586,0.013179,-0.017380,0.002289,0.038162,0.012686,-0.027958,...,0.005699,-0.004371,-0.002189,-0.005577,-0.015894,0.000340,0.015489,0.001899,-0.007307,0.005237
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24778,-0.007706,-0.015174,-0.005996,0.029347,0.018690,-0.007605,-0.002446,0.078190,-0.026010,0.037534,...,0.067985,-0.016618,-0.010740,-0.017850,0.045248,0.034561,0.020894,-0.059704,-0.011045,-0.033884
24779,0.008493,0.009701,-0.003325,0.012578,0.013831,-0.012190,-0.007050,0.030642,0.005496,-0.015047,...,0.003315,-0.006145,0.004209,-0.014305,-0.008194,-0.000293,0.004412,-0.000980,0.000244,0.009077
24780,0.021856,0.004335,0.017400,0.004199,0.020049,-0.011674,-0.005598,0.010822,0.025686,-0.038614,...,-0.007026,0.000277,-0.004181,0.000430,-0.026948,-0.017452,0.018596,0.013746,0.002697,0.034742
24781,0.001357,0.002004,-0.003960,0.014977,0.013875,-0.008794,0.000261,0.043789,-0.008657,0.008219,...,0.028393,-0.008763,-0.006103,-0.011387,0.011383,0.015028,0.011428,-0.023675,-0.003432,-0.003554


In [11]:
# find max length of tweets dataset['processed_tweet']
maxlen = -1
for i, rev in enumerate(new_tweet):
    tweet = rev.split()
    if (len(tweet)>maxlen):
        maxlen = len(tweet)
maxlen

28

In [12]:
tokenized_tweet = dataset['processed_tweet'].apply(lambda x:x.split())

In [23]:
from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(tokenized_tweet)
X = tokenizer.texts_to_sequences(tokenized_tweet)

In [24]:
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
X = pad_sequences(X, padding='pre',maxlen=28)
X.shape

(24783, 28)

In [25]:
vocab = doc2vec_model.wv.key_to_index.keys()
len(vocab)

19219

In [26]:
word_vec_dict = {}
for word in vocab:
    word_vec_dict[word] = doc2vec_model.wv.get_vector(word)

In [27]:
vocab_size = len(tokenizer.word_index) + 1
w_matrix = np.zeros((vocab_size, 200))

for word, i in tokenizer.word_index.items():
    embedd_vector = word_vec_dict.get(word)
    if embedd_vector is not None:
        w_matrix[i] = embedd_vector

w_matrix

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.41023266,  0.3140282 , -0.04731553, ..., -0.15827499,
         0.05977133,  0.32247439],
       [ 0.36642444,  0.22944017, -0.1887005 , ..., -0.24390805,
         0.02027663,  0.20542215],
       ...,
       [-0.00754355,  0.00188093,  0.00050273, ..., -0.03198026,
        -0.00378917, -0.01197336],
       [ 0.01364848, -0.00199141,  0.00249187, ..., -0.0145713 ,
        -0.00096715, -0.00157534],
       [ 0.00086747, -0.00748972,  0.00301885, ..., -0.01123519,
        -0.00317441, -0.00527421]])

## LSTM model

In [28]:
from tensorflow.keras.layers import Flatten, Dropout, Dense, LSTM, Embedding, Activation, Bidirectional
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from keras.callbacks import EarlyStopping
from keras.initializers import Constant
from keras.layers.convolutional import MaxPooling1D, Conv1D

In [29]:
model = Sequential()
model.add(Embedding(input_dim = vocab_size, output_dim = 200, input_length = maxlen, embeddings_initializer=Constant(w_matrix),trainable=False)) 
model.add(Dropout(0.2)) #0.2

model.add(Bidirectional(LSTM(64))) #64
model.add(Dropout(0.2))

model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))

model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))

model.add(Dense(1, activation = 'linear'))

In [30]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 28, 200)           3843800   
                                                                 
 dropout_4 (Dropout)         (None, 28, 200)           0         
                                                                 
 bidirectional_1 (Bidirectio  (None, 128)              135680    
 nal)                                                            
                                                                 
 dropout_5 (Dropout)         (None, 128)               0         
                                                                 
 dense_3 (Dense)             (None, 64)                8256      
                                                                 
 dropout_6 (Dropout)         (None, 64)                0         
                                                      

In [31]:
model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = 'accuracy')

In [32]:
from sklearn.model_selection import train_test_split
epochs = 50
batch_size = 32
x_train, x_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=0)
hist = model.fit(x_train, y_train, validation_data = (x_test, y_test), epochs = epochs,
                 batch_size = batch_size, shuffle=True)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
