In [1]:
import pandas as pd
import numpy as np
import re
import nltk
nltk.download('stopwords')
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
%matplotlib inline
from matplotlib import pyplot as plt
import warnings
warnings.filterwarnings("ignore")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Hu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Load the preprocessed tweet table from a CSV file in the working directory.
dataset = pd.read_csv('preprocessed_data.csv')
# Print (rows, columns) as a quick sanity check of the load.
print(dataset.shape)

(24783, 8)


## Preprocess

In [3]:
# Count the missing (null) values in every column of the dataset.
dataset.isnull().sum()

Unnamed: 0            0
count                 0
hate_speech           0
offensive_language    0
neither               0
class                 0
tweet                 0
processed_tweet       2
dtype: int64

In [4]:
# Keep only the rows that have no missing value in any column.
dataset = dataset.dropna(axis=0, how='any')

In [5]:
# Shape of the dataset once rows with missing values have been dropped.
dataset.shape

(24781, 8)

## Embedding

In [6]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Tokenise every tweet exactly once, splitting on arbitrary whitespace.
# Plain split() matches the tokenisation used by the later cells (max-length
# computation and Keras tokenizer) and never yields empty-string tokens when a
# tweet contains consecutive spaces, which split(" ") would.
tweet_tokens = [str(text).split() for text in dataset["processed_tweet"]]

# Tag each tweet with its position in the (already cleaned) dataset.
documents = [TaggedDocument(tokens, [tag]) for tag, tokens in enumerate(tweet_tokens)]

# training of the model
# NOTE(review): gensim documents that multi-worker training is not exactly
# reproducible between runs; use workers=1 if repeatable vectors are needed.
doc2vec_model = Doc2Vec(documents, vector_size=100, window=2, min_count=1, workers=4)

# transform each document (tweet) into a vector data
# Stacking the inferred vectors into one 2-D array avoids building a separate
# pandas Series per row (the `.apply(pd.Series)` pattern), which is very slow.
doc2vec_features = pd.DataFrame(
    np.vstack([doc2vec_model.infer_vector(tokens) for tokens in tweet_tokens]),
    index=dataset.index,
)
doc2vec_features.columns = ["doc2vec_vector_" + str(x) for x in doc2vec_features.columns]

In [7]:
# Display the inferred Doc2Vec vectors: one row per tweet, one column per vector component.
doc2vec_features

Unnamed: 0,doc2vec_vector_0,doc2vec_vector_1,doc2vec_vector_2,doc2vec_vector_3,doc2vec_vector_4,doc2vec_vector_5,doc2vec_vector_6,doc2vec_vector_7,doc2vec_vector_8,doc2vec_vector_9,...,doc2vec_vector_90,doc2vec_vector_91,doc2vec_vector_92,doc2vec_vector_93,doc2vec_vector_94,doc2vec_vector_95,doc2vec_vector_96,doc2vec_vector_97,doc2vec_vector_98,doc2vec_vector_99
0,0.007499,-0.003608,-0.001642,0.025057,-0.018753,0.007608,0.012677,0.018329,-0.031626,0.010353,...,0.001433,-0.002859,-0.000114,-0.006959,0.027266,-0.018546,0.014691,0.027791,-0.015647,-0.005814
1,-0.029234,0.013032,0.028907,0.009010,0.001612,-0.044108,0.013316,0.065313,-0.026325,-0.000356,...,0.040774,0.028754,0.028741,-0.014615,0.062676,0.018202,-0.003816,-0.038756,0.029557,0.002088
2,-0.020330,0.011608,-0.005730,0.000129,-0.006354,-0.003547,0.016893,-0.012370,-0.011603,-0.016983,...,-0.001925,0.008584,-0.014032,0.017906,-0.017390,0.009108,0.004429,-0.000281,0.005516,0.009672
3,0.003395,0.011350,-0.005278,0.000222,0.027390,-0.013552,0.003472,0.013267,0.007195,-0.005738,...,0.008400,0.016836,0.024767,-0.009926,0.025036,0.018513,0.009169,0.001015,0.010097,-0.017305
4,-0.020420,0.017002,0.011147,-0.009374,-0.012437,-0.022879,0.020928,0.027851,-0.013634,-0.008545,...,0.021988,0.005240,-0.014586,0.008590,0.009662,0.014557,0.002863,-0.018876,0.003080,0.011958
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24778,0.010484,0.024478,0.039817,0.013727,0.008760,-0.069086,-0.002619,0.136675,-0.063903,0.018723,...,0.065459,0.012917,0.029792,-0.026898,0.138854,0.021354,0.022084,-0.040775,0.012494,0.001791
24779,-0.025994,0.012341,0.025043,0.000238,-0.019190,-0.030451,0.008782,0.034775,-0.026322,-0.008368,...,0.024668,0.010854,0.000534,0.002048,0.031659,0.017184,-0.002900,-0.017701,-0.002657,0.012225
24780,-0.070315,0.021987,-0.001204,0.000615,0.020636,0.008880,0.059381,-0.015007,0.007781,-0.036368,...,0.052554,0.004320,-0.016332,0.004185,-0.025229,0.026032,-0.003911,0.002419,0.008153,-0.021806
24781,-0.001312,0.009120,0.024085,-0.008341,-0.006602,-0.022620,-0.004195,0.033900,-0.017632,-0.007274,...,0.024818,0.016793,0.005234,0.006163,0.030537,0.012518,0.011066,-0.028579,0.009684,0.012363


In [8]:
# Longest tweet in dataset['processed_tweet'], measured in whitespace-separated
# tokens. Falls back to -1 when there are no tweets at all.
maxlen = max(
    (len(str(text).split()) for text in dataset['processed_tweet']),
    default=-1,
)
maxlen

28

In [9]:
# Split every processed tweet into a list of whitespace-separated tokens.
tokenized_tweet = dataset['processed_tweet'].astype(str).str.split()

In [10]:
from keras.preprocessing.text import Tokenizer
# Build the word -> integer index from the pre-split token lists.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(tokenized_tweet)
# Encode every tweet as a sequence of integer word indices (unpadded at this point).
X = tokenizer.texts_to_sequences(tokenized_tweet)

In [11]:
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Left-pad every index sequence to the longest tweet length computed earlier.
# Using `maxlen` instead of a hard-coded 28 keeps the padded width in sync with
# the `input_length` the Embedding layer is built with if the data changes.
X = pad_sequences(X, padding='pre', maxlen=maxlen)
X.shape

(24781, 28)

In [12]:
# Words known to the trained Doc2Vec model (the keys of its word-vector index).
vocab = doc2vec_model.wv.key_to_index.keys()
# Number of distinct words in that vocabulary.
len(vocab)

15476

In [13]:
# Map every vocabulary word to its learned word vector.
word_vec_dict = {token: doc2vec_model.wv.get_vector(token) for token in vocab}

In [14]:
# One row per tokenizer index, plus one extra row so that index 0 has a slot.
# Row 0 is never written by the loop below and therefore stays all-zero
# (presumably the padding index produced by pad_sequences — confirm).
vocab_size = len(tokenizer.word_index) + 1

# Take the embedding width from the trained model instead of hard-coding 100,
# so the matrix always matches the word vectors copied into it.
embedding_dim = doc2vec_model.wv.vector_size
w_matrix = np.zeros((vocab_size, embedding_dim))

# Copy the pretrained vector of every tokenizer word that Doc2Vec knows about;
# words without a pretrained vector keep an all-zero row.
for word, i in tokenizer.word_index.items():
    embedd_vector = word_vec_dict.get(word)
    if embedd_vector is not None:
        w_matrix[i] = embedd_vector

w_matrix

array([[ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [-1.02485037e+00,  7.47610569e-01,  7.57614136e-01, ...,
        -6.61297619e-01,  2.10512787e-01,  1.46599621e-01],
       [-9.28829312e-01,  6.89395010e-01,  7.11957693e-01, ...,
        -5.32279611e-01, -6.92608431e-02,  3.35961163e-01],
       ...,
       [ 8.38220178e-04,  1.18158972e-02,  2.62711029e-02, ...,
        -2.92869899e-02,  2.08479371e-02,  2.62835016e-03],
       [-2.78962310e-03,  1.99453868e-02,  3.71178575e-02, ...,
        -3.75461020e-02,  1.76950395e-02,  7.41720106e-03],
       [ 3.57250031e-03, -1.12868845e-03,  1.33451689e-02, ...,
         5.45247179e-03,  2.04785727e-03, -1.34387822e-03]])

## LSTM model

In [15]:
from tensorflow.keras.layers import Flatten, Dropout, Dense, LSTM, Embedding, Activation, Bidirectional
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from keras.callbacks import EarlyStopping
from keras.initializers import Constant
from keras.layers.convolutional import MaxPooling1D, Conv1D

In [16]:
# Bidirectional LSTM on top of an Embedding layer initialised with the
# pretrained word vectors held in `w_matrix`.
# The embedding width is read from `w_matrix` rather than hard-coded, so the
# layer shape always agrees with the initialiser matrix.
# NOTE(review): the network ends in a single linear unit, i.e. it regresses the
# label as a number. If `class` is a categorical label, a softmax output with a
# cross-entropy loss would be the usual setup — confirm the intent before
# changing, since the compile cell must change together with this layer.
model = Sequential([
    Embedding(
        input_dim=vocab_size,
        output_dim=w_matrix.shape[1],
        input_length=maxlen,
        embeddings_initializer=Constant(w_matrix),
    ),
    Dropout(0.2),

    Bidirectional(LSTM(64)),
    Dropout(0.2),

    Dense(64, activation='relu'),
    Dropout(0.2),

    Dense(64, activation='relu'),
    Dropout(0.2),

    Dense(1, activation='linear'),
])

In [17]:
# Print the layer-by-layer summary (output shapes and parameter counts) of the model.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 28, 100)           1547700   
                                                                 
 dropout (Dropout)           (None, 28, 100)           0         
                                                                 
 bidirectional (Bidirectiona  (None, 128)              84480     
 l)                                                              
                                                                 
 dropout_1 (Dropout)         (None, 128)               0         
                                                                 
 dense (Dense)               (None, 64)                8256      
                                                                 
 dropout_2 (Dropout)         (None, 64)                0         
                                                        

In [18]:
# Target vector: the `class` column as a NumPy array, one entry per tweet.
y = dataset['class'].to_numpy()
y.shape

(24781,)

In [19]:
# `metrics` is passed as a list, the form documented for Model.compile; a bare
# string is not accepted by every Keras version.
# NOTE(review): mean squared error is a regression loss while accuracy is a
# classification metric; with the single linear output the reported accuracy is
# presumably not a meaningful per-class score — confirm the intended task.
model.compile(loss='mean_squared_error', optimizer='adam', metrics=['accuracy'])

In [20]:
from sklearn.model_selection import train_test_split

# Training hyper-parameters.
epochs = 50
batch_size = 32

# Hold out 10% of the samples; the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=0
)

# Fit on the training part and record the per-epoch history in `hist`.
# NOTE(review): the held-out split is also used as validation data here, so
# there is no separate, untouched test set for a final evaluation.
hist = model.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    shuffle=True,
    validation_data=(x_test, y_test),
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
