In [1]:
import pandas as pd
import numpy as np
import re
import nltk
nltk.download('stopwords')
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Hu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
dataset = pd.read_csv('preprocessed_data.csv')
print(dataset.shape)

(24783, 8)


In [3]:
y=dataset['class'].values
y
## 0:hate speech; 1:offensive language; 2:neither

array([2, 1, 1, ..., 1, 1, 2], dtype=int64)

## Preprocess

In [7]:
# check for missing values
dataset.isnull().sum()

Unnamed: 0            0
count                 0
hate_speech           0
offensive_language    0
neither               0
class                 0
tweet                 0
processed_tweet       2
dtype: int64

In [8]:
dataset.dropna(axis=0,how='any',inplace=True)

In [9]:
dataset.shape

(24781, 8)

In [None]:
#import nltk.data
#nltk.download('punkt')
#tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [None]:
# split tweets into lists of sentences where each sentence is a list of words
#def tweet_tokenize(review, tokenizer, remove_sw=False):
#    tokenized_tweets = tokenizer.tokenize(review)
#    new_tweet = []
#    for tokenized_tweet in tokenized_tweets:
#        if len(tokenized_tweet) > 0:
#            new_tweet.append(data_preprocess(tokenized_tweet, remove_sw))
#    return new_tweet

In [None]:
#new_tweets = []
#for review in dataset['tweet']:
#    new_tweets += tweet_tokenize(review,tokenizer)

In [None]:
#new_tweets

In [10]:
# word2vec model
# simple tokenize
texts = dataset['processed_tweet']
tokenized_tweet = texts.apply(lambda x:str(x).split())

from gensim.models import word2vec
size = 200
min_count = 2
workers = 4
window = 10
sg = 1

w2v_model = word2vec.Word2Vec(tokenized_tweet,
                              vector_size = size,
                              min_count = min_count,
                              window = window,
                              sg = 1,
                              workers = workers)
model_name = "simplew2v"
w2v_model.save(model_name)
                    

In [11]:
# most similar words present in the model
w2v_model.wv.most_similar('woman')

[('dress', 0.9455119967460632),
 ('way', 0.9453248977661133),
 ('nice', 0.9434027671813965),
 ('truth', 0.9430713057518005),
 ('repli', 0.941224217414856),
 ('drive', 0.9405822157859802),
 ('chick', 0.9402672052383423),
 ('fun', 0.9395648837089539),
 ('asshol', 0.9383838772773743),
 ('enough', 0.9382278919219971)]

In [12]:
# check the vector representation for any word from our corpus
w2v_model.wv.get_vector('shit')

array([ 0.18818495, -0.16028605,  0.0977423 , -0.01033437,  0.03547648,
       -0.19841255,  0.16995348,  0.23270577, -0.12913476, -0.06824369,
        0.02428589, -0.19027972, -0.09466037,  0.15572235, -0.19010308,
        0.06238107,  0.01550262,  0.11662454, -0.09587722, -0.33045298,
        0.32646105, -0.07574545,  0.20194137,  0.06717928, -0.05720104,
        0.01044636, -0.07381954, -0.04111833, -0.10158026,  0.03280369,
        0.17351843,  0.05731299,  0.16931362,  0.13573097,  0.10728898,
       -0.18229297,  0.24536929, -0.21144088, -0.18908644, -0.19703043,
       -0.08628925,  0.00425185, -0.12827305,  0.1360265 ,  0.12387665,
        0.07816706, -0.0078771 , -0.00748067,  0.01100683, -0.0096606 ,
       -0.18109472, -0.23633201, -0.14294688, -0.03657249,  0.16448505,
       -0.04117519, -0.1045759 , -0.12082563, -0.29745558,  0.05899406,
       -0.00955477,  0.02441569,  0.08238562,  0.11793819, -0.39334965,
        0.19249637,  0.05157324,  0.42357603, -0.29402754,  0.35

In [13]:
# check the similarity of two words
w2v_model.wv.similarity('man','dude')

0.85078007

In [14]:
# retrieve the weights from the model
w2v_weights = w2v_model.wv.vectors
w2v_weights

array([[ 0.19240057,  0.11275952,  0.11498567, ..., -0.15782012,
        -0.09665237,  0.27106634],
       [ 0.22625284,  0.08221275, -0.0785579 , ..., -0.1078688 ,
         0.0409813 ,  0.1591981 ],
       [ 0.05883907, -0.12780347, -0.09040714, ..., -0.06191109,
         0.07760243, -0.09103731],
       ...,
       [ 0.03057816, -0.00221626,  0.03972917, ..., -0.01921675,
         0.00532114, -0.0220346 ],
       [ 0.03632252,  0.00739879,  0.04732922, ..., -0.02646327,
        -0.0070594 , -0.02994229],
       [ 0.02892422,  0.01274518,  0.04523262, ..., -0.02520522,
         0.00271638, -0.03352645]], dtype=float32)

In [15]:
#vocab_size, embedding_size = w2v_weights.shape
#print("Vocabulary Size: {} - Embedding Dim: {}".format(vocab_size, embedding_size))

In [16]:
vocab = w2v_model.wv.key_to_index.keys()
len(vocab)

7470

In [17]:
word_vec_dict = {}
for word in vocab:
    word_vec_dict[word] = w2v_model.wv.get_vector(word)
print("The no of key-value pairs : ",len(word_vec_dict))

The no of key-value pairs :  7470


In [18]:
# find max length of tweets dataset['processed_tweet']
maxlen = -1
for i, rev in enumerate(dataset['processed_tweet']):
    tweet = str(rev).split()
    if (len(tweet)>maxlen):
        maxlen = len(tweet)
maxlen

28

In [19]:
from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(tokenized_tweet)
X = tokenizer.texts_to_sequences(tokenized_tweet)

In [20]:
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
X = pad_sequences(X, padding='pre',maxlen=28)
X.shape # 24781 tweets, and have padded each tweet to be of max length

(24781, 28)

In [21]:
vocab_size = len(tokenizer.word_index) + 1
w_matrix = np.zeros((vocab_size, size))

for word, i in tokenizer.word_index.items():
    embedd_vector = word_vec_dict.get(word)
    if embedd_vector is not None:
        w_matrix[i] = embedd_vector

w_matrix

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.19240057,  0.11275952,  0.11498567, ..., -0.15782012,
        -0.09665237,  0.27106634],
       [ 0.22625284,  0.08221275, -0.0785579 , ..., -0.1078688 ,
         0.0409813 ,  0.15919811],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

## simple LSTM model

In [22]:
from tensorflow.keras.layers import Flatten, Dropout, Dense, LSTM, Embedding
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from keras.callbacks import EarlyStopping
from keras.initializers import Constant

In [23]:
model = Sequential()
model.add(Embedding(input_dim = vocab_size, output_dim = size, input_length = maxlen, embeddings_initializer=Constant(w_matrix))) 
model.add(Dropout(0.2))

model.add(LSTM(64))
model.add(Dropout(0.2))

model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))

model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))

model.add(Dense(1, activation = 'linear'))

In [24]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 28, 200)           3095400   
                                                                 
 dropout (Dropout)           (None, 28, 200)           0         
                                                                 
 lstm (LSTM)                 (None, 64)                67840     
                                                                 
 dropout_1 (Dropout)         (None, 64)                0         
                                                                 
 dense (Dense)               (None, 64)                4160      
                                                                 
 dropout_2 (Dropout)         (None, 64)                0         
                                                                 
 dense_1 (Dense)             (None, 64)                4

In [25]:
model.compile(loss = 'mean_squared_error', optimizer = 'adam', metrics = 'accuracy')

In [27]:
y=dataset['class'].values
y.shape

(24781,)

In [28]:
from sklearn.model_selection import train_test_split
epochs = 50
batch_size = 32
x_train, x_test, y_train, y_test = train_test_split(X,y,test_size=0.1,random_state=0)
hist = model.fit(x_train, y_train, validation_data = (x_test, y_test), epochs = epochs,
                 batch_size = batch_size, shuffle=True)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
