In [1]:
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords


basic_stopwords = stopwords.words('english')

df = pd.read_csv("dataset2.csv",encoding='latin')
df = df.drop(df.columns[[1,2,3,4]],axis=1)
df = df.rename(columns={df.columns[0]: 'target', df.columns[1]: 'tweet'})


In [2]:

def preprocess(tweet):
    tweet = tweet.lower() 
    tweet = re.sub('@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+',' ', tweet)
    tweet = nltk.word_tokenize(tweet)
    tweet = [word for word in tweet if word not in basic_stopwords and word.isalnum()]
    return tweet

df.tweet =df.tweet.apply(lambda x:preprocess(x)) 



In [3]:
from gensim.models import Word2Vec

model = Word2Vec(df.tweet,window=4,workers=4,min_count=1)
all_normed_vectors = model.wv.get_normed_vectors()
model.save('model.bin')

In [19]:
filename = 'model.txt'
model.wv.save_word2vec_format(filename,binary=False)

In [43]:
import numpy as np
word_vectors = {}

file = open('model.txt',encoding='utf-8')

for line in file:
    vector_values = line.split()
    vector = np.asarray(vector_values[1:])
    word_vectors[vector_values[0]] = vector
    
    

In [49]:
from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n')

tokenizer.fit_on_texts(df.tweet)

In [51]:
sequences = tokenizer.texts_to_sequences(df.tweet)



In [57]:
vocabulary = tokenizer.word_index

In [62]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
padded = pad_sequences(sequences)

In [76]:
num_words = len(vocabulary) + 1

embedded_matrix = np.zeros((num_words,100))
for word,i in vocabulary.items():
    embedded_vector = word_vectors.get(word)
    embedded_matrix[i] = embedded_vector
    

In [108]:
from keras.models import Sequential
from keras.models import load_model
from keras.layers import Embedding, LSTM,GRU
from keras.layers.core import Dense, Dropout
from keras.callbacks import ReduceLROnPlateau
from keras.initializers import Constant
embedding_dim = 100
model = Sequential()
embedding_layer = Embedding(num_words, 
                            embedding_dim, 
                            embeddings_initializer=Constant(embedded_matrix), 
                            input_length = 50, 
                            trainable = False)
model.add(embedding_layer)
model.add(GRU(units=32,dropout=0.2,recurrent_dropout=0.2))
model.add(Dense(1,activation='sigmoid'))

model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

In [109]:
print(model.summary())

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 50, 100)           33550300  
                                                                 
 gru_3 (GRU)                 (None, 32)                12864     
                                                                 
 dense_3 (Dense)             (None, 1)                 33        
                                                                 
Total params: 33,563,197
Trainable params: 12,897
Non-trainable params: 33,550,300
_________________________________________________________________
None


In [110]:
x_train = padded[:1200000]
x_test = padded[1200001:]
y_train = df.loc[1:1200000,'target'].values
y_test = df.loc[1200001:,'target'].values

In [113]:
history = model.fit(x_train, y_train, batch_size = 128, epochs = 10,
                    validation_data =(x_test,y_test), verbose = 1)
model.save("my_model")

Epoch 1/10
Epoch 2/10
1010/9375 [==>...........................] - ETA: 3:13 - loss: -263.5376 - accuracy: 0.2104

KeyboardInterrupt: 

In [107]:
print(len(padded[0]))

50
