In [48]:
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords


basic_stopwords = stopwords.words('english')

df = pd.read_csv("dataset2.csv",encoding='latin')
df = df.drop(df.columns[[1,2,3,4]],axis=1)
df = df.rename(columns={df.columns[0]: 'target', df.columns[1]: 'tweet'})


In [52]:
import string
punctuation = string.punctuation
def preprocess(tweet):
    tweet = tweet.lower()
    tweet = re.sub('((www.[^s]+)|(https?://[^s]+))',' ',tweet)
    tweet = nltk.word_tokenize(tweet)
    remove_punctuation = str.maketrans('', '', punctuation)
    tweet = [word.translate(remove_punctuation) for word in tweet if word not in basic_stopwords and word.isalnum() and not word.isdigit()]
    return tweet
df.tweet =df.tweet.apply(lambda x:preprocess(x)) 


['jonasbrothers', 'haii', 'guys', 'love', 'u', 'much', 'cum', 'ur', 'cumin', 'australia', 'ur', 'world', 'tour', 'cos', 'cnt', 'cum', 'see', 'u', 'guys']


In [61]:
from gensim.models import Word2Vec

model = Word2Vec(df.tweet,window=4,workers=4,min_count=1)
all_normed_vectors = model.wv.get_normed_vectors()
model.save('model.bin')

In [62]:
filename = 'model.txt'
model.wv.save_word2vec_format(filename,binary=False)

In [63]:
import numpy as np
word_vectors = {}

file = open('model.txt',encoding='utf-8')

for line in file:
    vector_values = line.split()
    vector = np.asarray(vector_values[1:])
    word_vectors[vector_values[0]] = vector
    
    

In [64]:
from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n')

tokenizer.fit_on_texts(df.tweet)

In [65]:
sequences = tokenizer.texts_to_sequences(df.tweet)

In [66]:
vocabulary = tokenizer.word_index

In [77]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
padded = pad_sequences(sequences)

In [90]:
num_words = len(vocabulary) + 1

embedded_matrix = np.zeros((num_words,100))
for word,i in vocabulary.items():
    embedded_vector = word_vectors.get(word)
    embedded_matrix[i] = embedded_vector



In [101]:

from keras.models import Sequential
from keras.models import load_model
from keras.layers import Embedding, LSTM,GRU
from keras.layers.core import Dense, Dropout
from keras.callbacks import ReduceLROnPlateau
from keras import backend as K
from keras.initializers import Constant


embedding_dim = 100
model = Sequential()
embedding_layer = Embedding(num_words, 
                            embedding_dim, 
                            embeddings_initializer=Constant(embedded_matrix), 
                            input_length = 50, 
                            trainable = False)
model.add(embedding_layer)
model.add(GRU(units=32,dropout=0.2,recurrent_dropout=0.2))
model.add(Dense(1,activation='sigmoid'))

model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

In [113]:
from tensorflow.keras.layers import Bidirectional, GlobalMaxPool1D, Dense, LSTM, Conv1D, Embedding
embedding_layer = Embedding(num_words, 
                            embedding_dim, 
                            embeddings_initializer=Constant(embedded_matrix), 
                            input_length = 50, 
                            trainable = False)
model = Sequential([
        embedding_layer,
        Bidirectional(LSTM(100, dropout=0.3, return_sequences=True)),
        Bidirectional(LSTM(100, dropout=0.3, return_sequences=True)),
        Conv1D(100, 5, activation='relu'),
        GlobalMaxPool1D(),
        Dense(16, activation='relu'),
        Dense(1, activation='sigmoid'),
    ])


In [114]:
print(model.summary())

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (None, 50, 100)           57304300  
                                                                 
 bidirectional_4 (Bidirectio  (None, 50, 200)          160800    
 nal)                                                            
                                                                 
 bidirectional_5 (Bidirectio  (None, 50, 200)          240800    
 nal)                                                            
                                                                 
 conv1d_2 (Conv1D)           (None, 46, 100)           100100    
                                                                 
 global_max_pooling1d (Globa  (None, 100)              0         
 lMaxPooling1D)                                                  
                                                      

In [115]:

x_train = padded[:1200000]
x_test = padded[1200001:]
y_train = df.loc[1:1200000,'target'].values
y_test = df.loc[1200001:,'target'].values



In [119]:
from tensorflow.keras.callbacks import ReduceLROnPlateau
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping

callbacks = [ReduceLROnPlateau(monitor='val_loss', patience=5, cooldown=0),
             EarlyStopping(monitor='val_accuracy', min_delta=1e-4, patience=5)]
history = model.fit(
    x_train, y_train,
    batch_size=100,
    epochs=12,
    validation_split=0.1,
    callbacks=callbacks,
    verbose=1,
)

Epoch 1/12
  124/10800 [..............................] - ETA: 31:00 - loss: -207.6070 - accuracy: 0.0974

KeyboardInterrupt: 

In [110]:
history = model.fit(x_train, y_train, batch_size = 128, epochs = 10,
                    validation_data =(x_test,y_test), callbacks=[ReduceLROnPlateau])
model.save("my_model")

Epoch 1/10
 525/9375 [>.............................] - ETA: 6:48 - loss: -281.2361 - accuracy: 0.0037

KeyboardInterrupt: 

In [107]:
print(len(padded[0]))

50
