In [1]:
import pandas as pd
import numpy as np
import regex as re
import nltk
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords

import tensorflow as tf
from tensorflow.python.keras.preprocessing import sequence
from tensorflow.python.keras.preprocessing import text
from sklearn.model_selection import train_test_split

from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM,Bidirectional, GlobalMaxPool1D,Dropout,Flatten
from keras.models import load_model
from keras.callbacks import ModelCheckpoint

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
data = pd.read_csv('final_texts.csv',index_col=0)
data.reset_index(drop=True,inplace = True)

for i,j in enumerate(data['labels']):
  try:
    j = int(j)
  except ValueError as e:
    print(f'error on {i} line')
data.drop(labels=[2478],inplace= True)
data['labels'].astype(int);
data.dropna(inplace = True)

error on 2478 line


In [3]:
cleaned_text = []
for text in data['texts']:
  text = " ".join(word for word in text.split() if not word.isdigit())
  cleaned_text.append(text)
data['cleaned_text'] = cleaned_text

In [4]:
vocab =  {}
for text in data['cleaned_text']:
  sen = text.split()
  for word in sen:
    try:
      vocab[word] += 1
    except KeyError:
      vocab[word] = 1
vocab = dict(sorted(vocab.items(), key=lambda item: item[1]))

In [5]:
rare_words = []
for key,value in vocab.items():
  if value<=10:
    rare_words.append(key)

In [6]:
stopwords_en = set(stopwords.words('english'))
cleaner_text = []
for text in data['cleaned_text']:
  text = " ".join([word for word in text.split() if len(word)>2 and word not in stopwords_en and word not in rare_words])
  cleaner_text.append(text)
data['final_text'] = cleaner_text

In [7]:
vocab =  {}
for text in data['final_text']:
  sen = text.split()
  for word in sen:
    try:
      vocab[word] += 1
    except KeyError:
      vocab[word] = 1
vocab = dict(sorted(vocab.items(), key=lambda item: item[1]))

In [8]:
vocab_list = list(vocab.items())
vocab_size = len(vocab)

In [9]:
x = data['final_text'].values
y = data['labels'].values
X_train,X_test,y_train, y_test = train_test_split(x,y, test_size = 0.2, shuffle = True)
y_train = y_train.astype(int)
y_test = y_test.astype(int)

In [10]:
embeddings_index = dict()
f = open('glove.twitter.27B.200d.txt')
for line in f:
	values = line.split()
	word = values[0]
	coefs = np.asarray(values[1:], dtype='float32')
	embeddings_index[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 1216 word vectors.


In [11]:
from tensorflow.python.keras.preprocessing import sequence
from tensorflow.python.keras.preprocessing import text
tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(X_train)
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)
X_train = sequence.pad_sequences(X_train,maxlen=200)
X_test = sequence.pad_sequences(X_test,maxlen=200)

In [12]:
tokens = len(tokenizer.word_index) + 2
embedding_matrix = np.zeros((tokens, 200))
count = 0
unknown = []
for word, i in tokenizer.word_index.items():
  embedding_vector = embeddings_index.get(word)
  if embedding_vector is not None:
    try:
      embedding_matrix[i] = embedding_vector
    except ValueError:
      unknown.append(word)
      count += 1
  else:
    unknown.append(word)
    count += 1

In [13]:
print(1-(count/vocab_size))

0.2648725212464589


In [16]:
model = Sequential()
model.add(Embedding(tokens,200,weights = [embedding_matrix],input_length = embedding_matrix.shape[1]))
model.add(LSTM(64))
model.add(Flatten())
model.add(Dense(1,activation='sigmoid'))
model.compile(optimizer='adam',loss = 'binary_crossentropy',metrics=['accuracy'])
print(model.summary())

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 200, 200)          282800    
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                67840     
_________________________________________________________________
flatten_1 (Flatten)          (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 65        
Total params: 350,705
Trainable params: 350,705
Non-trainable params: 0
_________________________________________________________________
None


In [18]:
model.fit(X_train,y_train,epochs=30)
loss,accuracy = model.evaluate(X_train,y_train)
print(f'acc: {accuracy}')

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
acc: 0.957430362701416


In [19]:
predictions = model.predict(X_test)
predictions = np.round(predictions)

In [20]:
from sklearn.metrics import accuracy_score, confusion_matrix
score = accuracy_score(y_test,predictions)
cm = confusion_matrix(y_test,predictions)
print("score: {} cm: {}".format(score,cm))

score: 0.8077369439071567 cm: [[1100  284]
 [ 213  988]]
