In [1]:
import json

import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
import glob
import os
list1 = []  # tweet texts, one entry per tweet-object file
list2 = []  # binary labels: 0 = nonrumour, 1 = anything else (rumour)

# Tweet objects were obtained by scraping; each file is JSON with a 'text' field.
# The pattern is built with os.path.join so it matches on every OS (the literal
# 'tweet-objects\*' only worked with Windows separators), and the matches are
# sorted so the load order is identical on every run.
# NOTE(review): labels are paired with tweets purely by position -- confirm that
# sorted filename order corresponds to the line order of train.label.txt.
for filename in sorted(glob.glob(os.path.join('tweet-objects', '*'))):
    with open(filename, 'r', encoding='utf-8') as f:
        data = json.load(f)
    list1.append(data['text'])

with open('./project-data/train.label.txt') as f:
    reviews = f.readlines()

for x in reviews:
    if not x.strip():
        # A blank (e.g. trailing) line carries no label; skipping it avoids
        # counting it as a rumour.
        continue
    # Any line containing 'nonrumour' is the negative class; every other line
    # is treated as a rumour.
    list2.append(0 if 'nonrumour' in x else 1)

# Fail loudly instead of training on silently misaligned tweet/label pairs.
if len(list1) != len(list2):
    raise ValueError(
        'Loaded %d tweets but %d labels; they must match one-to-one.'
        % (len(list1), len(list2))
    )


In [3]:
sentences = list1
labels = list2

# Positional 80/20 split: the first 80% of the examples train the model and
# the remaining 20% are held out for validation.
training_size = int(len(sentences) * 0.8)

training_sentences = sentences[:training_size]
# The held-out slice must start where the training slice ends; slicing with
# [:training_size] here would just duplicate the training data.
testing_sentences = sentences[training_size:]

training_labels = labels[:training_size]
testing_labels = labels[training_size:]

# Column vectors of float32, one row per example.
training_labels_final = np.array(training_labels).astype('float32').reshape((-1, 1))
testing_labels_final = np.array(testing_labels).astype('float32').reshape((-1, 1))

In [4]:
# Hyper-parameters shared by the tokenisation, padding and model cells.

# Embedding layer dimensions
vocab_size = 8159        # input dimension handed to the Embedding layer
embedding_dim = 128      # embedding width (also reused as the LSTM unit count)

# Sequence shaping
max_length = 280         # every padded sequence has exactly this many tokens
padding_type = 'post'    # pad at the end of short sequences
trunc_type = 'post'      # cut the end of long sequences ('pre' cuts the start)

# Placeholder string for out-of-vocabulary words
oov_tok = "<OOV>"

In [5]:
#tokenisation and padding
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.text import text_to_word_sequence

stop_words = set(stopwords.words('english'))


def remove_stop_words(text):
    """Return `text` with English stop words removed.

    The text is split with Keras' own word splitter (lower-cased, default
    punctuation filters) so the words compared against `stop_words` are the
    same tokens the Tokenizer below will see. The surviving words are joined
    back into a single space-separated string.
    """
    words = text_to_word_sequence(text)
    return ' '.join(word for word in words if word not in stop_words)


# Filter word by word and keep exactly one entry per tweet, so rows stay
# aligned with their labels. (Testing a whole tweet against the stop-word set
# removes no words and can drop an entire tweet.)
filtered_sentence = [remove_stop_words(tweet) for tweet in training_sentences]
filtered_testing = [remove_stop_words(tweet) for tweet in testing_sentences]

# num_words=vocab_size keeps every emitted token id below the Embedding
# layer's input dimension. The `filters` argument is left at the Keras
# default punctuation set.
# No oov_token is configured, so words that are not in the fitted vocabulary
# (stop words included) are dropped when text is converted to sequences.
tokenizer = Tokenizer(num_words=vocab_size, lower=True, split=' ')
tokenizer.fit_on_texts(filtered_sentence)

training_sequences = tokenizer.texts_to_sequences(filtered_sentence) #vectorizing words
training_padded = pad_sequences(training_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

testing_sequences = tokenizer.texts_to_sequences(filtered_testing)
testing_padded = pad_sequences(testing_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

In [6]:
#building the model
# Embedding -> bidirectional LSTM -> single sigmoid unit (binary classifier).
model = tf.keras.models.Sequential()

model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length))
# return_sequences=False makes the LSTM emit one vector per tweet, so the
# Dense layer produces a single probability per example, matching the
# (n, 1) label arrays. With return_sequences=True the model would output one
# value per timestep instead.
model.add(tf.keras.layers.Bidirectional(
    tf.keras.layers.LSTM(embedding_dim,
                         return_sequences=False)
))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

In [7]:
# Configure training for a two-class problem: binary cross-entropy loss,
# Adam optimiser constructed with 0.01, and accuracy reported each epoch.
model.compile(
    optimizer=tf.keras.optimizers.Adam(0.01),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [8]:
num_epochs = 7

# Early stopping watches the validation loss with a patience of 2 epochs.
# restore_best_weights is off, so the model keeps whatever weights it has
# when training ends.
callbacks = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=2,
    verbose=0,
    mode='auto',
    baseline=None,
    restore_best_weights=False,
)

# The held-out arrays are passed as validation data; `modelo` receives the
# object returned by fit().
modelo = model.fit(
    training_padded,
    training_labels_final,
    epochs=num_epochs,
    validation_data=(testing_padded, testing_labels_final),
    callbacks=[callbacks],
)

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


In [9]:
# Load the COVID tweets into their own list. Appending to list1 would mix
# them with the training tweets already stored there, so the model would
# also re-predict its own training data.
# The pattern is built with os.path.join for portability and the matches are
# sorted so the prediction order is the same on every run.
covid_texts = []
for filename in sorted(glob.glob(os.path.join('covid-objs', '*'))):
    with open(filename, 'r', encoding='utf-8') as f:
        data = json.load(f)
    covid_texts.append(data['text'])

# Vectorise and pad with the same settings used for the training data
# (including the truncation side, which otherwise falls back to the
# pad_sequences default).
sample_sequences = tokenizer.texts_to_sequences(covid_texts)
fakes_padded = pad_sequences(sample_sequences, maxlen=max_length,
                             padding=padding_type, truncating=trunc_type)

# Round the predicted probabilities to hard 0/1 decisions.
classes = model.predict(fakes_padded)
classes = classes.round()

# Flatten the prediction array into a plain list of floats.
final = classes.ravel().tolist()

In [None]:
import csv

# Single-cell rows written for each predicted class.
rumour = ['rumour']
nonrumour = ['nonrumour']

# Write one row per prediction: 'rumour' when the rounded prediction equals 1,
# otherwise 'nonrumour'.
with open('./new4.csv', 'w', newline='', encoding='UTF8') as f:
    writer = csv.writer(f)
    # The loop variable gets its own name so it does not rebind `f`, the open
    # file handle, inside the with-block.
    for prediction in final:
        writer.writerow(rumour if prediction == 1 else nonrumour)