# Covid19 Tweet Truth Analysis

late fusion version of 4 features:
 1. tf-idf
 2. word2vec
 3. glove
 4. fine-tuning Bert

In [1]:
# setup CUDA
import torch

# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla P100-PCIE-16GB


# Data preprocess

This dataset contains the training, validation, and test csv's, along with excel documents for the train and test files, a csv with the test file actual values, and ERNIE test results. For this analysis, I will be ignoring the excel files (as they are the same as the csv's) and the ERNIE results. I will be acting as if the test answer file did not exist for the duration of the testing phase as well, thus sticking with a basic approach of train, validate, see what the model decides for the tests.

In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import nltk #Natural Language Toolkit for Processing
from nltk.corpus import stopwords #Get the Stopwords to Remove

import re #Regular Expressions
import html #Messing with HTML content, like &amp;
import string #String Processing

import tensorflow as tf #Import tensorflow in order to use Keras
from tensorflow.keras.preprocessing.text import Tokenizer #Add the keras tokenizer for tweet tokenization
from tensorflow.keras.preprocessing.sequence import pad_sequences #Add padding to help the Keras Sequencing
import tensorflow.keras.layers as L #Import the layers as L for quicker typing
from tensorflow.keras.optimizers import Adam #Pull the adam optimizer for usage

from tensorflow.keras.losses import SparseCategoricalCrossentropy #Loss function being used
from sklearn.model_selection import train_test_split #Train Test Split

In [3]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:
import os
from google.colab import drive

drive.mount('/content/gdrive')
path = "/content/gdrive/My Drive/PLP_sharing/project/fake_news"
os.chdir(path)
os.listdir('./')

Mounted at /content/gdrive


['covid19-fake-news-dataset-nlp.zip',
 'covid19-fake-news-dataset-nlp-unzip',
 'BiLSTM.ipynb',
 'bert',
 'Transformer-Explainability',
 'lqq_transformer1',
 'lqq_transformer2',
 'w2c-glove',
 'Fake Detection.gdoc',
 'late_fusion',
 'tfidf',
 'early_fusion',
 'word2vec_tokenizer.pickle',
 'glove_tokenizer.pickle']

In [5]:
twTrain = pd.read_csv("./covid19-fake-news-dataset-nlp-unzip/Constraint_Train.csv") #Load the tweet (tw) training set
twTrain.head() #Take a peek at the data

Unnamed: 0,id,tweet,label
0,1,The CDC currently reports 99031 deaths. In gen...,real
1,2,States reported 1121 deaths a small rise from ...,real
2,3,Politically Correct Woman (Almost) Uses Pandem...,fake
3,4,#IndiaFightsCorona: We have 1524 #COVID testin...,real
4,5,Populous states can generate large case counts...,real


In [6]:
twValid = pd.read_csv("./covid19-fake-news-dataset-nlp-unzip/Constraint_Val.csv") #Load the tweet (tw) validation set
twValid.head() #Take a peek at the data

Unnamed: 0,id,tweet,label
0,1,Chinese converting to Islam after realising th...,fake
1,2,11 out of 13 people (from the Diamond Princess...,fake
2,3,"COVID-19 Is Caused By A Bacterium, Not Virus A...",fake
3,4,Mike Pence in RNC speech praises Donald Trump’...,fake
4,5,6/10 Sky's @EdConwaySky explains the latest #C...,real


---

## Clean Tweets

In [7]:
punctuations = string.punctuation #List of punctuations to remove
print(punctuations) #See the punctuations the string library has

STOP = stopwords.words("english") #Get the NLTK stopwords
print(STOP) #See what NLTK considers stopwords

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only

In [8]:
def cleanTweets(tweetParse):
    for i in range(0,len(tweetParse)):
        tweet = tweetParse[i] #Putting the tweet into a variable so that it is not calling tweetParse[i] over and over
        tweet = html.unescape(tweet) #Removes leftover HTML elements, such as &amp;
        tweet = re.sub(r"@\w+", " ", tweet) #Completely removes @'s, as other peoples' usernames mean nothing
        tweet = re.sub(r"http\S+", " ", tweet) #Removes links, as links provide no data in tweet analysis in themselves
        
        tweet = "".join([punc for punc in tweet if not punc in punctuations]) #Removes the punctuation defined above
        tweet = tweet.lower() #Turning the tweets lowercase real quick for later use
    
        tweetWord = tweet.split() #Splits the tweet into individual words
        tweetParse[i] = "".join([word + " " for word in tweetWord if not word in STOP]) #Checks if the words are stop words
        
    return tweetParse #Returns the parsed tweets

In [9]:
twTrain["cleanTweet"] = cleanTweets(twTrain["tweet"].copy()) #Clean the training tweets
twValid["cleanTweet"] = cleanTweets(twValid["tweet"].copy()) #Clean the validation tweets

twTrain.head() #Take a look at the dataset

Unnamed: 0,id,tweet,label,cleanTweet
0,1,The CDC currently reports 99031 deaths. In gen...,real,cdc currently reports 99031 deaths general dis...
1,2,States reported 1121 deaths a small rise from ...,real,states reported 1121 deaths small rise last tu...
2,3,Politically Correct Woman (Almost) Uses Pandem...,fake,politically correct woman almost uses pandemic...
3,4,#IndiaFightsCorona: We have 1524 #COVID testin...,real,indiafightscorona 1524 covid testing laborator...
4,5,Populous states can generate large case counts...,real,populous states generate large case counts loo...


## Label Encoding

Interestingly, the get_dummies function in pandas will create encoded labels, since this is a binary classification problem. The real column created by it would have 1 for real and 0 for not real, which necessarily means fake in this case. That is the same as label encoding in this case.

In [10]:
dummyTrain = pd.get_dummies(twTrain["label"]) #Get the dummies for the training set
print(dummyTrain) #Show the dummies

      fake  real
0        0     1
1        0     1
2        1     0
3        0     1
4        0     1
...    ...   ...
6415     1     0
6416     1     0
6417     1     0
6418     1     0
6419     0     1

[6420 rows x 2 columns]


That real column shows the encoded values for real vs fake. I will be taking the real column as the encoded values.

In [11]:
twTrain["encodedLabel"] = dummyTrain["real"].astype('int') #Get the encoded labels from the "real" dummies
twValid["encodedLabel"] = pd.get_dummies(twValid["label"])["real"].astype('int') #Get the encoded labels for the validation set

twTrain.head() #Take a peek at the data

Unnamed: 0,id,tweet,label,cleanTweet,encodedLabel
0,1,The CDC currently reports 99031 deaths. In gen...,real,cdc currently reports 99031 deaths general dis...,1
1,2,States reported 1121 deaths a small rise from ...,real,states reported 1121 deaths small rise last tu...,1
2,3,Politically Correct Woman (Almost) Uses Pandem...,fake,politically correct woman almost uses pandemic...,0
3,4,#IndiaFightsCorona: We have 1524 #COVID testin...,real,indiafightscorona 1524 covid testing laborator...,1
4,5,Populous states can generate large case counts...,real,populous states generate large case counts loo...,1


In [12]:
train_X = twTrain["cleanTweet"]   # '0' refers to the review text
train_y = twTrain["encodedLabel"]   # '1' corresponds to Label (1 - positive and 0 - negative)
test_X = twValid['cleanTweet']
test_y = twValid["encodedLabel"]

# Model1: TF-IDF

In [13]:
import pickle

with open('./tfidf/tfidf.pickle', 'rb') as f:
    saved_tf_idf = pickle.load(f)

with open('./tfidf/tfidf_lstm.pkl', 'rb') as f:
    saved_lstm = pickle.load(f)


test_text_fake = 'Alfalfa is the only cure for COVID-19.'
test_text_real = '#IndiaFightsCorona India has one of the lowest #COVID19 mortality globally with less than 2% Case Fatality Rate. As a result of supervised home isolation &amp; effective clinical treatment many States/UTs have CFR lower than the national average. https://t.co/QLiK8YPP7E'

cleaned = cleanTweets([test_text_fake, test_text_real])
test_tfidf = saved_tf_idf.transform(cleaned).toarray()
x_test = test_tfidf.reshape(-1,1,200)

print(x_test.shape, x_test.dtype)

#0= Fake news
#1= Real news
preds = (saved_lstm.predict(x_test).ravel()>0.5)+0
for res in preds:
  if res==1:
    print("Real Covid News")
  elif res==0:
    print("Fake Covid News")

print(preds)

(2, 1, 200) float64
Fake Covid News
Real Covid News
[0 1]


In [None]:
from sklearn import metrics

cleaned = cleanTweets(test_X.values.tolist())
test_tfidf = saved_tf_idf.transform(cleaned).toarray()
x_test = test_tfidf.reshape(-1,1,200)

y_pred = (saved_lstm.predict(x_test).ravel()>0.5)+0 # predict and get class (0 if pred < 0.5 else 1)
print(metrics.classification_report(test_y, y_pred, target_names=['Fake', 'Real']))

              precision    recall  f1-score   support

        Fake       0.85      0.87      0.86      1020
        Real       0.88      0.86      0.87      1120

    accuracy                           0.87      2140
   macro avg       0.87      0.87      0.87      2140
weighted avg       0.87      0.87      0.87      2140



# Model2: Word2Vec

In [14]:
!pip install gensim



In [15]:
X_train_2 = twTrain["cleanTweet"]   # '0' refers to the review text
y_train_2 = twTrain["encodedLabel"]   # '1' corresponds to Label (1 - positive and 0 - negative)
X_test_2 = twValid['cleanTweet']
y_test_2 = twValid["encodedLabel"]

In [16]:
from gensim.models import Word2Vec

Embedding_dimensions = 200

#Creating Word2Vec training dataset.
Word2vec_train_data = list(map(lambda x: x.split(), X_train_2))

word2vec_model = Word2Vec(Word2vec_train_data,
                 size=Embedding_dimensions,
                 workers=8,
                 min_count=5)

print("Vocabulary Length:", len(word2vec_model.wv.vocab))

Vocabulary Length: 3181


In [17]:
corpus = []
for i in range(0, len(X_train_2)):
  corpus.append(X_train_2[i])

In [18]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from gensim.models import KeyedVectors

input_length = 200

tokenizer = Tokenizer(filters="", lower=False, oov_token="<oov>")
tokenizer.fit_on_texts(corpus)

vocab_length = len(tokenizer.word_index) + 1
print("Tokenizer vocab length:", vocab_length)

X_train_2 = pad_sequences(tokenizer.texts_to_sequences(X_train_2), maxlen=input_length, dtype='float32')
X_test_2  = pad_sequences(tokenizer.texts_to_sequences(X_test_2) , maxlen=input_length, dtype='float32')

print("X_train.shape:", X_train_2.shape)
print("X_test.shape :", X_test_2.shape)

embedding_matrix = np.zeros((vocab_length, Embedding_dimensions))
for word, token in tokenizer.word_index.items():
  if word2vec_model.wv.__contains__(word):
      embedding_matrix[token] = word2vec_model.wv.__getitem__(word)
print("Embedding Matrix Shape:", embedding_matrix.shape)   

Tokenizer vocab length: 16538
X_train.shape: (6420, 200)
X_test.shape : (2140, 200)
Embedding Matrix Shape: (16538, 200)


In [19]:
# saving
with open('word2vec_tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [28]:
from keras.layers import Embedding
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, GlobalMaxPool1D

embedding_layer = Embedding(input_dim = vocab_length, 
                                output_dim = Embedding_dimensions,
                                weights=[embedding_matrix], 
                                input_length=input_length,
                                trainable=False)
base_model_2 = Sequential()
base_model_2.add(embedding_layer)
base_model_2.add(LSTM(128,return_sequences=True))
base_model_2.add(LSTM(64,return_sequences=True))
base_model_2.add(LSTM(32))
base_model_2.add(Dense(8, activation='relu'))
base_model_2.add(Dense(1, activation='sigmoid'))
base_model_2.compile(loss='binary_crossentropy', optimizer=tf.keras.optimizers.Adam(0.001), metrics=['accuracy'])
base_model_2.summary()


Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 200, 200)          3307600   
                                                                 
 lstm_5 (LSTM)               (None, 200, 128)          168448    
                                                                 
 lstm_6 (LSTM)               (None, 200, 64)           49408     
                                                                 
 lstm_7 (LSTM)               (None, 32)                12416     
                                                                 
 dense_4 (Dense)             (None, 8)                 264       
                                                                 
 dense_5 (Dense)             (None, 1)                 9         
                                                                 
Total params: 3,538,145
Trainable params: 230,545
Non-

In [29]:
d = base_model_2.fit(X_train_2, y_train_2, batch_size=64, epochs=10, verbose=1, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [30]:
pred=(base_model_2.predict(X_test_2) > 0.5).astype("int32")

In [31]:
from sklearn import metrics

print(metrics.classification_report(y_test_2, pred, target_names=['Fake', 'Real']))

              precision    recall  f1-score   support

        Fake       0.77      0.85      0.81      1020
        Real       0.85      0.77      0.81      1120

    accuracy                           0.81      2140
   macro avg       0.81      0.81      0.81      2140
weighted avg       0.81      0.81      0.81      2140



In [23]:
import pickle
# now you can save it to a file
with open('./late_fusion/word2vec.pkl', 'wb') as f:
    pickle.dump(base_model_2, f)



INFO:tensorflow:Assets written to: ram://7493baba-7ae0-4aeb-a9eb-afc44cabff69/assets


INFO:tensorflow:Assets written to: ram://7493baba-7ae0-4aeb-a9eb-afc44cabff69/assets


In [30]:
with open('./late_fusion/word2vec.pkl', 'rb') as f:
    saved_base_model_2 = pickle.load(f)

test_text_fake = 'Alfalfa is the only cure for COVID-19.'
test_text_real = '#IndiaFightsCorona India has one of the lowest #COVID19 mortality globally with less than 2% Case Fatality Rate. As a result of supervised home isolation &amp; effective clinical treatment many States/UTs have CFR lower than the national average. https://t.co/QLiK8YPP7E'

test_input_2  = pad_sequences(tokenizer.texts_to_sequences([test_text_fake, test_text_real]) , maxlen=input_length, dtype='float32')
pred_test=(base_model_2.predict(test_input_2) > 0.5).astype("int32")
pred_test

array([[0],
       [1]], dtype=int32)

# Model3: Glove

In [32]:
X_train_3 = twTrain["cleanTweet"]   # '0' refers to the review text
y_train_3 = twTrain["encodedLabel"]   # '1' corresponds to Label (1 - positive and 0 - negative)
X_test_3 = twValid['cleanTweet']
y_test_3 = twValid["encodedLabel"]

In [33]:
# Maximum number of words to be embedded
NUM_WORDS = 30000
EMBEDDING_DIM=100
# max length to be encoded for a sentence
MAX_SEQUENCE_LENGTH = 200

# Define Tokenize text function
tokenizer = Tokenizer(num_words=NUM_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\'', lower=True)
# Fit the function on the text
tokenizer.fit_on_texts(X_train_3)
sequences = tokenizer.texts_to_sequences(X_train_3)

# pad the data to the same length
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post')

# Count number of unique tokens
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 16536 unique tokens.


In [69]:
# saving tokenizer
with open('glove_tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [34]:
word_vectors = dict()

# load the whole embedding into memory
f = open('./w2c-glove/glove.6B.100d.txt', encoding="utf8")
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    word_vectors[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(word_vectors))

Loaded 400000 word vectors.


In [35]:
vocabulary_size=min(len(word_index)+1,(NUM_WORDS))
#vocabulary_size= len(word_index)+1
embedding_matrix = np.zeros((vocabulary_size, EMBEDDING_DIM))

for word, i in word_index.items():
    if i>=NUM_WORDS:
        continue
    try:
        embedding_vector = word_vectors[word]
        embedding_matrix[i] = embedding_vector
    except KeyError:
      # if the word is not found, set to all 0s
        vec = np.zeros(EMBEDDING_DIM)
        embedding_matrix[i]=vec


In [36]:
embedding_layer = Embedding(vocabulary_size,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

In [37]:
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, Bidirectional, GlobalMaxPool1D
from keras.models import Sequential, Model

inputs = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='float32')
embed_input = embedding_layer(inputs)

x = Bidirectional(LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(embed_input)
x = GlobalMaxPool1D()(x)
x = Dense(50, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(1, activation="sigmoid")(x)
base_model_3 = Model(inputs=inputs,outputs=x)
base_model_3.summary() 

x3 = base_model_3.output

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 200)]             0         
                                                                 
 embedding_4 (Embedding)     (None, 200, 100)          1653700   
                                                                 
 bidirectional (Bidirectiona  (None, 200, 100)         60400     
 l)                                                              
                                                                 
 global_max_pooling1d_1 (Glo  (None, 100)              0         
 balMaxPooling1D)                                                
                                                                 
 dense_6 (Dense)             (None, 50)                5050      
                                                                 
 dropout_1 (Dropout)         (None, 50)                0     

In [38]:
base_model_3.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
history = base_model_3.fit(data, y_train_3, epochs=2, verbose=1)

Epoch 1/2
Epoch 2/2


In [40]:
from sklearn import metrics

sequences = tokenizer.texts_to_sequences(X_test_3)
test_input_3 = pad_sequences(sequences, maxlen=200, padding='post')
pred= (base_model_3.predict(test_input_3) > 0.5).astype("int32")
print(metrics.classification_report(y_test_3, pred, target_names=['Fake', 'Real']))

              precision    recall  f1-score   support

        Fake       0.86      0.91      0.89      1020
        Real       0.92      0.87      0.89      1120

    accuracy                           0.89      2140
   macro avg       0.89      0.89      0.89      2140
weighted avg       0.89      0.89      0.89      2140



In [41]:
import pickle
# now you can save it to a file
with open('./late_fusion/glove.pkl', 'wb') as f:
    pickle.dump(base_model_3, f)

INFO:tensorflow:Assets written to: ram://48000818-2f4f-4536-87b8-21ac990901c0/assets




In [48]:
with open('./late_fusion/glove.pkl', 'rb') as f:
    saved_base_model_3 = pickle.load(f)

test_text_fake = 'Alfalfa is the only cure for COVID-19.'
test_text_real = '#IndiaFightsCorona India has one of the lowest #COVID19 mortality globally with less than 2% Case Fatality Rate. As a result of supervised home isolation &amp; effective clinical treatment many States/UTs have CFR lower than the national average. https://t.co/QLiK8YPP7E'

sequences = tokenizer.texts_to_sequences([test_text_fake, test_text_real])
test_input_3 = pad_sequences(sequences, maxlen=200, padding='post')

pred_test=(saved_base_model_3.predict(test_input_3) > 0.5).astype("int32")
pred_test













array([[0],
       [1]], dtype=int32)

# Model4: Bert

In [43]:
!pip install transformers
!pip install azureml-core

Collecting transformers
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 8.5 MB/s 
Collecting tokenizers!=0.11.3,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 41.2 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 5.9 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 55.8 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 49.9 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Fo

In [51]:
import torch

base_model_4 = torch.load('./lqq_transformer1/model')

In [None]:
from transformers import BertTokenizer

test_text_fake = 'Alfalfa is the only cure for COVID-19.'
test_text_real = '#IndiaFightsCorona India has one of the lowest #COVID19 mortality globally with less than 2% Case Fatality Rate. As a result of supervised home isolation &amp; effective clinical treatment many States/UTs have CFR lower than the national average. https://t.co/QLiK8YPP7E'

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

test_text = tokenizer(list([test_text_fake, test_text_real]), 
                          max_length = 128,           # Pad & truncate all sentences.
                          padding = 'max_length',
                          truncation=True,
                          return_attention_mask = True,   # Construct attn. masks.
                          return_tensors = 'pt'     # Return pytorch tensors.
)
test_seq = torch.tensor(test_text['input_ids']).to('cuda:0')
test_mask = torch.tensor(test_text['attention_mask']).to('cuda:0')

with torch.no_grad():
  outputs = base_model_4(test_seq, test_mask) # reference: https://www.kaggle.com/akshat0007/bert-for-sequence-classification
  pred_proba = outputs[0].detach().cpu().numpy()

preds = np.argmax(pred_proba, axis = 1)

print([preds.tolist(), pred_proba.tolist()])

[[0, 1], [[0.6307932734489441, -0.8636711239814758], [-2.8603923320770264, 2.058260202407837]]]


  from ipykernel import kernelapp as app
  app.launch_new_instance()


In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

test_text = tokenizer(list(test_X.values.tolist()), 
                          max_length = 128,           # Pad & truncate all sentences.
                          padding = 'max_length',
                          truncation=True,
                          return_attention_mask = True,   # Construct attn. masks.
                          return_tensors = 'pt'     # Return pytorch tensors.
)
# val_dataset = TensorDataset(encoded_textsValid['input_ids'], encoded_textsValid['attention_mask'], labelsValid)
test_seq = torch.tensor(test_text['input_ids']).to('cuda:0')
test_mask = torch.tensor(test_text['attention_mask']).to('cuda:0')

with torch.no_grad():
  outputs = base_model_4(test_seq, test_mask) # reference: https://www.kaggle.com/akshat0007/bert-for-sequence-classification
  pred_proba = outputs[0].detach().cpu().numpy()

preds = np.argmax(pred_proba, axis = 1)

  # This is added back by InteractiveShellApp.init_path()
  if sys.path[0] == '':


In [None]:
from sklearn import metrics
print(metrics.classification_report(test_y, preds, target_names=['Fake', 'Real']))

              precision    recall  f1-score   support

        Fake       0.91      0.95      0.93      1020
        Real       0.95      0.92      0.94      1120

    accuracy                           0.93      2140
   macro avg       0.93      0.93      0.93      2140
weighted avg       0.93      0.93      0.93      2140



# Do Ensemble

In [45]:
from transformers import BertTokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import torch

# loading models
# Model 1
with open('./tfidf/tfidf.pickle', 'rb') as f:
    saved_tf_idf = pickle.load(f)
with open('./tfidf/tfidf_lstm.pkl', 'rb') as f:
    saved_lstm = pickle.load(f)
# Model 2
with open('word2vec_tokenizer.pickle', 'rb') as handle:
    tokenizer_2 = pickle.load(handle)
with open('./late_fusion/word2vec.pkl', 'rb') as f:
    saved_base_model_2 = pickle.load(f)
# Model 3
with open('glove_tokenizer.pickle', 'rb') as handle:
    tokenizer_3 = pickle.load(handle)
with open('./late_fusion/glove.pkl', 'rb') as f:
    saved_base_model_3 = pickle.load(f)
# Model 4
saved_base_model_4 = torch.load('./lqq_transformer1/model')



test_text_fake = 'Alfalfa is the only cure for COVID-19.'

# Model 1
cleaned = cleanTweets(test_X.values.tolist())
test_tfidf = saved_tf_idf.transform(cleaned).toarray().reshape(-1,1,200)
preds_1 = (saved_lstm.predict(test_tfidf).ravel()>0.5)+0


# Model 2
test_input_2  = pad_sequences(tokenizer_2.texts_to_sequences(test_X) , maxlen=input_length, dtype='float32')
preds_2=(saved_base_model_2.predict(test_input_2) > 0.5).astype("int32")

# Model 3
test_input_3 = pad_sequences(tokenizer_3.texts_to_sequences(test_X), maxlen=200, padding='post')
preds_3 = (saved_base_model_3.predict(test_input_3) > 0.5).astype("int32")

# Model 4
tokenizer  = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
test_text = tokenizer(test_X.values.tolist(), 
                          max_length = 128,           # Pad & truncate all sentences.
                          padding = 'max_length',
                          truncation=True,
                          return_attention_mask = True,   # Construct attn. masks.
                          return_tensors = 'pt'     # Return pytorch tensors.
)
test_seq = torch.tensor(test_text['input_ids']).to('cuda:0')
test_mask = torch.tensor(test_text['attention_mask']).to('cuda:0')

with torch.no_grad():
  outputs = saved_base_model_4(test_seq, test_mask) # reference: https://www.kaggle.com/akshat0007/bert-for-sequence-classification
  pred_proba = outputs[0].detach().cpu().numpy()

preds_4 = np.argmax(pred_proba, axis = 1)


# Do voting
final_predict = [int(preds_1[i] * 0.2 + preds_2[i][0] * 0.1 + preds_3[i][0] * 0.3 + preds_4[i] * 0.4) for i in range(len(preds_4))]
# print(final_predict)

from sklearn import metrics
print(metrics.classification_report(test_y, final_predict, target_names=['Fake', 'Real']))













Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]



              precision    recall  f1-score   support

        Fake       0.75      0.99      0.85      1020
        Real       0.99      0.70      0.82      1120

    accuracy                           0.84      2140
   macro avg       0.87      0.85      0.84      2140
weighted avg       0.87      0.84      0.84      2140



# Conclution

The experiment result is as follows.

| Models   | Accuracy | Details |
|----------|----------|---------|
| TF-IDF   | 0.87     |         |
| Word2Vec | 0.81     |         |
| Glove    | 0.89     |         |
|Fine-tune BERT| 0.94 |         |
|Late fusion| 0.85 |         |
|Early fusion| 0.78 |         |

In [47]:
# Do voting
final_predict = [int(preds_1[i] * 0.1 + preds_2[i][0] * 0 + preds_3[i][0] * 0.1 + preds_4[i] * 0.8) for i in range(len(preds_4))]
# print(final_predict)

from sklearn import metrics
print(metrics.classification_report(test_y, final_predict, target_names=['Fake', 'Real']))

              precision    recall  f1-score   support

        Fake       0.80      0.98      0.88      1020
        Real       0.98      0.78      0.87      1120

    accuracy                           0.87      2140
   macro avg       0.89      0.88      0.87      2140
weighted avg       0.89      0.87      0.87      2140

