#**COVID 19 ENGLISH TWEET CLASSIFICATION**
Dataset link:
- [!gdown --id 1aqWjA4bbRdI_LtM_AWtgZMdmTbf-JO_i](https://)  #train_clean.tsv file
- [!gdown --id 11wYrZwPKkiZXprubFm2U-Td6NnmeKJKB](https://)  #val_clean.tsv file

#Embedding - **GLOVE Embedding**
#Model - **BiLSTM**

In [1]:
import requests, zipfile, io
zip_file_url = "http://nlp.stanford.edu/data/glove.6B.zip"
r = requests.get(zip_file_url)
z = zipfile.ZipFile(io.BytesIO(r.content))
z.extractall()

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os, re, csv, math, codecs

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer #word stemmer class
lemma = WordNetLemmatizer()

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Bidirectional, LSTM, Dropout, BatchNormalization
from keras.layers.embeddings import Embedding

In [3]:
print('loading word embeddings...')
embeddings_index = dict()
f = codecs.open('glove.6B.300d.txt', encoding='utf-8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('found %s word vectors' % len(embeddings_index))


loading word embeddings...
found 400000 word vectors


In [4]:
# Let’s read train and val datasets.
!gdown --id 1aqWjA4bbRdI_LtM_AWtgZMdmTbf-JO_i #train_clean.tsv file
!gdown --id 11wYrZwPKkiZXprubFm2U-Td6NnmeKJKB #val_clean.tsv file

trainData = pd.read_csv("/content/train_clean.tsv",sep="\t",header=0)
valData = pd.read_csv("/content/valid_clean.tsv",sep="\t",header=0)

Label_dict = {
    'INFORMATIVE': 1,
    'UNINFORMATIVE': 0
}
trainData['Label'] = trainData.Label.replace(Label_dict)
valData['Label'] = valData.Label.replace(Label_dict)

Downloading...
From: https://drive.google.com/uc?id=1aqWjA4bbRdI_LtM_AWtgZMdmTbf-JO_i
To: /content/train_clean.tsv
100% 1.70M/1.70M [00:00<00:00, 53.3MB/s]
Downloading...
From: https://drive.google.com/uc?id=11wYrZwPKkiZXprubFm2U-Td6NnmeKJKB
To: /content/valid_clean.tsv
100% 245k/245k [00:00<00:00, 36.0MB/s]


In [5]:
trainData.shape,valData.shape

((7000, 3), (1000, 3))

In [6]:
list(trainData["Text"].head(20))

['Official death toll from covid19 in the United Kingdom is now GREATER than Germany  Poland  Switzerland  Austria  Portugal  Greece  Sweden  Finland  Norway  Ireland COMBINED UK 675 Million 233 dead Above group 185 Million 230 dead HTTPURL',
 'Dearest Mr President  1169 coronavirus deaths in the US in 24 hours  Covid19 pandemic is an international crime from China  not a nature disasster Please use your authorities to protect your people and world against China ChinaHasToCompensateAll',
 'Latest Updates March 20 5274 new cases and 38 new deaths in the United States Illinois Governo Pritzker issues stay at home order for all residents New York Governor Cuomo orders 100 of all nonessential workers to stay home PennsSource  coronaviruscountryus ',
 '  BREAKING 21 people on Grand Princess cruise ship docked off the California coast tested positive for coronavirus including 19 crew members and two passengers Vice Pres Mike Pence says 24 people tested negative HTTPURL HTTPURL',
 'OKLAHOMA C

In [7]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
nltk.download('punkt')
def clean_text(txt):
    """""
    cleans the input text in the following steps
    1- replace contractions
    2- removing punctuation
    3- spliting into words
    4- removing stopwords
    5- removing leftover punctuations
    """""
    contraction_dict = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not", "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",  "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",  "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have","you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have"}
    def _get_contractions(contraction_dict):
        contraction_re = re.compile('(%s)' % '|'.join(contraction_dict.keys()))
        return contraction_dict, contraction_re

    def replace_contractions(text):
        contractions, contractions_re = _get_contractions(contraction_dict)
        def replace(match):
            return contractions[match.group(0)]
        return contractions_re.sub(replace, text)

    # replace contractions
    txt = replace_contractions(txt)

    # Convert to lowercase
    txt = txt.lower()
    
    #remove punctuations
    txt = re.sub('[0-9]+', '', txt)
    
    # split into words
    from nltk.tokenize import word_tokenize
    words = word_tokenize(txt)
    
    # remove stopwords
    stop_words = set(stopwords.words('english'))
    words = [w for w in words if not w in stop_words]
    
    # removing leftover punctuations
    words = [word for word in words if word.isalpha()]
    
    cleaned_text = ' '.join(words)
    return cleaned_text
    

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [8]:
trainData['Text'] = trainData.Text.apply(lambda txt: clean_text(txt))
display(trainData.head())

Unnamed: 0,Id,Text,Label
0,1.24149e+18,official death toll covid united kingdom great...,1
1,1.245916e+18,dearest mr president coronavirus deaths us hou...,1
2,1.241132e+18,latest updates march new cases new deaths unit...,1
3,1.236107e+18,breaking people grand princess cruise ship doc...,1
4,1.239674e+18,oklahoma city state department education annou...,0


In [9]:
valData['Text'] = valData.Text.apply(lambda txt: clean_text(txt))
display(valData.head())

Unnamed: 0,Id,Text,Label
0,1241728922192142336,saying pakistan italy weeks first confirmed ca...,0
1,1235713405992030209,second case dr canadian woman identified howev...,1
2,1245941302367305728,kill chain cyber war americas elections must s...,0
3,1245913002840391681,town hosts first virtual towncouncil meeting v...,0
4,1240543259299987457,report suggested actual number undiagnosed cor...,0


In [10]:
trainData.shape, valData.shape

((7000, 3), (1000, 3))

**Tokenize**

To apply the *glove embeddings*, we have to first convert our text to sequences. 

We can use *keras* to define a vocabulary in which each word will have a unique index. We will pad shorter sentences to the max length (length of longest tweet after preprocessing).

In [11]:
max_length = trainData.Text.apply(lambda x: len(x.split())).max()

t = Tokenizer()
t.fit_on_texts(trainData.Text)
vocab_size = len(t.word_index) + 1
encoded_tweets = t.texts_to_sequences(trainData.Text)
padded_tweets = pad_sequences(encoded_tweets, maxlen=max_length, padding='post')

vocab_size = len(t.word_index) + 1

In [12]:
vocab_size

20001

In [13]:
padded_tweets.shape

(7000, 47)

Now we map each unique word index with its *glove Vector*.



In [14]:
embedding_matrix = np.zeros((vocab_size, 300))
for word, i in t.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
embedding_matrix = pd.DataFrame(embedding_matrix)

In [15]:
embedding_matrix.shape

(20001, 300)

In [16]:
max_length2 = valData.Text.apply(lambda x: len(x.split())).max()

t2 = Tokenizer()
t2.fit_on_texts(valData.Text)
vocab_size2 = len(t.word_index) + 1
encoded_tweets2 = t.texts_to_sequences(valData.Text)
padded_tweets2 = pad_sequences(encoded_tweets2, maxlen=max_length2, padding='post')

vocab_size2 = len(t.word_index) + 1

In [17]:
padded_tweets2.shape

(1000, 36)

In [18]:
embedding_matrix2 = np.zeros((vocab_size2, 300))
for word, i in t.word_index.items():
    embedding_vector2 = embeddings_index.get(word)
    if embedding_vector2 is not None:
        embedding_matrix2[i] = embedding_vector2
embedding_matrix2 = pd.DataFrame(embedding_matrix2)

In [19]:
embedding_matrix2.shape

(20001, 300)

**Train the model**

In [20]:
xtrain_glove = padded_tweets
ytrain = trainData["Label"]
xvalid_glove = padded_tweets2
yvalid = valData["Label"]

In [21]:
print(xtrain_glove.shape, xvalid_glove.shape, ytrain.shape,yvalid.shape)

(7000, 47) (1000, 36) (7000,) (1000,)


In [22]:
model_glove = Sequential()
model_glove.add(Embedding(vocab_size, 300, input_length=max_length, weights=[embedding_matrix], trainable=True))
model_glove.add(Bidirectional(LSTM(20, return_sequences=True)))
model_glove.add(Dropout(0.2))
model_glove.add(BatchNormalization())
model_glove.add(Bidirectional(LSTM(20, return_sequences=True)))
model_glove.add(Dropout(0.2))
model_glove.add(BatchNormalization())
model_glove.add(Bidirectional(LSTM(20)))
model_glove.add(Dropout(0.2))
model_glove.add(BatchNormalization())
model_glove.add(Dense(64, activation='relu'))
model_glove.add(Dense(64, activation='relu'))
model_glove.add(Dense(1, activation='sigmoid'))
model_glove.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model_glove.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 47, 300)           6000300   
_________________________________________________________________
bidirectional (Bidirectional (None, 47, 40)            51360     
_________________________________________________________________
dropout (Dropout)            (None, 47, 40)            0         
_________________________________________________________________
batch_normalization (BatchNo (None, 47, 40)            160       
_________________________________________________________________
bidirectional_1 (Bidirection (None, 47, 40)            9760      
_________________________________________________________________
dropout_1 (Dropout)          (None, 47, 40)            0         
_________________________________________________________________
batch_normalization_1 (Batch (None, 47, 40)            1

**Fit Train Data**

In [23]:
xtrain_glove

array([[  378,    16,    74, ...,     0,     0,     0],
       [ 8758,  1095,   113, ...,     0,     0,     0],
       [  120,   167,    40, ...,     0,     0,     0],
       ...,
       [19993,   463,   896, ...,     0,     0,     0],
       [  421,     2,    41, ...,     0,     0,     0],
       [ 7536,   176,    45, ...,     0,     0,     0]], dtype=int32)

In [24]:
## Fit train data
model_glove.fit(xtrain_glove, ytrain, epochs = 10)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f78459e4208>

**Prediction**

In [25]:
yvalid.dtype

dtype('int64')

In [26]:
y_pred = model_glove.predict(xvalid_glove)
y_pred = [0 if i<0.5 else 1 for i in y_pred]
np.savetxt('/content/validpred_glove.txt',y_pred,delimiter=',')
print("Val Prediction File is ready!")

Val Prediction File is ready!


In [27]:
print(classification_report(yvalid, y_pred))

              precision    recall  f1-score   support

           0       0.79      0.86      0.82       528
           1       0.82      0.74      0.78       472

    accuracy                           0.80      1000
   macro avg       0.80      0.80      0.80      1000
weighted avg       0.80      0.80      0.80      1000



**Thank You**