In [1]:
import torch
import numpy as np
import pandas as pd
from simpletransformers.classification import ClassificationModel
import re
import sklearn

# Seed PyTorch's and NumPy's global random number generators with the same fixed value.
torch.manual_seed(1525)
np.random.seed(1525)

In [2]:
#loading english, bengali and hindi data
import pickle as pkl


def _load_pickle(pickle_path):
    """Return the object stored in the pickle file at ``pickle_path``.

    NOTE(review): ``pickle.load`` can run arbitrary code while deserialising,
    so only point this at files that were produced locally or are trusted.
    """
    with open(pickle_path, 'rb') as pkl_in:
        return pkl.load(pkl_in)


tweets = _load_pickle('./resources/covid_en_tweet.pickle')      # english
tweets_bn = _load_pickle('./resources/covid_bn_tweet.pickle')   # bengali
tweets_hi = _load_pickle('./resources/covid_hi_tweet.pickle')   # hindi

In [3]:
#train - test split
def split(df, train_frac=0.80, eval_frac=0.50, random_state=0):
    """Split ``df`` into train, evaluation and test frames.

    A ``train_frac`` share of the rows is sampled for training. Of the rows
    left over, an ``eval_frac`` share becomes the evaluation set and the
    remainder becomes the test set. The defaults give an 80/10/10 split.

    Rows are removed by index label, so the index of ``df`` should be unique;
    with duplicate labels ``drop`` removes every row sharing a sampled label.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split. It is copied, never modified.
    train_frac : float, optional
        Fraction of ``df`` sampled into the training set.
    eval_frac : float, optional
        Fraction of the non-training rows sampled into the evaluation set.
    random_state : int, optional
        Seed passed to both ``sample`` calls.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_set, eval_set, test_set)``.
    """
    df_copy = df.copy()

    train_set = df_copy.sample(frac=train_frac, random_state=random_state)
    print(len(train_set), train_set.head())

    # Everything not sampled for training is shared between eval and test.
    held_out = df_copy.drop(train_set.index)

    eval_set = held_out.sample(frac=eval_frac, random_state=random_state)
    print(len(eval_set), eval_set.head())

    test_set_split = held_out.drop(eval_set.index)
    print(len(test_set_split), test_set_split.head())

    return train_set, eval_set, test_set_split

In [4]:
# Remove the 'text_info' entry before building the English frame.
# The membership check keeps this cell re-runnable: a bare `del` raises
# KeyError the second time the cell is executed in the same kernel.
if 'text_info' in tweets:
    del tweets['text_info']
df = pd.DataFrame(tweets)
print(df.head())
train_set, eval_set, test_set = split(df)

                                                text  labels
0  for the average american the best way to tell ...       0
1                           this is fucking bullshit       0
2  can y ’ all please just follow the government ...       0
3  no offense but the corona virus disappearing b...       0
4  this is the face of someone who just spent  9 ...       1
403                                                   text  labels
90   one of my coworkers died today from covid19 . ...       1
97   this is my mom . she ’ s my hero . she ’ s a  ...       1
476  y all got us fucked up ! ! keep that shit in s...       0
340  north americans : * hoarding toilet paper , vo...       1
395  the first silicon valley death from co vid  - ...       1
50                                                   text  labels
131  a message from the ministry of information “ t...       1
450  hello ,  2  5 th . â   heâ   s definitely ...       0
28               day  5 : we have rediscovered farming      

In [5]:
#simple text based classification
#very useful library : https://towardsdatascience.com/simple-transformers-introducing-the-easiest-bert-roberta-xlnet-and-xlm-library-58bf8c59b2a3
def fake_classify(train_set, eval_set, test_set, path):
    """Fine-tune multilingual BERT, evaluate it on ``test_set`` and save it.

    Parameters
    ----------
    train_set : pandas.DataFrame
        Data passed to ``model.train_model``.
    eval_set : pandas.DataFrame
        Data passed as ``eval_df`` for evaluation during training.
    test_set : pandas.DataFrame
        Held-out data passed to ``model.eval_model``.
    path : str
        Destination the whole trained model object is written to with
        ``torch.save``.

    Returns
    -------
    tuple
        ``(result, model_outputs, wrong_predictions)`` exactly as returned by
        ``model.eval_model``; ``result`` additionally carries the ``f1`` and
        ``acc`` metrics requested below.

    Notes
    -----
    The frames are presumably the ``text`` / ``labels`` frames produced by
    ``split`` -- confirm against the simpletransformers input format.
    """
    # A plain `import sklearn` does not promise that the `metrics` submodule
    # is loaded, so import it explicitly rather than relying on another
    # library having imported it first.
    import sklearn.metrics

    model_args = {
        'evaluate_during_training': True,
        'num_train_epochs': 3,
        'overwrite_output_dir': True,
        'manual_seed': 1525,
    }

    # Create a TransformerModel (CPU only)
    model = ClassificationModel('bert', 'bert-base-multilingual-uncased', args=model_args, use_cuda=False)

    # Train the model
    model.train_model(train_set, eval_df=eval_set)

    # Evaluate the model
    result, model_outputs, wrong_predictions = model.eval_model(
        test_set,
        f1=sklearn.metrics.f1_score,
        acc=sklearn.metrics.accuracy_score,
    )

    # Save the model; `torch` is the module-level import, no local re-import needed.
    torch.save(model, path)

    return result, model_outputs, wrong_predictions

In [6]:
def results(result):
    """Print precision, recall and F-score derived from confusion counts.

    Parameters
    ----------
    result : dict
        Must contain the keys ``'tp'``, ``'fp'`` and ``'fn'`` (true positives,
        false positives, false negatives). Other keys are only echoed in the
        raw-result line.

    Returns
    -------
    None
        The four report lines are written to stdout.

    Notes
    -----
    A metric whose denominator is zero (no predicted positives, no actual
    positives, or precision + recall == 0) is reported as ``0.0`` instead of
    raising ``ZeroDivisionError``.
    """
    tp, fp, fn = result['tp'], result['fp'], result['fn']

    predicted_positive = tp + fp
    actual_positive = tp + fn

    prec = tp / predicted_positive if predicted_positive else 0.0
    rec = tp / actual_positive if actual_positive else 0.0
    fscore = (2*prec*rec)/(prec + rec) if (prec + rec) else 0.0

    print('Raw result = ', result)
    print('Precision = ', prec)
    print('Recall = ', rec)
    print('F-Score = ', fscore)


In [7]:
# Locations the trained models are saved to by fake_classify and loaded from by predict:
# one per monolingual model (English, Bengali, Hindi) plus the merged multilingual model.
path_en = './resources/en_model'
path_bn = './resources/bn_model'
path_hi = './resources/hi_model'
path_multi = './resources/multi_model'

In [9]:
#english results
# Trains on the English split, saves the model to path_en and reports metrics on the English test set.
# NOTE: `model_outputs` and `wrong_predictions` are reassigned by every later fake_classify cell.
result, model_outputs, wrong_predictions = fake_classify(train_set, eval_set, test_set, path_en)
print('--------------------------------')
print('Classification Results : ')
results(result)

HBox(children=(FloatProgress(value=0.0, max=403.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=3.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0', max=51.0, style=ProgressStyle(descripti…

Running loss: 0.503382



Running loss: 0.335193




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1', max=51.0, style=ProgressStyle(descripti…

Running loss: 0.804690


HBox(children=(FloatProgress(value=0.0, description='Running Epoch 2', max=51.0, style=ProgressStyle(descripti…

Running loss: 0.925106



HBox(children=(FloatProgress(value=0.0, max=51.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=7.0, style=ProgressStyle(descrip…


--------------------------------
Classification Results : 
Raw result =  {'mcc': 0.5633927053243971, 'tp': 24, 'tn': 16, 'fp': 7, 'fn': 4, 'f1': 0.8135593220338982, 'acc': 0.7843137254901961, 'eval_loss': 0.7761548986392361}
Precision =  0.7741935483870968
Recall =  0.8571428571428571
F-Score =  0.8135593220338982


#### Results

| # | Configuration | Prec | Recall | F-score |
|---|---------------|------|--------|---------|
| 1 | My preprocessing + 1 epoch on the Covid19 dataset + no validation | 67.60 | 87.27 | 76.19 |
| 2 | Almost-infodemic preprocessing + 3 epochs + validation | 75.86 | 78.57 | 77.19 |

In [9]:
#classification on bengali tweets
# Remove the 'text_info' entry before building the Bengali frame. The
# membership check keeps the cell re-runnable (a bare `del` raises KeyError
# on the second execution in the same kernel).
if 'text_info' in tweets_bn:
    del tweets_bn['text_info']
df_bn = pd.DataFrame(tweets_bn)
print(df_bn)
train_set_bn, eval_set_bn, test_set_bn = split(df_bn)

                                                  text  labels
0    গড় আমেরিকানদের কাছে আপনার কোভিড -১৯ আছে কিনা ...       0
1                                   এই বকশি বাজানো হয়       0
2    আপনি কি দয়া করে কেবলমাত্র সরকারের নির্দেশাবলী...       0
3    কোনও অপরাধ নয় তবে এপ্রিলের আগে সফল হওয়ার আগে...       0
4    লন্ডনের আশেপাশের গুরুতর অসুস্থ কোভিড ১৯ রোগীদে...       1
..                                                 ...     ...
423  যদি এই টুইটটিতে 10 কেটি আরটি হয়, আমি লিন উপত্...       0
426  আমাদের কীভাবে সম্ভাব্য covid-19 ক্ষেত্রে যোগায...       0
427  মিডিয়া কর্নাভাইরাস পেয়ে কনজারভেটিভদের উদযাপন...       1
429  যদি করোনা এখানে কেবলমাত্র লোকেরা পাচ্ছে তবে .....       0
436  তুমি কর. আমি শুনেছি এটি শূকরদের একটি বৃহতভাবে ...       0

[380 rows x 2 columns]
304                                                   text  labels
165  এটিকে উদারপন্থী গণমাধ্যমে একটি প্রেসিডেন্সিয়া...       1
338                চাইনিজ করোনার ভাইরাস URL এর মতো হোন       0
305  breaking: আমি জরুরী ঘর

In [11]:
#bengali results
# Trains on the Bengali split, saves the model to path_bn and reports metrics on the Bengali test set.
# NOTE: this reassigns the shared `model_outputs` / `wrong_predictions` names.
result_bn, model_outputs, wrong_predictions = fake_classify(train_set_bn, eval_set_bn, test_set_bn, path_bn)
print('--------------------------------')
print('Classification Results : ')
results(result_bn)

HBox(children=(FloatProgress(value=0.0, max=304.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=3.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0', max=38.0, style=ProgressStyle(descripti…

Running loss: 0.622902




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1', max=38.0, style=ProgressStyle(descripti…

Running loss: 0.481021



Running loss: 0.467411


HBox(children=(FloatProgress(value=0.0, description='Running Epoch 2', max=38.0, style=ProgressStyle(descripti…

Running loss: 0.054094



HBox(children=(FloatProgress(value=0.0, max=38.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=5.0, style=ProgressStyle(descrip…


--------------------------------
Classification Results : 
Raw result =  {'mcc': 0.3967876966218537, 'tp': 11, 'tn': 15, 'fp': 3, 'fn': 9, 'f1': 0.6470588235294117, 'acc': 0.6842105263157895, 'eval_loss': 0.7761805176734924}
Precision =  0.7857142857142857
Recall =  0.55
F-Score =  0.6470588235294117


In [10]:
#classification on hindi tweets
# Remove the 'text_info' entry before building the Hindi frame. The
# membership check keeps the cell re-runnable (a bare `del` raises KeyError
# on the second execution in the same kernel).
if 'text_info' in tweets_hi:
    del tweets_hi['text_info']
df_hi = pd.DataFrame(tweets_hi)
print(df_hi)
train_set_hi, eval_set_hi, test_set_hi = split(df_hi)

                                                  text  labels
0    औसत अमेरिकी के लिए यह बताने का सबसे अच्छा तरीक...       0
1                                          यह बकवास है       0
2    क्या आप कृपया सरकार के निर्देशों का पालन कर सक...       0
3    कोई अपराध नहीं है लेकिन अप्रैल से पहले गायब हो...       0
4    यह किसी ऐसे व्यक्ति का चेहरा है जिसने लंदन के ...       1
..                                                 ...     ...
409  just in: राष्ट्रपति डॉटरे ने 10 मार्च से 14 मा...       1
415  breaking news: मेयर जॉय बेलमोंटे के अनुसार, qu...       1
416   इटली में हमने एक भयानक गलती की, हर कोई कहता र...       1
418  अपने होम्स के चारों ओर एक लड़की लाएँ और अचानक ...       0
437  कोरोना oppa pls उर दुनिया दौरे hyung यू को आरा...       0

[349 rows x 2 columns]
279                                                   text  labels
6    covid-19 के खत्म होने के बाद, मुझे बेहतर है कि...       0
54   वाह ... आज सुबह .... कभी नहीं देखा वेगास tbh स...       0
306                        

In [13]:
#hindi results
# Trains on the Hindi split, saves the model to path_hi and reports metrics on the Hindi test set.
# NOTE: this reassigns the shared `model_outputs` / `wrong_predictions` names.
result_hi, model_outputs, wrong_predictions = fake_classify(train_set_hi, eval_set_hi, test_set_hi, path_hi)
print('--------------------------------')
print('Classification Results : ')
results(result_hi)

HBox(children=(FloatProgress(value=0.0, max=279.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=3.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0', max=35.0, style=ProgressStyle(descripti…

Running loss: 0.655572




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1', max=35.0, style=ProgressStyle(descripti…

Running loss: 0.309549



Running loss: 0.269389


HBox(children=(FloatProgress(value=0.0, description='Running Epoch 2', max=35.0, style=ProgressStyle(descripti…

Running loss: 0.495954



HBox(children=(FloatProgress(value=0.0, max=35.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=5.0, style=ProgressStyle(descrip…


--------------------------------
Classification Results : 
Raw result =  {'mcc': 0.22420074340397805, 'tp': 18, 'tn': 5, 'fp': 8, 'fn': 4, 'f1': 0.7500000000000001, 'acc': 0.6571428571428571, 'eval_loss': 0.7776981234550476}
Precision =  0.6923076923076923
Recall =  0.8181818181818182
F-Score =  0.7500000000000001


In [11]:
#multilingual model
# Stack the English, Bengali and Hindi frames. `ignore_index=True` renumbers
# the rows 0..n-1 in one step; split() drops rows by index label, so the
# labels repeated across the three source frames must not survive the merge.
frames = [df, df_bn, df_hi]
df_merged = pd.concat(frames, ignore_index=True)
train_set_merge, eval_set_merge, test_set_merge = split(df_merged)

986                                                   text  labels
18   maybe if i develop feelings for co vid  -  19 ...       0
342  bigoted statements which spread misinformation...       1
467  dude said this like he has all the infinity st...       0
852  করোনা একটি কালো আলো এবং আমেরিকা একটি বাঁধা হোট...       0
980  दोस्त। covid-19 के कारण लौटने वाले प्रकृति उपच...       1
124                                                   text  labels
444  the country is panic stricken over the  corona...       1
591  ইয়া আল্লাহ, আপনি ইতিমধ্যে তাদেরকে বিশ্বজুড়ে ...       0
341  our  flatten the curve graphic is now up on @ ...       1
594  মিচ covid-19 বিলে দ্বিপক্ষীয় কাজ পরিত্যাগ করে...       1
324  @waltshaub please repost this . it shares vita...       0
123                                                  text  labels
11  so , the last week i have been battling co vid...       1
23  bro china is out here reading niggas power lev...       0
24  when this corona shit is over please invit

In [12]:
#multilingual results
# Trains on the merged split, saves the model to path_multi and reports metrics on the merged test set.
# NOTE: this reassigns the shared `model_outputs` / `wrong_predictions` names.
result_multi, model_outputs, wrong_predictions = fake_classify(train_set_merge, eval_set_merge, test_set_merge, path_multi)
print('--------------------------------')
print('Classification Results : ')
results(result_multi)

HBox(children=(FloatProgress(value=0.0, max=986.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=3.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0', max=124.0, style=ProgressStyle(descript…

Running loss: 0.724259



Running loss: 0.473049




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1', max=124.0, style=ProgressStyle(descript…

Running loss: 0.412328


HBox(children=(FloatProgress(value=0.0, description='Running Epoch 2', max=124.0, style=ProgressStyle(descript…

Running loss: 0.013812



HBox(children=(FloatProgress(value=0.0, max=123.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=16.0, style=ProgressStyle(descri…


--------------------------------
Classification Results : 
Raw result =  {'mcc': 0.6276807925343189, 'tp': 69, 'tn': 33, 'fp': 12, 'fn': 9, 'f1': 0.8679245283018868, 'acc': 0.8292682926829268, 'eval_loss': 0.5542476478149183}
Precision =  0.8518518518518519
Recall =  0.8846153846153846
F-Score =  0.8679245283018868


In [14]:
#storing model outputs of mono and multilingual models
# `model_outputs` is reassigned by every fake_classify cell, so this writes the
# outputs of whichever of those cells ran most recently. Run top to bottom, that
# is the multilingual run.
# NOTE(review): the recorded execution counts are out of order (the Hindi cell is
# In [13], the multilingual cell In [12]) -- confirm which outputs were saved.
with open('./resources/multi_raw_outputs.pickle', 'wb') as pkl_out:
    pkl.dump(model_outputs, pkl_out)
# NOTE(review): the triple-quoted string below is disabled code kept as a string
# literal. Being the last expression in the cell, its text is echoed as the cell
# output. The en_/bn_/hi_model_outputs names it uses are not assigned anywhere in
# the visible notebook.
'''with open('./resources/en_raw_outputs.pickle', 'wb') as pkl_out:
    pkl.dump(en_model_outputs, pkl_out)
with open('./resources/bn_raw_outputs.pickle', 'wb') as pkl_out:
    pkl.dump(bn_model_outputs, pkl_out)
with open('./resources/hi_raw_outputs.pickle', 'wb') as pkl_out:
    pkl.dump(hi_model_outputs, pkl_out)'''

"with open('./resources/en_raw_outputs.pickle', 'wb') as pkl_out:\n    pkl.dump(en_model_outputs, pkl_out)\nwith open('./resources/bn_raw_outputs.pickle', 'wb') as pkl_out:\n    pkl.dump(bn_model_outputs, pkl_out)\nwith open('./resources/hi_raw_outputs.pickle', 'wb') as pkl_out:\n    pkl.dump(hi_model_outputs, pkl_out)"

In [15]:
# Both patterns are compiled once here instead of on every preprocess() call.
_URL_PATTERN = re.compile(r'http\S+', flags=re.MULTILINE)
_EMOJI_PATTERN = re.compile(
    "["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # flags (regional indicator letters)
    u"\U00002500-\U00002BEF"  # box drawing, shapes, misc symbols, dingbats, arrows
    u"\U00002702-\U000027B0"  # dingbats
    # NOTE(review): the next range is very wide -- besides enclosed characters it
    # also covers CJK ideographs, kana and Hangul, which are therefore stripped
    # too. Confirm this is intended.
    u"\U000024C2-\U0001F251"
    u"\U0001f926-\U0001f937"  # people / gesture emoji
    u"\U00010000-\U0010ffff"  # every code point outside the Basic Multilingual Plane
    u"\u2640-\u2642"          # gender signs
    u"\u2600-\u2B55"          # misc symbols .. heavy large circle
    u"\u200d"                 # zero width joiner
    u"\u23cf"                 # eject symbol
    u"\u23e9"                 # fast-forward symbol
    u"\u231a"                 # watch
    u"\ufe0f"                 # variation selector-16 (emoji presentation)
    u"\u3030"                 # wavy dash
    "]+", flags=re.UNICODE)


def preprocess(tweet):
    """Normalise a raw tweet before it is passed to the classifier.

    Steps, in order: lower-case the text, replace every ``http...`` token
    with the literal ``URL``, delete emoji/symbol characters, collapse all
    whitespace runs to single spaces, and strip one leading ``#`` from each
    word.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned tweet.
    """
    tweet = tweet.lower()
    tweet = _URL_PATTERN.sub('URL', tweet)
    tweet = _EMOJI_PATTERN.sub(r'', tweet)
    # split() never yields empty words, so word[0] is always safe here.
    tweet = ' '.join([word[1:] if word[0] == '#' else word for word in tweet.split()])
    return tweet

In [16]:
#getting predictions on real tweets
def predict(path, sent):
    """Classify one tweet with a saved model and print verdict and confidence.

    Parameters
    ----------
    path : str
        File the model object was written to with ``torch.save``.
    sent : str
        Raw tweet text; it is cleaned with ``preprocess`` before prediction.

    Returns
    -------
    None
        The cleaned tweet, the verdict and the confidence are printed.
    """
    # NOTE(review): torch.load unpickles the file, which can execute arbitrary
    # code -- only load model files that were produced locally or are trusted.
    # Recent PyTorch releases may also require `weights_only=False` to load a
    # full model object; verify against the installed version.
    model = torch.load(path)
    sent = preprocess(sent)
    p, ro = model.predict([sent])

    # Softmax over the raw outputs of the single input sentence, computed once.
    # Subtracting the maximum first keeps np.exp from overflowing on large
    # values without changing the resulting probabilities.
    raw_scores = np.asarray(ro[0], dtype=float)
    shifted_exp = np.exp(raw_scores - raw_scores.max())
    probs = shifted_exp / shifted_exp.sum()

    result = 'This tweet has a verifiable claim.' if p[0] == 1 else 'This tweet does not have a verifiable claim.'
    # Confidence is the probability of class 1 when that is the prediction, else of class 0.
    cscore = probs[1]*100 if p[0] == 1 else probs[0]*100
    print(sent, ' : ', result)
    print('The model says this with a',round(cscore, 2), '% confidence score.')

In [17]:
#predict english tweets
# Interactive cell: input() waits for a tweet to be typed or pasted, then the saved English model classifies it.
sent = input()
predict(path_en, sent)

Our govt has pro-actively strengthened the medical infrastructure in the NE States to fight #COVID19. All states have a low number of cases. As on date, the active cases are 3731, the recoveries exceed this at 5715. No mortality in Manipur, Mizoram, Nagaland & Sikkim. @MoHFW_INDIA


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


our govt has pro-actively strengthened the medical infrastructure in the ne states to fight covid19. all states have a low number of cases. as on date, the active cases are 3731, the recoveries exceed this at 5715. no mortality in manipur, mizoram, nagaland & sikkim. @mohfw_india  :  This tweet has a verifiable claim.
The model says this with a 96.9 % confidence score.


In [18]:
#predict english tweets
# Interactive cell: same as the previous English prediction cell, used for a second example tweet.
sent = input()
predict(path_en, sent)

Does taking a hot bath prevent #COVID19    NO. But daily bath is recommended to maintain good hygiene which may indirectly reduce the risk of many infections. #StaySafe #IndiaFightsCorona


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


does taking a hot bath prevent covid19 no. but daily bath is recommended to maintain good hygiene which may indirectly reduce the risk of many infections. staysafe indiafightscorona  :  This tweet has a verifiable claim.
The model says this with a 96.69 % confidence score.


In [19]:
#predict bengali tweets
# Interactive cell: input() waits for a tweet, then the saved Bengali model classifies it.
sent = input()
predict(path_bn, sent)

করোনায় ১৩ ভাগ চাকরিজীবী বেকার হয়েছেন: বিআইডিএস https://p.dw.com/p/3eOdh #Bangladesh #coronavirus


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


করোনায় ১৩ ভাগ চাকরিজীবী বেকার হয়েছেন: বিআইডিএস URL bangladesh coronavirus  :  This tweet does not have a verifiable claim.
The model says this with a 85.46 % confidence score.


In [None]:
#predict hindi tweets #example from BBC News Hindi
# Interactive cell: input() waits for a tweet, then the saved Hindi model classifies it.
sent = input()
predict(path_hi, sent)

In [20]:
#predict multilingual tweets #example from DW Bangla account
# Interactive cell: input() waits for a tweet, then the saved multilingual model classifies it.
sent = input()
predict(path_multi, sent)

করোনায় ১৩ ভাগ চাকরিজীবী বেকার হয়েছেন: বিআইডিএস https://p.dw.com/p/3eOdh #Bangladesh #coronavirus


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


করোনায় ১৩ ভাগ চাকরিজীবী বেকার হয়েছেন: বিআইডিএস URL bangladesh coronavirus  :  This tweet has a verifiable claim.
The model says this with a 98.65 % confidence score.
