In [None]:
import torch
import numpy as np
import pandas as pd
from simpletransformers.classification import ClassificationModel
import re
import sklearn

# Seed PyTorch's and NumPy's global random number generators with a fixed
# value so repeated runs of the notebook draw the same random numbers.
# The same value (1525) is also passed as 'manual_seed' to the classifier
# built in fake_classify.
torch.manual_seed(1525)
np.random.seed(1525)

In [None]:
import pickle as pkl


def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``.

    NOTE(review): unpickling can execute arbitrary code, so only point this
    at files that come from a trusted source.
    """
    with open(path, 'rb') as handle:
        return pkl.load(handle)


# Per-language tweet data, one pickle file each.
tweets_en = _load_pickle('./resources/covid_en_tweet.pickle')  # English
tweets_bn = _load_pickle('./resources/covid_bn_tweet.pickle')  # Bengali
tweets_hi = _load_pickle('./resources/covid_hi_tweet.pickle')  # Hindi

In [None]:
#train - test split
def split(df):
    """Split ``df`` into an 80 % training frame and a 20 % held-out frame.

    Rows are assigned by *position*, not by index label, so the two returned
    frames always partition ``df`` exactly. A label-based ``drop`` would
    remove every row sharing a sampled label, silently shrinking the test set
    whenever ``df.index`` contains duplicates (e.g. after ``pd.concat``
    without re-indexing). For a unique index the selected rows are the same
    as ``df.sample(frac=0.80, random_state=0)`` and its complement.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split. It is not modified.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train_set, test_set_split)``: the sampled 80 % in sampled order,
        and the remaining rows in their original order. Both keep the
        original index labels.
    """
    # Sample row positions with the fixed seed (0) so the split is repeatable.
    positions = pd.Series(range(len(df)))
    train_positions = positions.sample(frac=0.80, random_state=0).to_numpy()

    train_set = df.iloc[train_positions]
    print(len(train_set), train_set.head())

    # Everything that was not sampled becomes the held-out set.
    held_out_mask = (~positions.isin(train_positions)).to_numpy()
    test_set_split = df.iloc[held_out_mask]
    return train_set, test_set_split

In [None]:
#simple text based classification
#very useful library : https://towardsdatascience.com/simple-transformers-introducing-the-easiest-bert-roberta-xlnet-and-xlm-library-58bf8c59b2a3
def fake_classify(train_set, test_set):
    """Fine-tune a multilingual BERT classifier and evaluate it.

    A fresh ``ClassificationModel`` ('bert', 'bert-base-multilingual-uncased')
    is created on CPU, trained for 3 epochs on ``train_set`` and evaluated on
    ``test_set`` with F1 and accuracy as extra metrics.

    Parameters
    ----------
    train_set : pandas.DataFrame
        Passed unchanged to ``model.train_model``.
        NOTE(review): presumably in the text/label column layout that
        simpletransformers expects; confirm against the pickled data.
    test_set : pandas.DataFrame
        Passed unchanged to ``model.eval_model``.

    Returns
    -------
    tuple
        ``(model, result, model_outputs, wrong_predictions)``: the trained
        model followed by the three values returned by ``model.eval_model``.
    """
    # Import the metric functions explicitly. A bare ``import sklearn`` does
    # not guarantee that the ``sklearn.metrics`` submodule is loaded, so the
    # attribute lookup would only work if another import had loaded it.
    from sklearn.metrics import accuracy_score, f1_score

    model = ClassificationModel(
        'bert',
        'bert-base-multilingual-uncased',
        args={'num_train_epochs': 3, 'overwrite_output_dir': True, 'manual_seed': 1525},
        use_cuda=False,
    )

    model.train_model(train_set)

    result, model_outputs, wrong_predictions = model.eval_model(
        test_set, f1=f1_score, acc=accuracy_score
    )

    return model, result, model_outputs, wrong_predictions

In [None]:
def results(result):
    """Print precision, recall and F-score for a set of confusion counts.

    Parameters
    ----------
    result : mapping
        Must provide the counts under the keys 'tp', 'fp' and 'fn'.

    Returns
    -------
    float
        The F-score. Precision, recall and F-score are each reported as 0.0
        when their denominator is zero (no predicted positives, no actual
        positives, or both precision and recall zero) instead of raising
        ZeroDivisionError.
    """
    tp, fp, fn = result['tp'], result['fp'], result['fn']

    predicted_pos = tp + fp
    actual_pos = tp + fn
    prec = tp / predicted_pos if predicted_pos else 0.0
    rec = tp / actual_pos if actual_pos else 0.0
    fscore = (2*prec*rec)/(prec + rec) if (prec + rec) else 0.0

    print('Raw result = ', result)
    print('Precision = ', prec )
    print('Recall = ', rec)
    print('F-Score = ', fscore)
    return fscore

In [None]:
# Filesystem locations of the serialized models: one per language plus the
# combined multilingual one.
# Only path_multi is written in this notebook (torch.save in the
# cross-validation cell).
# NOTE(review): path_en / path_bn / path_hi are only read (by predict);
# presumably those models were saved by another run. Confirm they exist.
path_en = './resources/en_model'
path_bn = './resources/bn_model'
path_hi = './resources/hi_model'
path_multi = './resources/multi_model'

In [None]:
# Build the English DataFrame without the 'text_info' entry.
# The membership check keeps this cell re-runnable: tweets_en is mutated in
# place, so a bare `del` raises KeyError the second time the cell is run.
if 'text_info' in tweets_en:
    del tweets_en['text_info']
df_en = pd.DataFrame(tweets_en)
print(df_en.head())

In [None]:
# Build the Bengali DataFrame without the 'text_info' entry.
# The membership check keeps this cell re-runnable: tweets_bn is mutated in
# place, so a bare `del` raises KeyError the second time the cell is run.
if 'text_info' in tweets_bn:
    del tweets_bn['text_info']
df_bn = pd.DataFrame(tweets_bn)
print(df_bn)

In [None]:
# Build the Hindi DataFrame without the 'text_info' entry.
# The membership check keeps this cell re-runnable: tweets_hi is mutated in
# place, so a bare `del` raises KeyError the second time the cell is run.
if 'text_info' in tweets_hi:
    del tweets_hi['text_info']
df_hi = pd.DataFrame(tweets_hi)
print(df_hi)

In [None]:
# Load a second Bengali pickle (the file name suggests a separate test set)
# and turn it into a DataFrame.
# NOTE(review): pickle.load can execute arbitrary code; only use trusted files.
with open('./resources/covid_bn_tweet_test.pickle', 'rb') as pkl_in:
    tweets_bn_test = pkl.load(pkl_in)
# Remove the 'text_info' entry before building the frame. This is safe to
# re-run because the object is freshly loaded just above.
del tweets_bn_test['text_info']
df_bn_test = pd.DataFrame(tweets_bn_test)
# Bare last expression: displays the frame as the cell output.
df_bn_test

In [None]:
# Multilingual data set: stack the English, Bengali, Hindi and extra Bengali
# frames (in that order) and renumber the rows 0..n-1 so labels are unique.
frames = [df_en, df_bn, df_hi, df_bn_test]
df_merged = pd.concat(frames, ignore_index=True)
df_merged

In [None]:
#multilingual results
# 5-fold cross-validation of the multilingual model on df_merged. The model
# with the best F-score so far is saved to path_multi, and each fold's test
# indices and raw model outputs are collected in model_outputs_multi.
from sklearn.model_selection import KFold
import torch

best_result = 0
# NOTE(review): KFold is used without shuffling, so folds are contiguous row
# ranges. df_merged is built by concatenating the per-language frames in
# order, so each test fold is drawn from a contiguous slice of that
# concatenation. Consider shuffle=True with a fixed random_state if mixed
# folds are intended.
kf = KFold(n_splits=5)
model_outputs_multi = {}   # fold number -> {'indices': ..., 'outputs': ...}
for count, (train, test) in enumerate(kf.split(df_merged), start=1):
    print('--------------------------', count, '------------------------------')
    # kf.split yields positional row indices, so select by position.
    df_train_multi = df_merged.iloc[train]
    df_test_multi = df_merged.iloc[test]
    print(len(df_train_multi), len(df_test_multi))

    # The per-fold outputs get their own name. Unpacking into
    # model_outputs_multi would overwrite the accumulator dict.
    model_multi, result_multi, fold_outputs_multi, wrong_predictions_multi = fake_classify(df_train_multi, df_test_multi)

    fscr = results(result_multi)
    if fscr > best_result:
        best_result = fscr
        torch.save(model_multi, path_multi)

    model_outputs_multi[count] = {
        'indices': test,
        'outputs': fold_outputs_multi,
    }

In [None]:
# Persist model_outputs_multi (the outputs collected for the multilingual
# model in the cross-validation cell) to disk. Only this one object is written.
with open('./resources/multi_raw_outputs.pickle', 'wb') as pkl_out:
    pkl.dump(model_outputs_multi, pkl_out)

In [None]:
def preprocess(tweet):
    """Normalise a raw tweet string before classification.

    Steps, in order:
      1. lowercase the text;
      2. replace every token starting with ``http`` by the literal ``URL``;
      3. delete every character matched by the emoji/symbol class below;
      4. strip the leading ``#`` from hashtags and re-join the
         whitespace-separated words with single spaces.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned text.
    """
    lowered = tweet.lower()
    without_urls = re.sub(r'http\S+', 'URL', lowered, flags=re.MULTILINE)

    # Character class of everything to delete. Some ranges overlap, and the
    # U+24C2..U+1F251 and U+10000..U+10FFFF ranges are very broad: they cover
    # far more than emoji (for example the CJK ideograph blocks).
    symbol_pattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
        u"\U00002500-\U00002BEF"  # box drawing through misc. symbols and arrows
        u"\U00002702-\U000027B0"  # dingbats
        u"\U00002702-\U000027B0"  # (repeated range)
        u"\U000024C2-\U0001F251"  # enclosed characters onwards (very broad)
        u"\U0001f926-\U0001f937"  # supplemental people emoji
        u"\U00010000-\U0010ffff"  # all supplementary planes
        u"\u2640-\u2642"          # gender signs
        u"\u2600-\u2B55"          # misc. symbols
        u"\u200d"                 # zero width joiner
        u"\u23cf"                 # eject symbol
        u"\u23e9"                 # fast-forward symbol
        u"\u231a"                 # watch
        u"\ufe0f"                 # variation selector-16
        u"\u3030"                 # wavy dash
        "]+",
        flags=re.UNICODE,
    )
    without_symbols = symbol_pattern.sub(r'', without_urls)

    # Drop the leading '#' of hashtags but keep the tag text itself.
    words = []
    for word in without_symbols.split():
        words.append(word[1:] if word.startswith('#') else word)
    return ' '.join(words)

In [None]:
#getting predictions on real tweets
def predict(path, sent):
    """Load the model stored at ``path`` and print its verdict on one tweet.

    The tweet is cleaned with ``preprocess``, classified, and the raw outputs
    for the first two classes are turned into softmax probabilities. The
    probability of the predicted class is printed as a confidence percentage.

    Parameters
    ----------
    path : str
        File from which ``torch.load`` restores the model.
    sent : str
        Raw tweet text.

    Returns
    -------
    None
        The result is only printed.
    """
    classifier = torch.load(path)
    sent = preprocess(sent)
    labels, raw_outputs = classifier.predict([sent])

    # Softmax over the raw outputs of the single input sentence.
    scores = raw_outputs[0]
    normaliser = sum([np.exp(val) for val in scores])
    c1 = np.exp(scores[0]) / normaliser
    c2 = np.exp(scores[1]) / normaliser

    if labels[0] == 1:
        result = 'This tweet has a verifiable claim.'
        cscore = c2*100
    else:
        result = 'This tweet does not have a verifiable claim.'
        cscore = c1*100

    print(sent, ' : ', result)
    print('The model says this with a',round(cscore, 2), '% confidence score.')

In [None]:
# Predict on an English tweet typed by the user (blocks on input()).
# NOTE(review): no visible cell saves a model to path_en; it must already
# exist on disk for this to run.
sent = input()
predict(path_en, sent)

In [None]:
# Predict on another English tweet typed by the user (blocks on input()).
# This cell repeats the preceding English prediction cell unchanged.
sent = input()
predict(path_en, sent)

In [None]:
# Predict on a Bengali tweet typed by the user (blocks on input()).
# NOTE(review): no visible cell saves a model to path_bn; it must already
# exist on disk for this to run.
sent = input()
predict(path_bn, sent)

In [None]:
# Predict on a Hindi tweet typed by the user (blocks on input()).
# The original example came from BBC News Hindi.
# NOTE(review): no visible cell saves a model to path_hi; it must already
# exist on disk for this to run.
sent = input()
predict(path_hi, sent)

In [None]:
# Predict with the multilingual model on a tweet typed by the user (blocks
# on input()). The original example came from the DW Bangla account.
# path_multi is written by torch.save in the cross-validation cell.
sent = input()
predict(path_multi, sent)