# MediaEval 2022 - Task 3 (text + user-graph fusion)

In [None]:
%%javascript
// Adds an empty <div id="toc"> pinned to the left edge of the page (120px from the top).
$('<div id="toc"></div>').css({position: 'fixed', top: '120px', left: 0}).appendTo(document.body);
// NOTE(review): this downloads and executes a script from a third-party site every
// time the cell runs; presumably it fills the #toc div - confirm and consider vendoring it.
$.getScript('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js');


# Imports

In [None]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0" # the GPU on robinson

In [None]:
import numpy as np
import pandas as pd
from collections import deque
import random
import copy

import torch
import sklearn.metrics as metrics
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertForPreTraining, BertModel, AutoTokenizer, BertForSequenceClassification, RobertaForSequenceClassification
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.optim import AdamW
from gensim.models import KeyedVectors


from tqdm.notebook import tqdm, trange

import emoji
from nltk.corpus import stopwords

# Seed every random number generator used in this notebook (torch, random, numpy)
# with the same value so runs are repeatable.
random_seed = 0
torch.manual_seed(random_seed)
random.seed(random_seed)
np.random.seed(random_seed)

# Names of the nine conspiracy categories.
# NOTE(review): presumably listed in the same order as the nine label columns
# read with iloc[:, 1:10] in the data-loading cell - confirm against the CSV header.
conspiracies = ['Suppressed Cures',
     'Behaviour and Mind Control',
     'Antivax',
     'Fake virus',
     'Intentional Pandemic',
     'Harmful Radiation/ Influence',
     'Population reduction',
     'New World Order',
     'Satanism']

# Flags

In [None]:
# Tag inserted into the file name of the checkpoint saved after training.
model_name = 'twitter'

# Text preprocessing switches; each one enables the helper of the same name
# in the data-loading cell.
replace_lowercase_flag = False
remove_stopwords_flag = False
remove_hashtags_flag = True
replace_emojis_flag = True
clean_tweets_flag = False

# When True, training tweets and labels are read from task_3_dev.csv
# instead of from the training folds.
all_data = False

# NOTE(review): not referenced anywhere in the visible notebook.
classification = True

# Index of the cross-validation fold that is left out of the training set;
# it is also written into the checkpoint file name.
k=4

# Utils

In [None]:
def clean_tweets(tweets):
    """Normalise raw tweet strings.

    For every tweet: newlines and non-breaking spaces are deleted, the HTML
    entity ``&amp;`` becomes ``&``, and every known spelling of the
    coronavirus name is replaced by the single token ``virus``.

    Args:
        tweets: iterable of tweet strings.

    Returns:
        A new list with the cleaned tweets, in the same order.
    """
    char_to_remove = ['\n', '\xa0']
    # Order matters: a longer spelling must come before any spelling it
    # contains (e.g. 'sarscov2' before 'sars', 'Coronavirus' before 'Corona'),
    # otherwise the shorter one would be replaced first and leave a fragment.
    corona_synonyms = ['coronavirus',
                      'covid-19',
                      'covid19',
                      'covid 19',
                      'covid',
                      'corona',
                      'sarscov2',  # the comma was missing here, fusing this entry with 'sars'
                      'sars',
                      'Coronaviruses',
                      'Coronavirus',
                      'Corona',
                      'Covid19',
                      'COVID19',
                      'Covid-19',
                      'COVID-19',
                      'COVID 19',
                      'Covid',
                      'COVID',
                      'SARSCOV2',
                      'SARS']

    tweets_clean = []
    for tw in tweets:
        for c in char_to_remove:
            tw = tw.replace(c, '')
        tw = tw.replace('&amp;', '&')

        # str.replace is a no-op when the synonym is absent, so no membership test is needed.
        for syn in corona_synonyms:
            tw = tw.replace(syn, 'virus')
        tweets_clean.append(tw)
    return tweets_clean

def extract_hashtags(tweet):
    """Return the hashtags of a tweet as a list.

    The tweet is split on single spaces only; every non-empty piece whose
    first character is '#' is kept, in order of appearance.
    """
    return [token for token in tweet.split(' ') if token and token[0] == '#']

def extract_emojis(tw):
    """Return the emojis found in a tweet, in order of appearance.

    The tweet is scanned from its end towards its start. At each position the
    longest substring of at most ``max_l`` characters that ends there and is a
    key of ``emoji.UNICODE_EMOJI`` is taken as one emoji, because several code
    points can combine into a single emoji. The scan then continues to the
    left of that match.

    Tweets shorter than ``max_l`` characters (including the empty string) are
    handled; the earlier sliding-window version raised IndexError on them.

    Args:
        tw: the tweet text.

    Returns:
        List of emoji strings, left to right as they occur in ``tw``.
    """
    max_l = 7  # longest emoji sequence, in characters, that is looked for

    emojis = []
    end = len(tw)  # exclusive end index of the part of the tweet still to scan
    while end > 0:
        # Try the longest candidate first so combined emojis win over their parts.
        for size in range(min(max_l, end), 0, -1):
            candidate = tw[end - size:end]
            if candidate in emoji.UNICODE_EMOJI:
                emojis.append(candidate)
                end -= size  # skip every character of the matched emoji
                break
        else:
            end -= 1  # nothing ends here; move one character to the left

    # Matches were collected right to left; restore reading order.
    emojis.reverse()
    return emojis

In [None]:
def to_lowercase(tweets):
    """Return a new list holding the lower-cased version of every tweet."""
    return [tweet.lower() for tweet in tweets]

def remove_stopwords(tweets):
    """Drop English stop words from every tweet.

    Each tweet is split on single spaces, pieces that are in NLTK's English
    stop-word list are removed (exact, case-sensitive match), and the
    remaining pieces are joined back with single spaces.
    """
    english_stopwords = set(stopwords.words('english'))

    def _strip(tweet):
        kept = [token for token in tweet.split(' ') if token not in english_stopwords]
        return ' '.join(kept)

    return [_strip(tweet) for tweet in tweets]

def remove_hashtags(tweets):
    """Delete every '#' character from each tweet; the tag words themselves are kept."""
    stripped = []
    for tweet in tweets:
        stripped.append(tweet.replace('#', ''))
    return stripped

def replace_emojis(tweets):
    """Replace every emoji in each tweet by its textual name.

    The name is the value stored in ``emoji.UNICODE_EMOJI`` with underscores
    turned into spaces and colons removed.

    Emojis are substituted longest first. A combined emoji can contain a
    shorter emoji as a substring; replacing the short one first would destroy
    the combined sequence and leave stray code points in the text.

    Args:
        tweets: iterable of tweet strings.

    Returns:
        A new list of tweets with emojis spelled out.
    """
    tweets_no_emojis = []
    for tw in tweets:
        # dict.fromkeys de-duplicates while keeping first-seen order; the stable
        # sort then only moves longer sequences ahead of shorter ones.
        found = sorted(dict.fromkeys(extract_emojis(tw)), key=len, reverse=True)
        for e in found:
            e_text = emoji.UNICODE_EMOJI[e].replace('_',' ').replace(':', '')
            tw = tw.replace(e, e_text)
        tweets_no_emojis.append(tw)

    return tweets_no_emojis
    
        

# Load Data

In [None]:
# Folder holding the fold CSVs and the dev/test files.
data_path = '../../../mediaeval22/'
# os.listdir returns entries in arbitrary order; sorting keeps "fold k"
# pointing at the same file on every run and on every machine.
filelist = sorted(os.listdir(data_path))


df_list = [pd.read_csv(data_path+file) for file in filelist if 'fold' in file]


# Fold k is left out of the training frame.
train_df = pd.concat(df_list[:k]+df_list[k+1:])

# The test split comes from its own file.
test_df = pd.read_csv(data_path+'task_3_test.csv')


tw_train = train_df['tweet_text'].tolist()
tw_test = test_df['tweet_text'].tolist()
ids_test = test_df['tweet_id'].tolist()


if all_data:
    # Train on the complete dev file instead of on the folds.
    # NOTE(review): the text column is called 'tweet' here but 'tweet_text' in the
    # fold/test files - confirm against the CSV header.
    # NOTE(review): user_ids_train below is still taken from train_df, so its length
    # will not match tw_train in this mode - verify before enabling all_data.
    df = pd.read_csv(data_path+'task_3_dev.csv')
    tw_train = df['tweet'].tolist()  # plain list, like the fold-based branch
    labels_train = df.iloc[:,1:10].values.tolist()

# Optional text preprocessing, applied identically to train and test tweets.
if clean_tweets_flag:
    tw_train = clean_tweets(tw_train)
    tw_test = clean_tweets(tw_test)

if replace_lowercase_flag:
    tw_train = to_lowercase(tw_train)
    tw_test = to_lowercase(tw_test)

if remove_stopwords_flag:
    tw_train = remove_stopwords(tw_train)
    tw_test = remove_stopwords(tw_test)

if remove_hashtags_flag:
    tw_train = remove_hashtags(tw_train)
    tw_test = remove_hashtags(tw_test)

if replace_emojis_flag:
    tw_train = replace_emojis(tw_train)
    tw_test = replace_emojis(tw_test)


# Columns 1..9 hold one label per conspiracy category.
if not all_data:
    labels_train = train_df.iloc[:,1:10].values.tolist()

# Shift every label down by one; the submission cell adds the one back.
# NOTE(review): presumably the files store labels as 1..3 - confirm.
labels_train = [[l-1 for l in L] for L in labels_train]

user_ids_train = train_df['user_id'].tolist()
user_ids_test = test_df['user_id'].tolist()

# `device` is needed below but the dedicated device cell only runs later in the
# notebook; define it here too so that Restart & Run All does not fail.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# For each of the 9 conspiracies: number of training tweets whose label is > 0.
weights_tmp = [sum(1 for labels in labels_train if labels[i] > 0) for i in range(0, 9)]

# Inverse frequency of non-zero labels per conspiracy; the training loop
# multiplies each conspiracy's loss by its entry.
weights_inter_conspiracies = [len(labels_train)/w for w in weights_tmp]
weights_inter_conspiracies = torch.FloatTensor(weights_inter_conspiracies).to(device)

# For each conspiracy: inverse frequency of each of the 3 classes. Row i is
# passed as the class weights of the i-th CrossEntropyLoss.
weights_intra_conspiracy = [[len(l)/l.count(j) for j in range(0, 3)] for l in [[row[i] for row in labels_train] for i in range(0, 9)]]
weights_intra_conspiracy = torch.FloatTensor(weights_intra_conspiracy).to(device)

weights_inter_conspiracies, weights_intra_conspiracy 


In [None]:
# Class counts over all conspiracies pooled together.
W = np.array(labels_train).flatten().tolist()
weights_intra = [W.count(0), W.count(1), W.count(2)]
# Pooled inverse class frequency, placed on whatever device the
# per-conspiracy weights live on instead of a hard-coded .cuda().
weights_intra = sum(weights_intra)/torch.FloatTensor(weights_intra).to(weights_intra_conspiracy.device)
# NOTE(review): the pooled value above is overwritten right away by the mean
# of the per-conspiracy class weights; only this last value is displayed.
weights_intra = weights_intra_conspiracy.mean(dim=0)
weights_intra

In [None]:
tokenizer = AutoTokenizer.from_pretrained('digitalepidemiologylab/covid-twitter-bert')

# Tokenise without padding or truncation, only to measure the tweets.
tokenized_input = tokenizer(tw_train)

# Length in tokens of the longest training tweet (0 when there are none).
m = max((len(tokens) for tokens in tokenized_input['input_ids']), default=0)
m

In [None]:
MAX_LEN = 128 # < m some tweets will be truncated

# Pad or truncate every tweet to exactly MAX_LEN tokens.
tokenized_input = tokenizer(tw_train, max_length=MAX_LEN, padding='max_length', truncation=True)
tokenized_test = tokenizer(tw_test, max_length=MAX_LEN, padding='max_length', truncation=True)


train_input_ids, train_token_type_ids, train_attention_mask = tokenized_input['input_ids'], tokenized_input['token_type_ids'], tokenized_input['attention_mask']
test_input_ids, test_token_type_ids, test_attention_mask = tokenized_test['input_ids'], tokenized_test['token_type_ids'], tokenized_test['attention_mask']

train_token_type_ids = torch.tensor(train_token_type_ids)
test_token_type_ids = torch.tensor(test_token_type_ids)


train_labels = labels_train


# Convert to torch tensor
train_input_ids = torch.tensor(train_input_ids)
train_labels = torch.tensor(train_labels)
train_attention_mask = torch.tensor(train_attention_mask)
# User ids are identifiers, not quantities: keep them as 64-bit integers.
# torch.Tensor(...) would cast them to float32, which cannot represent
# integers above 2**24 exactly, and the fusion model later turns each id back
# into a string key with str(int(uid)).
user_ids_train = torch.tensor(user_ids_train, dtype=torch.long)

test_input_ids = torch.tensor(test_input_ids)
test_attention_mask = torch.tensor(test_attention_mask)
test_ids = torch.tensor(ids_test)
user_ids_test = torch.tensor(user_ids_test, dtype=torch.long)


In [None]:
batch_size = 12 #

# The order of the tensors inside each dataset is the order in which the
# training / inference loops unpack a batch.
train_data = TensorDataset(
    train_input_ids,
    train_attention_mask,
    train_labels,
    train_token_type_ids,
    user_ids_train,
)
test_data = TensorDataset(
    test_input_ids,
    test_attention_mask,
    test_token_type_ids,
    test_ids,
    user_ids_test,
)

# Training batches are drawn in random order, test batches in dataset order.
train_sampler = RandomSampler(train_data)
test_sampler = SequentialSampler(test_data)

train_dataloader = DataLoader(dataset=train_data, batch_size=batch_size, sampler=train_sampler)
test_dataloader = DataLoader(dataset=test_data, batch_size=batch_size, sampler=test_sampler)

In [None]:
# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
# Show which device is used; get_device_name(0) raises when no GPU is present.
torch.cuda.get_device_name(0) if n_gpu > 0 else "cpu"

# Models

In [None]:
class CovidTwitterBertClassifier(nn.Module):
    """COVID-Twitter-BERT v2 with its sequence-relationship head replaced by
    a linear layer producing ``n_classes`` logits."""

    def __init__(self, n_classes):
        """Load the pre-trained weights and install a fresh output layer.

        Args:
            n_classes: number of logits produced per input sequence.
        """
        super().__init__()
        self.n_classes = n_classes

        pretrained = BertForPreTraining.from_pretrained('digitalepidemiologylab/covid-twitter-bert-v2')
        # Swap the original head for a new layer: 1024 input features, n_classes outputs.
        pretrained.cls.seq_relationship = nn.Linear(1024, n_classes)
        self.bert = pretrained

        # Not applied in forward(); the model returns raw logits.
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, token_type_ids, input_mask):
        """Return the logits of the replaced head for a batch.

        ``input_mask`` is passed to BERT as its attention mask. The logits are
        the second element of the BERT output.
        """
        bert_out = self.bert(input_ids = input_ids,
                             token_type_ids = token_type_ids,
                             attention_mask = input_mask)
        return bert_out[1]


In [None]:
!ls /data/peskine/mediaeval22/models/

In [None]:
class LearnedFeaturesFusion(nn.Module):
    """Fuses the text classifier's logits with a user-graph embedding.

    The 27 logits of a trained ``CovidTwitterBertClassifier`` (9 conspiracies
    x 3 classes) are concatenated with the 32-dimensional vector stored for
    the tweet's author in a word2vec-format ``KeyedVectors`` file, and a small
    MLP maps the 59 features back to 27 logits.
    """

    def __init__(self,
                 text_checkpoint='../../../mediaeval22/models/task1_twitter_CV4_e24_0.725.pth',
                 graph_vectors_path="../../../mediaeval22/user_graph_w2v_d32_model.bin"):
        """Build the fusion model.

        Args:
            text_checkpoint: path of the state dict of the text classifier.
            graph_vectors_path: path of the user embeddings in word2vec format.
        """
        super().__init__()

        self.text_model = CovidTwitterBertClassifier(9*3)
        # map_location='cpu' lets a checkpoint saved on a GPU be loaded on any
        # machine; the whole model is moved to its target device afterwards.
        self.text_model.load_state_dict(torch.load(text_checkpoint, map_location='cpu'))

        # Not an nn.Module: the embeddings are a fixed lookup table, are not
        # part of the state dict and receive no gradient.
        # NOTE(review): loaded with the default binary=False although the file
        # is named *.bin - confirm the file really is in text format.
        self.graph_model = KeyedVectors.load_word2vec_format(graph_vectors_path)

        self.classifier = nn.Sequential(
            nn.Linear(32+27, 32),
            nn.Dropout(p=0.1),
            nn.ReLU(),
            nn.Linear(32, 32),
            nn.Dropout(p=0.1),
            nn.ReLU(),
            nn.Linear(32, 27)
        )

    def forward(self, input_ids, token_type_ids, input_mask, user_ids):
        """Return the 27 fused logits for a batch.

        Args:
            input_ids, token_type_ids, input_mask: tokenised tweets, passed
                unchanged to the text model.
            user_ids: 1-D tensor with one numeric user id per tweet; each id
                is converted to ``str(int(id))`` to look up its embedding.

        Raises:
            KeyError: if a user id has no vector in the embedding file.
        """
        features_text = self.text_model(input_ids, token_type_ids, input_mask)

        user_keys = [str(int(uid)) for uid in user_ids.tolist()]
        # Put the embeddings next to the text features instead of relying on a
        # notebook-level `device` variable.
        features_graph = torch.tensor(self.graph_model[user_keys],
                                      dtype=features_text.dtype,
                                      device=features_text.device)
        features = torch.cat([features_text, features_graph], dim=1)

        logits = self.classifier(features)

        return logits
    

In [None]:
model = LearnedFeaturesFusion()

# Move to the device chosen in the device cell (GPU when available, CPU otherwise)
# rather than unconditionally calling .cuda().
model.to(device)

In [None]:
# Freeze every parameter of the model, then re-enable gradients for the
# classification head only, so the head is the only part that is trained.
model.requires_grad_(False)
model.classifier.requires_grad_(True)

In [None]:
# AdamW over all model parameters.
optimizer = AdamW(params=model.parameters(), lr=5e-3, weight_decay=0.01)

# Multiply the learning rate by 0.3 once the monitored value (the evaluation
# loss passed to scheduler.step) has not decreased for more than 4 epochs.
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.3, patience=4)

In [None]:
# One cross-entropy loss per conspiracy, each weighted with that conspiracy's
# own class weights (row i of weights_intra_conspiracy).
criterions = [nn.CrossEntropyLoss(weight = weights_intra_conspiracy[i]) for i in range(0, 9)]
    

# Training

In [None]:
epochs = 25

threshold = 0.5

# Best-so-far bookkeeping; the model with the highest mean validation MCC is kept.
best_MCCA = 0
best_F1 = 0
best_MCCs = []
best_MCCNC = 0
best_loss = 999
best_acc = 0
best_state_dict = model.state_dict()
best_epoch = 0
best_MCCs_task2 = 0
best_MCCA_task2 = 0
best_MCC_task1 = 0


# --- Validation data ---------------------------------------------------------
# Model selection needs labels, but test_dataloader is built from the unlabeled
# test file. Fold k is the fold that was left out of train_df, so it is used as
# the validation set and preprocessed exactly like the training tweets.
def _preprocess_like_training(tweets):
    """Apply the flag-controlled text preprocessing in the data-loading order."""
    if clean_tweets_flag:
        tweets = clean_tweets(tweets)
    if replace_lowercase_flag:
        tweets = to_lowercase(tweets)
    if remove_stopwords_flag:
        tweets = remove_stopwords(tweets)
    if remove_hashtags_flag:
        tweets = remove_hashtags(tweets)
    if replace_emojis_flag:
        tweets = replace_emojis(tweets)
    return tweets


val_df = df_list[k]
tw_val = _preprocess_like_training(val_df['tweet_text'].tolist())
# Same label columns and the same "-1" shift as for the training labels.
labels_val = [[label-1 for label in row] for row in val_df.iloc[:,1:10].values.tolist()]
tokenized_val = tokenizer(tw_val, max_length=MAX_LEN, padding='max_length', truncation=True)

# Same tensor order as train_data so both loops unpack batches identically.
val_data = TensorDataset(
    torch.tensor(tokenized_val['input_ids']),
    torch.tensor(tokenized_val['attention_mask']),
    torch.tensor(labels_val),
    torch.tensor(tokenized_val['token_type_ids']),
    torch.tensor(val_df['user_id'].tolist(), dtype=torch.long),
)
val_dataloader = DataLoader(val_data, sampler=SequentialSampler(val_data), batch_size=batch_size)


def _fusion_loss(logits, labels):
    """Sum over the 9 conspiracies of the class-weighted cross-entropy on that
    conspiracy's 3 logits, each scaled by its inter-conspiracy weight."""
    return sum(
        criterions[i](logits[:, 3*i:3*i+3], labels[:, i].long()) * weights_inter_conspiracies[i]
        for i in range(0, 9)
    )


for e in trange(0, epochs, position=0, leave=True):

    # Training
    print("Starting epoch ", e)

    model.train()

    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0

    for step, batch in enumerate(train_dataloader):

        batch = tuple(t.to(device) for t in batch)

        b_input_ids, b_input_mask, b_labels, b_token_type_ids, b_user_ids = batch

        optimizer.zero_grad()

        logits = model(b_input_ids, b_token_type_ids, b_input_mask, b_user_ids)

        loss = _fusion_loss(logits, b_labels)
        loss.backward()
        optimizer.step()

        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

    print("Train loss: {}".format(tr_loss/nb_tr_steps))


    # Validation on the held-out fold

    model.eval()

    tweets_test = []

    # One list of predictions / labels per conspiracy.
    predictions_sep = [[], [], [], [], [], [], [], [], []]
    predictions_task1 = []

    labels_sep = [[], [], [], [], [], [], [], [], []]
    labels_task1 = []

    eval_loss = 0
    steps=0
    for step, batch in enumerate(val_dataloader):

        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)

        b_input_ids, b_input_mask, b_labels, b_token_type_ids, b_user_ids = batch

        with torch.no_grad():
            logits = model(b_input_ids, b_token_type_ids, b_input_mask, b_user_ids)
            loss = _fusion_loss(logits, b_labels)

        logits = logits.cpu().numpy()
        ground_truth = b_labels.cpu().numpy()

        steps+=1
        eval_loss+=loss.item()

        tweets_test.append(b_input_ids)

        # (batch, 27) -> (batch, 9, 3): predicted class of every conspiracy.
        batch_preds = logits.reshape(len(logits), 9, 3).argmax(axis=2)

        # Tweet-level value: the highest class over the 9 conspiracies.
        predictions_task1.extend(batch_preds.max(axis=1))
        labels_task1.extend(ground_truth.max(axis=1))

        for i in range(0, 9):
            predictions_sep[i].extend(batch_preds[:, i])
            labels_sep[i].extend(ground_truth[:, i])

    MCCs = []
    for i in range(0, 9):
        MCCs.append(round(metrics.matthews_corrcoef(labels_sep[i], predictions_sep[i]), 3))

    # This value is printed below but was never computed before (NameError).
    MCC_task1 = round(metrics.matthews_corrcoef(labels_task1, predictions_task1), 3)

    scheduler.step(eval_loss/steps)
    LOSS = eval_loss/steps
    MCCA = np.mean(MCCs)

    if MCCA> best_MCCA:
        best_MCCA = MCCA
        best_loss = LOSS
        best_MCCs = MCCs
        best_MCC_task1 = MCC_task1
        # Deep copy: state_dict() returns references to the live weights.
        best_state_dict = copy.deepcopy(model.state_dict())
        best_epoch = e

    print("\t Eval loss: {}".format(LOSS))
    print("\t Eval MCC for task 1: {}".format(MCC_task1))
    print("\t Eval MCCA: {}".format(MCCA))
    print("\t Eval MCCs: {}".format(MCCs))
    print("---"*25)
    print("\n")

In [None]:
best_MCCA, best_epoch

In [None]:
# File name encodes the model tag, the held-out fold, the best epoch and the
# best mean MCC rounded to 3 decimals.
best_model_path = f'../../../mediaeval22/models/task3_{model_name}_CV{k}_e{best_epoch}_{round(best_MCCA, 3)}.pth'
torch.save(best_state_dict, best_model_path)


# Inference

In [None]:
!ls ../../../mediaeval22/models/

In [None]:
# map_location loads the checkpoint straight onto the device in use, so a file
# saved from a GPU run can also be loaded on a CPU-only machine.
model.load_state_dict(torch.load('../../../mediaeval22/models/task3_twitter_CV4_e16_0.705.pth', map_location=device))
model.eval()

In [None]:
# Run the model over the unlabeled test set and collect, for every tweet, the
# predicted class of each of the 9 conspiracies.
model.eval()

tweets_test = []

predictions_sep = [[], [], [], [], [], [], [], [], []]
predictions_task1 = []

# No ground truth exists for the test set: these stay empty / zero.
labels_sep = [[], [], [], [], [], [], [], [], []]
labels_task1 = []

eval_loss = 0
steps=0
for step, batch in enumerate(test_dataloader):

    # Move the batch to the model's device; order matches test_data.
    b_input_ids, b_input_mask, b_token_type_ids, ids, b_user_ids = tuple(t.to(device) for t in batch)

    with torch.no_grad():
        logits = model(b_input_ids, b_token_type_ids, b_input_mask, b_user_ids)

    logits = logits.detach().cpu().numpy()

    steps+=1

    tweets_test.append(b_input_ids)

    # (batch, 27) -> (batch, 9, 3): argmax over the 3 logits of each conspiracy.
    batch_preds = logits.reshape(len(logits), 9, 3).argmax(axis=2)

    # Tweet-level value: the highest predicted class over all conspiracies.
    predictions_task1.extend(batch_preds.max(axis=1))

    for i in range(0, 9):
        predictions_sep[i].extend(batch_preds[:, i])


In [None]:
# Submission table: tweet id in column '-1', then one column per conspiracy
# (0..8) with the predicted class shifted back up by one.
submission_columns = {'-1': test_df['tweet_id'].tolist()}
for i in range(0, 9):
    submission_columns[i] = [j+1 for j in predictions_sep[i]]

sub_df = pd.DataFrame(submission_columns)

sub_df