# MediaEval 2022 - Task 2

In [None]:
%%javascript
// Add an empty <div id="toc"> pinned to the left edge of the page, 120px from the top.
$('<div id="toc"></div>').css({position: 'fixed', top: '120px', left: 0}).appendTo(document.body);
// Fetch and run the table-of-contents script from the external URL below.
// NOTE(review): this executes third-party JavaScript every time the cell runs and
// relies on a global `$` offering css/appendTo/getScript (presumably jQuery) --
// confirm both are acceptable/available in the Jupyter front end being used.
$.getScript('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js');


# Imports

In [None]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="1" # the GPU on robinson


In [None]:

import numpy as np
import pandas as pd
from collections import deque
import random
import copy

import torch
import sklearn.metrics as metrics
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertForPreTraining, BertModel, AdamW, AutoTokenizer, BertForSequenceClassification, RobertaForSequenceClassification
from torch.optim.lr_scheduler import ReduceLROnPlateau

from sentence_transformers import SentenceTransformer, util

from tqdm.notebook import tqdm, trange
from sklearn.manifold import TSNE

from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis


import emoji
from nltk.corpus import stopwords

random_seed = 0
torch.manual_seed(random_seed)
random.seed(random_seed)
np.random.seed(random_seed)

import networkx as nx

from gensim.models.word2vec import Word2Vec
from gensim.models import KeyedVectors
import nodevectors

conspiracies = ['Suppressed Cures',
     'Behaviour and Mind Control',
     'Antivax',
     'Fake virus',
     'Intentional Pandemic',
     'Harmful Radiation/ Influence',
     'Population reduction',
     'New World Order',
     'Satanism']

# Data

In [None]:
#fold
k = 4

In [None]:
!ls /data/peskine/mediaeval22

In [None]:
data_path = '../../../mediaeval22/'
# os.listdir() returns entries in arbitrary, filesystem-dependent order.
# Sort them so that fold index k refers to the same file on every run/machine.
filelist = sorted(os.listdir(data_path))

# One DataFrame per cross-validation file (any file whose name contains 'fold').
df_list = [pd.read_csv(data_path+file) for file in filelist if 'fold' in file]

# A negative k would silently build overlapping train/test sets with the slicing
# below, and a too-large k would only fail with a bare IndexError.
if not 0 <= k < len(df_list):
    raise IndexError('fold index k=' + str(k) + ' is out of range for ' + str(len(df_list)) + ' fold files')

# Fold k is held out for evaluation; every other fold is used for training.
test_df = df_list[k]
#test_df = pd.read_csv('/data/peskine/mediaeval22/task_2_test.csv')

train_df = pd.concat(df_list[:k]+df_list[k+1:])

In [None]:
# User ids are identifiers, not quantities: keep them as exact 64-bit integers.
# torch.Tensor(...) would cast them to float32, which cannot represent every
# integer above 2**24, so a large id could come back as a *different* id when it
# is later converted with int(i.item()) to look up its embedding.
train_x = torch.tensor(train_df['user_id'].tolist(), dtype=torch.long)
# Labels are shifted down by one and kept as floats, the target type nn.BCELoss expects.
train_y = torch.Tensor(train_df['user_class_label'].tolist())-1

test_x = torch.tensor(test_df['user_id'].tolist(), dtype=torch.long)
#test_y = torch.Tensor(test_df['user_class_label'].tolist())-1

In [None]:
batch_size = 512
train_data = TensorDataset(train_x, train_y)

# The training cell unpacks `x, y` from test_dataloader to compute the validation
# loss and MCC, so the evaluation dataset must carry labels whenever the held-out
# frame has them (cross-validation folds). For an unlabelled test file the dataset
# only holds the ids; the inference cell reads them as batch[0], which works in
# both cases.
if 'user_class_label' in test_df.columns:
    test_y = torch.Tensor(test_df['user_class_label'].tolist())-1
    test_data = TensorDataset(test_x, test_y)
else:
    test_data = TensorDataset(test_x)


# Training batches are drawn in random order ...
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# ... evaluation batches keep the row order of test_df so predictions line up with it.
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

# Graph creation

In [None]:
path='../../../mediaeval22/'
# Load the three task CSVs that live side by side in the data directory.
user_graph, user_info, task_2_dev = (
    pd.read_csv(path+csv_name)
    for csv_name in ('user_graph.csv', 'user_info.csv', 'task_2_dev.csv')
)

In [None]:
user_info

In [None]:
user_graph

In [None]:
# `user_df` is only created much further down (Visualization section), so on a
# fresh top-to-bottom run it does not exist yet at this point. Build the table of
# labelled users from the task 2 dev set loaded above instead.
# NOTE(review): assumes task_2_dev.csv has the same `user_id` /
# `user_class_label` columns as the fold files -- confirm.
user_df = task_2_dev[['user_id', 'user_class_label']].drop_duplicates()

users_ids = user_df['user_id'].tolist()
users_labels = user_df['user_class_label'].tolist()

# Index in this list == value of user_class_label (index 0 is unused).
classes = ['', 'Normal User', 'Misinfo Spreader']

In [None]:
user_df

In [None]:
len(users_ids), len(users_labels)

In [None]:
# Edges whose source (column 'i') / target (column 'j') is one of the labelled users.
tmp_dfi, tmp_dfj = (
    user_graph.loc[user_graph[endpoint].isin(users_ids)]
    for endpoint in ('i', 'j')
)

# Union of both selections; edges matching on both endpoints are kept once.
tmp_df = pd.concat([tmp_dfi, tmp_dfj]).drop_duplicates()
tmp_df


In [None]:
# create networkx graph
# this may take a while, don't forget to save the results
G = nx.DiGraph()

# Every row of user_graph is read positionally as (source, target, weight).
# itertuples() keeps each column's own dtype, whereas user_graph.iloc[row] builds
# one Series per row and upcasts integer node ids to float as soon as the weight
# column is float (node 123 would become 123.0). It is also far faster than
# row-by-row .iloc access.
edge_rows = user_graph.itertuples(index=False, name=None)
G.add_weighted_edges_from(tqdm(edge_rows, total=len(user_graph)), weight='weight')
    

In [None]:
!ls /data/peskine/mediaeval22

In [None]:
G = nx.read_gexf('../../../mediaeval22/user_graph.gexf')

In [None]:
len(G.nodes()), len(G.edges())

# Random Walks

In [None]:
from nodevectors import Node2Vec

In [None]:
# Random-walk settings.
# NOTE(review): in the cells visible in this notebook none of these five names is
# passed to Node2Vec(...), which is built from literal arguments, and `l` is
# rebound to a list in later cells -- confirm whether they are still needed.
graph = "mediaeval22"  # dataset tag
r = 10  # presumably the number of walks per node -- TODO confirm
l = 40  # presumably the walk length -- TODO confirm (Node2Vec is created with walklen=10)
p = 1  # presumably the node2vec return parameter -- TODO confirm
q = 0.5  # presumably the node2vec in-out parameter -- TODO confirm

In [None]:
# Node2Vec embedder from `nodevectors`.
# n_components=32 is the embedding size; the MLP defined later in this notebook
# takes 32 input features, so the two have to stay in sync.
# neighbor_weight / walklen are random-walk settings of the library (see the
# nodevectors documentation for their exact meaning).
g2v = Node2Vec(
    n_components=32,
    neighbor_weight=2,
    walklen=10
)


In [None]:
# creating random walks and train word2vec model.
# this may take a while
g2v.fit(G)

In [None]:
# The embedder is built with n_components=32; name the file accordingly instead of
# the stale 'd1024' tag so it cannot be mistaken for a 1024-dimensional model.
g2v.save('../../../mediaeval22/user_graph_w2v_d32_model')

In [None]:
# Write the vectors to the same file the next cell loads ('..._d32_model.bin').
# The previous 'd1024' name meant a fresh run loaded an older/missing file rather
# than the 32-dimensional embeddings that were just trained.
g2v.save_vectors("../../../mediaeval22/user_graph_w2v_d32_model.bin")

In [None]:
from gensim.models import KeyedVectors
model = KeyedVectors.load_word2vec_format("../../../mediaeval22/user_graph_w2v_d32_model.bin")

# Visualization

The cells below visualize the node embeddings in two dimensions using t-SNE.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
df = pd.read_csv('../../../mediaeval22/task_3_dev.csv')


In [None]:
# One row per (user, label) pair.
tmp_df = df[['user_id', 'user_class_label']]
user_df = tmp_df.drop_duplicates()

# Label 2 = misinfo spreader, label 1 = normal user.
user_label = user_df['user_class_label']
misinfo_users = user_df.loc[user_label == 2, 'user_id'].tolist()
normal_users = user_df.loc[user_label == 1, 'user_id'].tolist()

# d: user id -> embedding; X / y / user_ids: parallel lists, misinfo users first.
d, X, y, user_ids = {}, [], [], []

for group, group_name in ((misinfo_users, "Misinfo Spreader"), (normal_users, "Normal User")):
    for u in group:
        # embeddings are keyed by the user id as a string
        embedding = model[str(u)]
        d[u] = embedding
        X.append(embedding)
        y.append(group_name)
        user_ids.append(u)

In [None]:
# Pin the t-SNE seed: with random_state=None the layout depends on how much of the
# global NumPy random state earlier cells consumed, so re-running cells in a
# different order (or re-fitting the embeddings) would change the picture.
tsne = TSNE(n_components=2, random_state=random_seed)
# Stack the per-user vectors into one (n_users, n_features) array for scikit-learn.
tsne_result = tsne.fit_transform(np.asarray(X))

In [None]:
plt.rcParams['figure.figsize'] = [18, 18]
plt.rcParams.update({'font.size': 22})

In [None]:
len(misinfo_users)

In [None]:
# X was filled with all misinfo users first, so the first len(misinfo_users) rows of
# the projection belong to them and the remaining rows to the normal users.
misinfo_points, normal_points = np.split(tsne_result, [len(misinfo_users)])
misinfo_mean = misinfo_points.mean(axis=0)
normal_mean = normal_points.mean(axis=0)


In [None]:
# Two-row frame holding the centroid of each class in t-SNE space.
class_means = {"Misinfo mean": misinfo_mean, "Normal mean": normal_mean}
mean_df = pd.DataFrame({
    'tsne_1': [centroid[0] for centroid in class_means.values()],
    'tsne_2': [centroid[1] for centroid in class_means.values()],
    'label': list(class_means),
})
mean_df

In [None]:
fig, ax = plt.subplots(1)

tsne_result_df = pd.DataFrame(dict(tsne_1=tsne_result[:, 0], tsne_2=tsne_result[:, 1], label=y))

# Both layers share the same axes and the same column mapping.
shared_args = dict(x='tsne_1', y='tsne_2', hue='label', ax=ax)

# Layer 1: one dot per user.
g = sns.scatterplot(data=tsne_result_df, palette=['lightblue', 'orange'], s=120, **shared_args)
# Layer 2: one large star per class centroid, without a second legend.
g = sns.scatterplot(data=mean_df, palette=['deepskyblue', 'darkorange'], marker='*', s=3000, legend=False, **shared_args)


# Entities

Are some entities mentioned more often by misinfo spreaders than by normal users?

In [None]:
import spacy
import re

In [None]:
def normalize_text(text):
    """Clean a raw tweet before it is passed to the NLP pipeline.

    Steps: decode the ``&amp;`` HTML entity, turn non-breaking spaces into
    ordinary spaces, strip URLs (anything starting with ``http`` up to the next
    whitespace) and collapse every run of whitespace into a single space.

    Args:
        text: the tweet text. Non-string values (e.g. NaN for a missing tweet)
            are treated as an empty tweet.

    Returns:
        The cleaned text, without leading/trailing whitespace.
    """
    if not isinstance(text, str):
        return ''
    text = text.replace('&amp;', '&')
    # A non-breaking space separates words; deleting it would glue them together
    # ("vaccine\xa0hoax" -> "vaccinehoax"), so replace it with a plain space.
    text = text.replace('\xa0', ' ')
    text = re.sub(r'http\S+', '', text)
    return " ".join(text.split())
    

In [None]:
tweets = df['tweet_text'].tolist()

In [None]:
# spaCy entityLinker entities
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("entityLinker", last=True)

# entities_tweets_spacy[i] holds the (label, url) pairs found in tweets[i].
# Exactly one entry is appended per tweet -- an empty list when processing fails --
# so the list stays aligned with the rows of `df`.
entities_tweets_spacy = []
# positions (in `tweets`) of the tweets that could not be processed
ex_tweet = []
for position, raw_tweet in enumerate(tqdm(tweets)):
    linked = []
    try:
        doc = nlp(normalize_text(raw_tweet))
        linked = [(ent.label, ent.url) for ent in doc._.linkedEntities]
    except Exception:
        # Record the position directly. Looking the *normalized* text up with
        # tweets.index(...) could itself raise (the cleaned text is usually not in
        # the raw list) and returns the wrong row for duplicated tweets.
        ex_tweet.append(position)
    entities_tweets_spacy.append(linked)

if ex_tweet:
    print(len(ex_tweet), 'tweet(s) could not be processed; their positions are in ex_tweet')


In [None]:
df.iloc[0, 1:10].tolist()

In [None]:
from collections import Counter

In [None]:
# entities_tweets_spacy[i] must describe row i of df; fail with a clear message
# instead of an IndexError (or silently shifted counts) when it does not.
if len(entities_tweets_spacy) != len(df):
    raise ValueError('entities_tweets_spacy has ' + str(len(entities_tweets_spacy))
                     + ' entries but df has ' + str(len(df)) + ' rows')

# A tweet counts as "misinfo" when any of the label columns 1..9 equals 3.
# NOTE(review): presumably these are the nine conspiracy-category columns and 3 is
# the "promotes the conspiracy" class -- confirm against the dataset description.
is_misinfo_tweet = (df.iloc[:, 1:10] == 3).any(axis=1).tolist()

entities_misinfo = []
entities_normal = []
for flagged, tweet_entities in zip(is_misinfo_tweet, entities_tweets_spacy):
    entity_names = [t[0] for t in tweet_entities]
    if flagged:
        entities_misinfo.extend(entity_names)
    else:
        entities_normal.extend(entity_names)

# number of misinfo tweets (used in a later cell to normalise the counts)
c = sum(is_misinfo_tweet)

count_misinfo = Counter(entities_misinfo)
count_normal = Counter(entities_normal)

all_entities = entities_misinfo+entities_normal
count = Counter(all_entities)

In [None]:
n_misinfo = c
n_normal = len(df)-c

In [None]:
# The 100 most frequent entities over all tweets, as (entity, count) pairs.
top = count.most_common(100)
top

In [None]:
# x: entity names, y: their total counts, l: which group mentions them more.
x = [entity for entity, _ in top]
y = [occurrences for _, occurrences in top]

l = []
for entity in x:
    # mentions per tweet within each group
    share_misinfo = count_misinfo[entity]/n_misinfo
    share_normal = count_normal[entity]/n_normal
    # an entity belongs to a group when it is more than twice as frequent there
    if share_normal > 2*share_misinfo:
        l.append('Normal')
    elif share_misinfo > 2*share_normal:
        l.append('Misinfo')
    else:
        l.append('Both')
    

In [None]:
# One bar colour per entity; any label other than 'Both'/'Misinfo' gets orange.
label_colours = {'Both': 'lightgrey', 'Misinfo': 'lightblue'}
palette = [label_colours.get(lab, 'orange') for lab in l]

In [None]:
# Line2D was used below without ever being imported anywhere in the notebook.
from matplotlib.lines import Line2D

bar_df = pd.DataFrame({'entities':x, 'entities count':y, 'label':l})

# Hand-made legend: the bars are coloured individually through `palette`, so
# seaborn has no hue variable from which it could build a legend itself.
legend_elements = [Line2D([0], [0], color='lightgrey', lw=20, label='Both'),
Line2D([0], [0], color='lightblue', lw=20, label='Misinfo Spreader'),
Line2D([0], [0], color='orange', lw=20, label='Normal User')]

fig, ax = plt.subplots()
ax.legend(handles=legend_elements, loc='center')

sns.set(style="darkgrid")
sns.set(font_scale=3)
# Draw on the axes created above explicitly rather than on "the current axes".
sns.barplot(data=bar_df, x='entities count', y='entities', palette=palette, orient='h', ax=ax)
_=plt.xticks([50*i for i in range(1, 10)])

In [None]:
top[0][0]

# MLP

In [None]:
class MLP(nn.Module):
    """Small feed-forward network producing one logit per input row.

    Architecture: Flatten, then for each hidden size ``Linear -> Dropout -> ReLU``,
    then a final ``Linear`` to a single output. No sigmoid is applied here; the
    caller is expected to apply it (or use a logits-based loss).

    Args:
        in_features: size of each input vector (the node-embedding dimension).
        hidden_sizes: widths of the hidden layers, in order.
        dropout: dropout probability used after every hidden linear layer.

    With the default arguments the layer layout -- and therefore the
    ``state_dict`` keys -- is identical to the original fixed 32-16-8-1 network,
    so existing checkpoints still load.
    """

    def __init__(self, in_features=32, hidden_sizes=(16, 8), dropout=0.1):
        super().__init__()
        modules = [nn.Flatten()]
        previous_width = in_features
        for width in hidden_sizes:
            modules.extend([
                nn.Linear(previous_width, width),
                nn.Dropout(p=dropout),
                nn.ReLU(),
            ])
            previous_width = width
        modules.append(nn.Linear(previous_width, 1))
        self.layers = nn.Sequential(*modules)

    def forward(self, x):
        """Return the raw (pre-sigmoid) score, one value per row of ``x``."""
        return self.layers(x)


In [None]:
mlp = MLP()
mlp.to('cuda')

In [None]:
# Inverse-frequency class weights: the total number of training samples divided by
# the number of samples labelled 0 and by the number labelled 1, respectively.
# Raises ZeroDivisionError if one of the two labels is absent from train_y.
# NOTE(review): in the cells visible here `weights` is only displayed; the loss is
# created as nn.BCELoss() without a weight argument -- confirm whether the class
# weighting was meant to be applied.
weights = [len(train_y)/train_y.tolist().count(0), len(train_y)/train_y.tolist().count(1)]
weights = torch.Tensor(weights).to('cuda')
weights

In [None]:
criterion = nn.BCELoss()

sig = nn.Sigmoid()

In [None]:
# AdamW over all MLP parameters (learning rate 3e-4, weight decay 1e-3).
# Note: the AdamW name in scope is the one imported from `transformers` at the top
# of the notebook, not torch.optim.AdamW.
optimizer = AdamW(mlp.parameters(),
                  lr=3e-4,
                  weight_decay = 0.001)

# Multiplies the learning rate by 0.3 once the monitored value has stopped improving
# for more than 4 consecutive scheduler steps. It only acts when
# scheduler.step(<monitored value>) is called.
scheduler = ReduceLROnPlateau(optimizer, patience=4, factor=0.3)

In [None]:
epochs = 200
best_MCC = 0
best_epoch = 0
# Fallback so that the save cell still has a state dict to write even if no epoch
# ever reaches an MCC above the initial best_MCC of 0.
best_state_dict = copy.deepcopy(mlp.state_dict())


def _embed_user_batch(user_id_batch):
    """Return the node embeddings of a batch of user ids as a float tensor on the GPU."""
    # Embeddings are keyed by the user id as a string; stacking into a single array
    # first avoids the slow list-of-arrays -> tensor conversion.
    vectors = np.stack([model[str(int(user_id.item()))] for user_id in user_id_batch])
    return torch.from_numpy(vectors).float().to('cuda')


for e in trange(0, epochs, position=0, leave=True):
    train_loss = 0

    mlp.train()

    print('Starting epoch ', e)

    for x, y in train_dataloader:
        # Reset the gradients of the previous batch: backward() *adds* to .grad, so
        # without this every step would use the sum of all gradients seen so far.
        optimizer.zero_grad()

        x_features = _embed_user_batch(x)
        outputs = mlp(x_features)
        y_hat = sig(outputs).flatten()

        y = y.to('cuda')
        loss = criterion(y_hat, y)

        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    print('Train Loss = ', train_loss)

    test_loss = 0
    preds = []
    y_full = []
    mlp.eval()
    # No gradients are needed for evaluation; skipping them saves memory and time.
    with torch.no_grad():
        for x, y in test_dataloader:
            x_features = _embed_user_batch(x)
            outputs = mlp(x_features)
            y_hat = sig(outputs).flatten()

            y = y.to('cuda')
            loss = criterion(y_hat, y)

            # probability above 0.5 -> class 1
            preds.extend((y_hat>0.5).long().cpu().tolist())
            y_full.extend(y.long().cpu().tolist())
            test_loss += loss.item()

    mcc = metrics.matthews_corrcoef(preds, y_full)

    # The plateau scheduler was created but never stepped, so the learning rate
    # could never decay; feed it the held-out loss once per epoch.
    scheduler.step(test_loss)

    # Keep a copy of the weights of the best epoch so far (by held-out MCC).
    if best_MCC<mcc:
        best_MCC = mcc
        best_epoch = e
        best_state_dict = copy.deepcopy(mlp.state_dict())

    print('\t\tTest Loss = ', test_loss)
    print('\t\tTest MCC = ', round(mcc, 3))
    print('\n')
    print('---'*25)

In [None]:
best_epoch, best_MCC

In [None]:
torch.save(best_state_dict, '../../../mediaeval22/models/task2_MLP+_CV'+str(k)+'_e'+str(best_epoch)+'_'+str(round(best_MCC, 3))+'.pth')


# Inference

In [None]:
!ls ../../../mediaeval22/models

In [None]:
mlp.load_state_dict(torch.load('../../../mediaeval22/models/task2_MLP+_CV4_e163_0.455.pth'))
mlp.eval()

In [None]:
mlp.eval()
preds = []
tids = test_df['user_id'].tolist()
# Inference only: disable gradient tracking so no autograd graph is built.
with torch.no_grad():
    for x in test_dataloader:
        # each batch is a sequence of tensors; the user ids are always the first one
        x = x[0]
        # embeddings are keyed by the user id as a string
        x_features = np.stack([model[str(int(i.item()))] for i in x])
        x_features = torch.from_numpy(x_features).float().to('cuda')

        outputs = mlp(x_features)
        y_hat = sig(outputs).flatten()

        # probability above 0.5 -> class 1
        preds.extend((y_hat>0.5).long().cpu().tolist())

# test_dataloader must have been built from the same test_df as `tids`;
# otherwise predictions and ids would be paired up incorrectly downstream.
assert len(preds) == len(tids), 'test_dataloader and test_df are out of sync'


In [None]:
# Submission table: column '-1' holds the user ids, column '0' the predicted class
# shifted back from the 0/1 model output to the 1/2 label convention.
sub_df = pd.DataFrame({
    '-1': tids,
    '0': [pred+1 for pred in preds],
})
sub_df

# Sklearn ML algorithms

In [None]:
# uncomment those you want to try
classifiers = [
    #KNeighborsClassifier(),
    #SVC(),
    #GaussianProcessClassifier(),
    #DecisionTreeClassifier(),
    RandomForestClassifier(),
    #MLPClassifier(max_iter=500),
    #AdaBoostClassifier(),
    #GaussianNB(),
    #QuadraticDiscriminantAnalysis(),
    #GradientBoostingClassifier()
]
# MCCs[c] collects one score per cross-validation fold for classifiers[c].
MCCs = [[] for _ in classifiers]

In [None]:
data_path = '../../../mediaeval22/'
# os.listdir() order is arbitrary; sort so that fold k is the same file on every run.
filelist = sorted(os.listdir(data_path))
df_list = [pd.read_csv(data_path+file) for file in filelist if 'fold' in file]

# test_preds: predictions on the official test set, one list per (fold, classifier)
# all_preds:  predictions on the held-out fold,     one list per (fold, classifier)
test_preds = []
all_preds = []

batch_size = 512


def _features_and_labels(dataloader):
    """Walk a dataloader and return (embedding rows, integer labels).

    Each batch is a sequence of tensors whose first element holds the user ids and
    whose optional second element holds the labels; `labels` stays empty for an
    unlabelled dataset.
    """
    features, labels = [], []
    for batch in dataloader:
        # embeddings are keyed by the user id as a string
        features.extend(model[str(int(user_id.item()))].tolist() for user_id in batch[0])
        if len(batch) > 1:
            labels.extend(batch[1].long().numpy().tolist())
    return features, labels


# The official (unlabelled) test set is the same for every fold: read it and look
# up its embeddings once instead of once per fold and classifier.
test_df2 = pd.read_csv('../../../mediaeval22/task_2_test.csv')
# Ids are kept as exact integers; torch.Tensor(...) would cast them to float32,
# which cannot represent every integer above 2**24.
test_x2 = torch.tensor(test_df2['user_id'].tolist(), dtype=torch.long)
test_data2 = TensorDataset(test_x2)
test_sampler2 = SequentialSampler(test_data2)
test_dataloader2 = DataLoader(test_data2, sampler=test_sampler2, batch_size=batch_size)
official_features, _ = _features_and_labels(test_dataloader2)

for k in trange(0, 5):
    # fold k is held out, the other folds form the training set
    test_df = df_list[k]
    train_df = pd.concat(df_list[:k]+df_list[k+1:])

    train_x = torch.tensor(train_df['user_id'].tolist(), dtype=torch.long)
    train_y = torch.Tensor(train_df['user_class_label'].tolist())-1

    test_x = torch.tensor(test_df['user_id'].tolist(), dtype=torch.long)
    test_y = torch.Tensor(test_df['user_class_label'].tolist())-1

    train_data = TensorDataset(train_x, train_y)
    test_data = TensorDataset(test_x, test_y)

    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

    test_sampler = SequentialSampler(test_data)
    test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

    # The features of a fold do not depend on the classifier: build them once per fold.
    train_features, train_labels = _features_and_labels(train_dataloader)
    fold_features, fold_labels = _features_and_labels(test_dataloader)

    for c in range(0, len(classifiers)):
        clf = classifiers[c]

        # fit() re-trains the estimator from scratch on this fold's training data
        clf.fit(train_features, train_labels)

        preds = clf.predict(fold_features).tolist()
        all_preds.append(preds)

        mcc = metrics.matthews_corrcoef(preds, fold_labels)
        MCCs[c].append(mcc)

        test_preds.append(clf.predict(official_features).tolist())
        

In [None]:
df = pd.read_csv('../../../mediaeval22/submissions/mlp-4.csv', names=['-1', '0'])
df