In [None]:
import re
import string
from collections import Counter

import numpy as np
import pandas as pd
import spacy
from tqdm.notebook import tqdm

import gensim
import gensim.corpora as corpora
from gensim.models import CoherenceModel

from nltk.corpus import stopwords

from sentence_transformers import SentenceTransformer, util
from transformers import pipeline

from sklearn import metrics, preprocessing, linear_model
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer, CountVectorizer
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.linear_model import SGDClassifier, SGDRegressor, LogisticRegression, LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVR

# Never truncate cell contents when a DataFrame is displayed.
pd.options.display.max_colwidth = None

# English stop-word list taken from the NLTK corpus.
en_stopwords = stopwords.words('english')

In [None]:
def computeMCC(y_test, y_pred):
    """Average the Matthews correlation coefficient computed pair by pair.

    Each element of ``y_test`` is paired with the element of ``y_pred`` at the
    same position, the pair is passed to ``metrics.matthews_corrcoef``, and the
    sum of the results is divided by ``len(y_test)``.

    Parameters
    ----------
    y_test : sequence of label sequences (ground truth)
    y_pred : sequence of label sequences (predictions)

    Returns
    -------
    The mean of the per-pair coefficients.

    Raises
    ------
    ValueError
        If ``y_test`` is empty, or if ``matthews_corrcoef`` rejects a pair; in
        the latter case the offending pair is included in the message.
    """
    if len(y_test) == 0:
        raise ValueError('computeMCC needs at least one sample')

    total = 0
    for y1, y2 in zip(y_test, y_pred):
        try:
            total += metrics.matthews_corrcoef(y1, y2)
        except ValueError as err:
            # Raise instead of calling exit(): terminating the interpreter from
            # a notebook cell kills the kernel and hides the traceback.
            raise ValueError(
                f'matthews_corrcoef failed for y_true={y1!r}, y_pred={y2!r}'
            ) from err
    return total / len(y_test)

def computeMCCclass(y_test, y_pred):
    """Return the mean, over columns, of the column-wise Matthews coefficient.

    Both arguments are indexed as 2-D arrays (``[:, i]``); the number of
    columns is read from the first row of ``y_test``.
    """
    n_columns = len(y_test[0, :])
    per_column = [
        metrics.matthews_corrcoef(y_test[:, col], y_pred[:, col])
        for col in range(n_columns)
    ]
    return np.mean(per_column)

def one_hot_encoding(labels):
    """Map binary labels to two-element one-hot rows.

    ``0`` becomes ``[1, 0]`` and ``1`` becomes ``[0, 1]``; any other label
    raises ``KeyError``. The rows are returned stacked in a NumPy array.
    """
    one_hot_rows = {0: [1, 0], 1: [0, 1]}
    return np.array([one_hot_rows[label] for label in labels])

In [None]:
# Checkpoint name of the sentence encoder used for every similarity below.
SBERT_MODEL_NAME = 'ynie/roberta-large-snli_mnli_fever_anli_R1_R2_R3-nli'
sbert = SentenceTransformer(SBERT_MODEL_NAME)

In [None]:
ls ../data/

In [None]:
# Development set; later cells read its `tweet`, `ids` and '1'..'9' columns.
DEV_SET_PATH = '../data/dev-full-task-2-clean.csv'
data = pd.read_csv(DEV_SET_PATH)

In [None]:
seed_words = ['sars', 'covid', 'corona']

# Translation table that deletes every ASCII punctuation character.
punctuation_table = str.maketrans('', '', string.punctuation)

# Every space-separated token (punctuation removed, original casing kept)
# whose lower-cased form contains at least one of the seed words.
found = {
    word
    for tweet in data.tweet
    for word in tweet.translate(punctuation_table).split(' ')
    if any(seed in word.lower() for seed in seed_words)
}

In [None]:
# Sort before displaying: a set has no stable iteration order, so an unsorted
# listing changes from one kernel session to the next.
sorted(found)

In [None]:
# Zero-based label id -> category name (ids follow the order of the list).
id2label = dict(enumerate([
    'Suppressed cures',
    'Behaviour and Mind Control',
    'Antivax',
    'Fake virus',
    'Intentional Pandemic',
    'Harmful Radiation',
    'Population reduction',
    'New World Order',
    'Satanism',
]))

In [None]:
# Read each of the five split files once ...
data_folds = [pd.read_csv(f'../data/task2/dev-full-split-{i}.csv') for i in range(5)]
# ... and take the tweet ids of every split from the frames already loaded,
# instead of parsing the same CSV files a second time.
folds = [fold_df.ids.values for fold_df in data_folds]

In [None]:
# One embedding per tweet, in the row order of `data`.
sbert_X = sbert.encode(data['tweet'].values)

In [None]:
# Mean of each label column '1'..'9'.
label_columns = [str(c) for c in range(1, 10)]
data[label_columns].mean()

# Antivax

In [None]:
# Rows labelled Antivax. Column c holds id2label[c - 1], so Antivax
# (id2label[2]) is column '3'; column '9' is Satanism.
data[data['3'] == 1]

In [None]:
# Candidate hypothesis sentences, each compared against every tweet embedding.
hypotheses = [
    ('Refusals to vaccinate, delaying vaccines, or using certain vaccines but not others. '
     'Total opposition to vaccination.'),
    'I will not vaccinate because vaccines are a lie.',
    'Vaccines are a hoax',
]

In [None]:
# Score every hypothesis as a classifier for column '3': a tweet is predicted
# positive when its cosine similarity to the hypothesis is >= 0.5.

# Positional row numbers of each fold's test tweets. They do not depend on the
# hypothesis, so they are computed once; using positions (not index labels)
# keeps the NumPy indexing below correct whatever index `data` carries.
fold_test_positions = [np.flatnonzero(data.ids.isin(fold).values) for fold in folds]

# Per-fold score table for every hypothesis (previously each table was
# overwritten by the next hypothesis and only the last one survived).
scores_by_hypothesis = {}

for hypothesis in hypotheses:
    hypo = sbert.encode([hypothesis])[0]
    print('Hypothesis: '+ hypothesis)

    sim_matrix = util.pytorch_cos_sim(hypo, sbert_X).numpy()[0]
    scores_nli = []

    for k, test_positions in enumerate(fold_test_positions):
        Y_test = data['3'].iloc[test_positions]

        Y_prob = sim_matrix[test_positions]
        Y_pred = (Y_prob >= 0.5).astype(int)

        acc = metrics.accuracy_score(Y_test, Y_pred)
        f1s = metrics.f1_score(Y_test, Y_pred, average='weighted')
        auc = metrics.roc_auc_score(Y_test, Y_prob, average='weighted', multi_class='ovr')
        mcc = metrics.matthews_corrcoef(Y_test, Y_pred)

        scores_nli.append({'ACC':acc, 'F1':f1s, 'AUC':auc, 'MCC': mcc})

        print(f'For fold {k}:', ' - '.join(f'{m}: {s:.4}' for m,s in scores_nli[-1].items()))
    print()
    scores_nli = pd.DataFrame(scores_nli)
    scores_by_hypothesis[hypothesis] = scores_nli

# Satanism

In [None]:
# Hypothesis texts tried for this category; the last entry is the one in use.
satanism_hypotheses = [
    'Satanism is a group of ideological and philosophical beliefs based on Satan. Satanism existed primarily as an accusation by various Christian groups toward perceived ideological opponents, rather than a self-identity. ',
    'The Mark of the Beast, Covid vaccine?  (Revelation 13) both small and great, both rich and poor, both free and slave, to be marked on the right hand or the forehead, so that no one can buy or sell unless he has the mark, that is, the name of the beast or the number of its name.',
]
hypothesis = satanism_hypotheses[-1]
hypothesis_embeddings = sbert.encode([hypothesis])
hypo = hypothesis_embeddings[0]

# Text of every tweet whose column '9' equals 1.
is_satanism = data['9'] == 1
satanism_tweets = data[is_satanism].tweet.values

# Cosine similarity of the hypothesis to each tweet embedding.
sim_matrix = util.pytorch_cos_sim(hypo, sbert_X).numpy()[0]

In [None]:
# Use each column-'9' tweet in turn as the hypothesis and keep the one whose
# similarity-thresholded predictions reach the highest fold-averaged MCC.

# Positions in `satanism_tweets` that are skipped by the search.
# NOTE(review): the reason for excluding these is not recorded here - confirm.
excluded_indices = {54, 6, 58, 72}

# The fold splits do not depend on the candidate tweet: compute them once,
# as positional row numbers so they can index NumPy arrays directly.
fold_test_positions = [np.flatnonzero(data.ids.isin(fold).values) for fold in folds]
satanism_labels = data['9'].values

# (best average MCC so far, index of the tweet that achieved it)
best_mcc = (0, 0)

for i, tweet in enumerate(satanism_tweets):
    if i in excluded_indices:
        continue
    hypo = sbert.encode([tweet])[0]
    sim_matrix_tweet = util.pytorch_cos_sim(hypo, sbert_X).numpy()[0]

    avg_mcc = 0
    for test_positions in fold_test_positions:
        Y_test = satanism_labels[test_positions]
        Y_pred = (sim_matrix_tweet[test_positions] >= 0.5).astype(int)
        avg_mcc += metrics.matthews_corrcoef(Y_test, Y_pred)

    # Average over the real number of folds rather than a hard-coded 5.
    avg_mcc /= len(folds)
    if avg_mcc > best_mcc[0]:
        best_mcc = (avg_mcc, i)

print(best_mcc, satanism_tweets[best_mcc[1]])

In [None]:
# Join the selected tweets (each without its final character) into a single
# '. '-separated text that ends with a period.
selected_indices = [54]
trimmed_tweets = [text[:-1] for text in satanism_tweets[selected_indices]]
super_tweet = '. '.join(trimmed_tweets) + '.'
super_tweet

In [None]:
# Fold-averaged MCC for column '9' when `super_tweet` is the hypothesis and a
# tweet is predicted positive at cosine similarity >= 0.5.
hypo = sbert.encode([super_tweet])[0]
sim_matrix_tweet = util.pytorch_cos_sim(hypo, sbert_X).numpy()[0]

satanism_labels = data['9'].values

avg_mcc = 0
for fold in folds:
    # Positional row numbers of this fold's test tweets.
    test_positions = np.flatnonzero(data.ids.isin(fold).values)

    Y_test = satanism_labels[test_positions]
    Y_pred = (sim_matrix_tweet[test_positions] >= 0.5).astype(int)

    avg_mcc += metrics.matthews_corrcoef(Y_test, Y_pred)

# Divide by the real number of folds instead of a hard-coded 5.
avg_mcc /= len(folds)
print(avg_mcc)

# New World Order

In [None]:
# Hypothesis sentence for this section and its similarity to every tweet.
hypothesis = ('The New World Order is a conspiracy theory which hypothesizes '
              'a secretly emerging totalitarian world government.')
hypothesis_embeddings = sbert.encode([hypothesis])
hypo = hypothesis_embeddings[0]

sim_matrix = util.pytorch_cos_sim(hypo, sbert_X).numpy()[0]

In [None]:
# Evaluate the New World Order hypothesis fold by fold. A tweet is predicted
# positive when its cosine similarity to the hypothesis is >= 0.5.

# Column c holds id2label[c - 1]; 'New World Order' is id2label[7], i.e.
# column '8'. (Column '1', used previously, is 'Suppressed cures'.)
nwo_column = '8'

scores_nli = []

for k, fold in enumerate(folds):
    # Positional row numbers of this fold's test tweets.
    test_positions = np.flatnonzero(data.ids.isin(fold).values)

    Y_test = data[nwo_column].iloc[test_positions]

    Y_prob = sim_matrix[test_positions]
    Y_pred = (Y_prob >= 0.5).astype(int)

    acc = metrics.accuracy_score(Y_test, Y_pred)
    f1s = metrics.f1_score(Y_test, Y_pred, average='weighted')
    auc = metrics.roc_auc_score(Y_test, Y_prob, average='weighted', multi_class='ovr')
    mcc = computeMCC(one_hot_encoding(Y_test), one_hot_encoding(Y_pred))
    mccc = computeMCCclass(one_hot_encoding(Y_test), one_hot_encoding(Y_pred))

    scores_nli.append({'ACC':acc, 'F1':f1s, 'AUC':auc, 'MCC': mcc, 'MCCC':mccc})

    print(f'For fold {k}:', ' - '.join(f'{m}: {s:.4}' for m,s in scores_nli[-1].items()))

scores_nli = pd.DataFrame(scores_nli)

# Using Tweets to Predict

In [None]:
def _average_fold_mcc(similarities, labels, fold_positions):
    """Mean MCC over the folds for similarity-thresholded predictions.

    A tweet is predicted positive when its similarity is >= 0.5. A fold in
    which nothing is predicted positive contributes 0 to the sum, and the sum
    is divided by the total number of folds.
    """
    total = 0
    for positions in fold_positions:
        Y_pred = (similarities[positions] >= 0.5).astype(int)
        if Y_pred.any():
            total += metrics.matthews_corrcoef(labels[positions], Y_pred)
    return total / len(fold_positions)


# Positional row numbers of each fold's test tweets; independent of category
# and hypothesis, so computed once.
fold_test_positions = [np.flatnonzero(data.ids.isin(fold).values) for fold in folds]

# category column -> list of (fold-averaged MCC, index into that category's tweets)
mccs = {}
for tweet_category in tqdm([str(t) for t in range(1, 10)]):
    category_labels = data[tweet_category].values
    relevant_tweets = data[data[tweet_category] == 1].tweet.values

    # Score every tweet of the category as a stand-alone hypothesis.
    mccs[tweet_category] = []
    for i, tweet in enumerate(relevant_tweets):
        hypo = sbert.encode([tweet])[0]
        sim_matrix_tweet = util.pytorch_cos_sim(hypo, sbert_X).numpy()[0]
        avg_mcc = _average_fold_mcc(sim_matrix_tweet, category_labels, fold_test_positions)
        mccs[tweet_category].append((avg_mcc, i))

    print('For', id2label[int(tweet_category) - 1])
    best_mccs = sorted(mccs[tweet_category], key=lambda x: -x[0])
    for supertweet_size in range(1, 12, 2):

        print('MCC for supertweet_size =', supertweet_size, ': ', end='')
        # Concatenate the top-scoring tweets (lower-cased, last character
        # dropped) into one hypothesis text.
        supertweet = '[SEP]'.join(relevant_tweets[j[1]][:-1].lower() for j in best_mccs[:supertweet_size]) + '.'
        hypo = sbert.encode([supertweet])[0]
        sim_matrix_tweet = util.pytorch_cos_sim(hypo, sbert_X).numpy()[0]

        # The average starts from zero for every supertweet. Previously the
        # accumulator kept the value left by the last single-tweet run and by
        # the preceding supertweet sizes, inflating every printed number.
        avg_mcc = _average_fold_mcc(sim_matrix_tweet, category_labels, fold_test_positions)
        print(avg_mcc)
    

In [None]:
# Nine hard-coded scores and their arithmetic mean.
# NOTE(review): presumably one best MCC per category from the cell above - confirm.
l = [
    0.37400027228104693,
    0.26894977597247083,
    0.3339076702319483,
    0.4949745459618075,
    0.24668037087907382,
    0.25236930172231153,
    0.5345499352319787,
    0.40177709057511046,
    0.2003782867163418,
]
mean_score = sum(l) / len(l)
print(mean_score)

# Multiclass entailment

In [None]:
# Zero-shot classification pipeline (`pipeline` is imported in the notebook's
# import cell) and the category names it is asked to score.
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
candidate_labels = list(id2label.values())
candidate_labels

In [None]:
# Category names given to the zero-shot classifier; position j in this list
# is read against position j of the label columns in the evaluation cell.
candidate_labels = [
    'Suppressed cures',
    'Behaviour and Mind Control',
    'Antivax',
    'Fake virus',
    'Intentional Pandemic',
    'Harmful Radiation',
    'Population reduction',
    'New World Order',
    'Satanism',
]

In [None]:
# Zero-shot multi-label evaluation: for every fold, classify each test tweet
# against all candidate labels and record, per class, the true label and the
# thresholded prediction.
results = {}
thresh = 0.5
label_columns = [str(i) for i in range(1, 10)]

for k, fold in tqdm(enumerate(folds), total=len(folds)):
    test_mask = data.ids.isin(fold)

    X_train, X_test = data.tweet[~test_mask], data.tweet[test_mask]
    Y_train, Y_test = data.loc[~test_mask, label_columns], data.loc[test_mask, label_columns]
    Y_test_values = Y_test.values

    per_class_true = {c:[] for c in range(len(candidate_labels))}
    per_class_pred = {c:[] for c in range(len(candidate_labels))}

    for i, tweet in tqdm(enumerate(X_test), total=len(X_test)):
        output = classifier(tweet, candidate_labels, multi_label=True)

        # The pipeline returns `labels` ordered by score, with `scores` in
        # that same order, so positions in `scores` do not line up with
        # `candidate_labels`. Look each score up by its label instead.
        score_by_label = dict(zip(output['labels'], output['scores']))

        for j, label in enumerate(candidate_labels):
            per_class_true[j].append(Y_test_values[i][j])
            per_class_pred[j].append(int(score_by_label[label] > thresh))

    # Store (truth, prediction); the truth used to be stored twice.
    results[k] = (per_class_true, per_class_pred)

In [None]:
%time
rest = classifier(list(X_test.values), candidate_labels, multi_label=True)