In [None]:
import spacy 
import re
import numpy as np
import pandas as pd

from collections import Counter
from tqdm.notebook import tqdm

import gensim
import gensim.corpora as corpora
from gensim.models import CoherenceModel

from nltk.corpus import stopwords

from sentence_transformers import SentenceTransformer, util

from sklearn import metrics, preprocessing, linear_model
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer, CountVectorizer
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.linear_model import SGDClassifier, SGDRegressor, LogisticRegression, LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVR

pd.set_option('display.max_colwidth', None)
en_stopwords = stopwords.words('english')

In [None]:
data = pd.read_csv('../data/dev-full-task-1-clean.csv')

In [None]:
def id2label(i):
    """Translate a numeric class id (1, 2 or 3) into its human-readable label.

    Raises:
        KeyError: if ``i`` is not one of the three known class ids.
    """
    names = {
        1: 'Non-Conspiracy',
        2: 'Discusses Conspiracy',
        3: 'Promotes/Supports Conspiracy',
    }
    return names[i]

In [None]:
data['label'] = data['1'].apply(id2label)

In [None]:
data[['tweet', 'label']]

In [None]:
data

In [None]:
data['tweet'][data['tweet'].str.contains('presidential election')]

In [None]:
data['1'].value_counts() / len(data)

In [None]:
spacy_nlp = spacy.load('en_core_web_sm')

def remove_sw(tokens):
    """Return the ``text`` of every token whose ``is_stop`` flag is exactly ``False``."""
    kept = []
    for token in tokens:
        if token.is_stop is False:
            kept.append(token.text)
    return kept


def remove_extra_spaces(tokens):
    """Strip leading and trailing whitespace from each token string."""
    stripped = []
    for token in tokens:
        stripped.append(token.strip())
    return stripped


def remove_short_words(tokens):
    """Keep only the tokens that are longer than two characters."""
    return list(filter(lambda token: len(token) > 2, tokens))


def remove_puntaction(tokens):
    """Lower-case each token and delete every non-word character from it.

    "Word" follows the regex ``\\w`` class, so letters, digits and underscores
    are kept. Tokens made only of punctuation become empty strings; they are
    not dropped here.

    Args:
        tokens: iterable of token strings.

    Returns:
        A list with one cleaned string per input token, in the same order.
    """
    # Raw string: '\W' in a normal string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    return [re.sub(r'\W+', '', tok.lower()) for tok in tokens]


def tokenizer_call(text, spacy_nlp=spacy_nlp):
    """Tokenize ``text`` and clean the resulting tokens.

    The text is split with the pipeline's tokenizer, stop words are removed,
    and the remaining token strings go through punctuation removal, whitespace
    stripping and short-word filtering, in that order.
    """
    words = remove_sw(spacy_nlp.tokenizer(text))
    # Order matters: punctuation removal can shorten tokens before the length filter.
    for clean_step in (remove_puntaction, remove_extra_spaces, remove_short_words):
        words = clean_step(words)
    return words

In [None]:
vectorizer = TfidfVectorizer(tokenizer=tokenizer_call, lowercase=True, ngram_range=(1, 3), min_df=2)
tfidf_X = vectorizer.fit_transform(data.tweet.values)

In [None]:
vectorizer.get_feature_names()

In [None]:
y = data['1']

In [None]:
X_train_idx, X_test_idx, y_train, y_test = train_test_split(range(len(data)), y, stratify=y, test_size=0.25, random_state=0)

In [None]:
X_train, X_test = tfidf_X[X_train_idx], tfidf_X[X_test_idx]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(tfidf_X, y, stratify=y, test_size=0.25, random_state=0)

# Classifiers

In [None]:
reg = linear_model.LinearRegression()

In [None]:
reg.fit(X_train, y_train)

In [None]:
word_weights = dict(zip(vectorizer.get_feature_names(), reg.coef_))

In [None]:
sorted(word_weights.items(), key=lambda x: -x[1])[:50]

In [None]:
cv = CountVectorizer(min_df=2, stop_words='english', lowercase=True, ngram_range=(1, 3))
X_vec = cv.fit_transform(data.tweet.values)

mut_information_scores = dict(zip(cv.get_feature_names(), mutual_info_classif(X_vec, data['1'], discrete_features=True)))

In [None]:
sorted(mut_information_scores.items(), key=lambda x: x[1])[:50]

In [None]:
sorted(mut_information_scores.items(), key=lambda x: -x[1])[:50]

# Computing Features

# Evaluation

In [None]:
! ls ../data/task2

In [None]:
# Load the five pre-computed fold files once. `folds` holds just the id arrays
# and is derived from the loaded frames instead of re-reading every CSV.
data_folds = [pd.read_csv(f'../data/task1/dev-full-split-{i}.csv') for i in range(5)]
folds = [fold_df.ids.values for fold_df in data_folds]

In [None]:
data_folds[0]

In [None]:
data[data.ids.isin(folds[0])].head(10)

In [None]:
def tokenize(s):
    """Return the cleaned tokens of ``s`` joined into one space-separated string."""
    cleaned_tokens = tokenizer_call(s)
    return ' '.join(cleaned_tokens)

In [None]:
vectorizer = TfidfVectorizer(tokenizer=tokenizer_call, lowercase=True, ngram_range=(1, 3), min_df=2)
tfidf_X = vectorizer.fit_transform(data.tweet.values)

In [None]:
def computeMCC(y_test, y_pred):
    """Average the Matthews correlation coefficient computed row by row.

    Each pair of rows ``(y_test[k], y_pred[k])`` is scored with
    ``metrics.matthews_corrcoef`` and the scores are averaged over
    ``len(y_test)``.

    Args:
        y_test: sequence of true label vectors (one row per sample).
        y_pred: sequence of predicted label vectors, aligned with ``y_test``.

    Returns:
        The mean of the per-row MCC values.

    Raises:
        ValueError: if ``matthews_corrcoef`` rejects a pair of rows; the
            message carries the offending rows.
        ZeroDivisionError: if ``y_test`` is empty.
    """
    total = 0
    for row_idx, (y1, y2) in enumerate(zip(y_test, y_pred)):
        try:
            total += metrics.matthews_corrcoef(y1, y2)
        except ValueError as err:
            # Raise instead of calling exit(1): exiting from a notebook cell
            # tears down the session and hides the traceback.
            raise ValueError(
                f'matthews_corrcoef failed on row {row_idx}: y_true={y1!r}, y_pred={y2!r}'
            ) from err
    return total / len(y_test)

def computeMCCclass(y_test, y_pred):
    """Return the mean of the per-column Matthews correlation coefficients.

    Both arguments are indexed as 2-D arrays; column ``c`` of ``y_test`` is
    scored against column ``c`` of ``y_pred``.
    """
    n_columns = len(y_test[0, :])
    per_column = [
        metrics.matthews_corrcoef(y_test[:, col], y_pred[:, col])
        for col in range(n_columns)
    ]
    return np.mean(per_column)

def one_hot_encoding(labels):
    """Encode class ids 1/2/3 as one-hot rows and return them as an array.

    Class 1 sets the last position, class 2 the middle one and class 3 the
    first one. Any other label raises ``KeyError``.
    """
    rows_by_label = {
        1: [0, 0, 1],
        2: [0, 1, 0],
        3: [1, 0, 0],
    }
    return np.array([rows_by_label[label] for label in labels])

In [None]:
# NOTE(review): `Y_pred` is first assigned inside the evaluation loops further
# down, so on a fresh kernel (Restart & Run All) this cell raises NameError.
Y_pred

In [None]:
data['1'].value_counts()

In [None]:
weights = {c: 1/np.log(v) for c, v in data['1'].value_counts().items()}
weights

In [None]:
# 5-fold evaluation of a class-balanced logistic-regression SGD classifier
# trained on the TF-IDF features.
scores_sgd_log = []
for i, fold in enumerate(folds):
    # Positional row numbers rather than index labels: both the feature matrix
    # and `.iloc` are position-based, so this stays correct if `data` is re-indexed.
    in_fold = data.ids.isin(fold).to_numpy()
    train_indices = np.flatnonzero(~in_fold)
    test_indices = np.flatnonzero(in_fold)

    X_train, X_test = tfidf_X[train_indices], tfidf_X[test_indices]
    Y_train, Y_test = data['1'].iloc[train_indices], data['1'].iloc[test_indices]

    # Fixed seed so the fold scores are reproducible across kernel restarts.
    sgd = SGDClassifier(loss='log', class_weight='balanced', random_state=0)
    sgd.fit(X_train, Y_train)

    Y_pred = sgd.predict(X_test)
    Y_prob = sgd.predict_proba(X_test)

    # Encode once and reuse for both MCC variants.
    Y_test_onehot = one_hot_encoding(Y_test)
    Y_pred_onehot = one_hot_encoding(Y_pred)

    acc = metrics.accuracy_score(Y_test, Y_pred)
    f1s = metrics.f1_score(Y_test, Y_pred, average='weighted')
    auc = metrics.roc_auc_score(Y_test, Y_prob, average='weighted', multi_class='ovr')
    mcc = computeMCC(Y_test_onehot, Y_pred_onehot)
    mccc = computeMCCclass(Y_test_onehot, Y_pred_onehot)
    print(mccc)

    scores_sgd_log.append({'ACC':acc, 'F1':f1s, 'AUC':auc, 'MCC': mcc, 'MCCC':mccc})

    print(f'For fold {i}:', ' - '.join(f'{m}: {s:.4}' for m,s in scores_sgd_log[-1].items()))

scores_sgd_log = pd.DataFrame(scores_sgd_log)

In [None]:
scores_sgd_log.describe().loc['mean']

In [None]:
scores_sgd_log.describe().loc['mean']

# SBERT

In [None]:
#stsb-xlm-r-multilingual
#distiluse-base-multilingual-cased-v1
#paraphrase-xlm-r-multilingual-v1
#sentence-transformers/all-mpnet-base-v2

sbert1 = SentenceTransformer('facebook/bart-large-mnli') 
sbert2 = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')
# sbert3 = SentenceTransformer('multi-qa-mpnet-base-dot-v1') 

In [None]:
%%time
sbert1_X = sbert1.encode(data.tweet.values)
sbert1_X.shape

In [None]:
%%time
sbert2_X = sbert2.encode(data.tweet.values)
sbert2_X.shape

In [None]:
# 5-fold evaluation of a logistic-regression SGD classifier on the sbert1 embeddings.
scores_sbert1 = []
for i, fold in enumerate(folds):
    # Positional row numbers rather than index labels: both the embedding
    # matrix and `.iloc` are position-based.
    in_fold = data.ids.isin(fold).to_numpy()
    train_indices = np.flatnonzero(~in_fold)
    test_indices = np.flatnonzero(in_fold)

    X_train, X_test = sbert1_X[train_indices], sbert1_X[test_indices]
    Y_train, Y_test = data['1'].iloc[train_indices], data['1'].iloc[test_indices]

    # Fixed seed so the fold scores are reproducible across kernel restarts.
    sgd = SGDClassifier(loss='log', random_state=0)
    sgd.fit(X_train, Y_train)

    Y_pred = sgd.predict(X_test)
    Y_prob = sgd.predict_proba(X_test)

    # Encode once and reuse for both MCC variants.
    Y_test_onehot = one_hot_encoding(Y_test)
    Y_pred_onehot = one_hot_encoding(Y_pred)

    acc = metrics.accuracy_score(Y_test, Y_pred)
    f1s = metrics.f1_score(Y_test, Y_pred, average='weighted')
    auc = metrics.roc_auc_score(Y_test, Y_prob, average='weighted', multi_class='ovr')
    mcc = computeMCC(Y_test_onehot, Y_pred_onehot)
    mccc = computeMCCclass(Y_test_onehot, Y_pred_onehot)
    print(mccc)

    scores_sbert1.append({'ACC':acc, 'F1':f1s, 'AUC':auc, 'MCC': mcc, 'MCCC':mccc})

    print(f'For fold {i}:', ' - '.join(f'{m}: {s:.4}' for m,s in scores_sbert1[-1].items()))

scores_sbert1_log = pd.DataFrame(scores_sbert1)

In [None]:
sum(sbert2_X[0])

In [None]:
# 5-fold evaluation of a logistic-regression SGD classifier on the sbert2 embeddings.
scores_sbert2 = []
for i, fold in enumerate(folds):
    # Positional row numbers rather than index labels: both the embedding
    # matrix and `.iloc` are position-based.
    in_fold = data.ids.isin(fold).to_numpy()
    train_indices = np.flatnonzero(~in_fold)
    test_indices = np.flatnonzero(in_fold)

    X_train, X_test = sbert2_X[train_indices], sbert2_X[test_indices]
    Y_train, Y_test = data['1'].iloc[train_indices], data['1'].iloc[test_indices]

    # Fixed seed so the fold scores are reproducible across kernel restarts.
    sgd = SGDClassifier(loss='log', random_state=0)
    sgd.fit(X_train, Y_train)

    Y_pred = sgd.predict(X_test)
    Y_prob = sgd.predict_proba(X_test)

    # Encode once and reuse for both MCC variants.
    Y_test_onehot = one_hot_encoding(Y_test)
    Y_pred_onehot = one_hot_encoding(Y_pred)

    acc = metrics.accuracy_score(Y_test, Y_pred)
    f1s = metrics.f1_score(Y_test, Y_pred, average='weighted')
    auc = metrics.roc_auc_score(Y_test, Y_prob, average='weighted', multi_class='ovr')
    mcc = computeMCC(Y_test_onehot, Y_pred_onehot)
    mccc = computeMCCclass(Y_test_onehot, Y_pred_onehot)
    print(mccc)

    scores_sbert2.append({'ACC':acc, 'F1':f1s, 'AUC':auc, 'MCC': mcc, 'MCCC':mccc})

    print(f'For fold {i}:', ' - '.join(f'{m}: {s:.4}' for m,s in scores_sbert2[-1].items()))

scores_sbert2_log = pd.DataFrame(scores_sbert2)

In [None]:
# 5-fold evaluation of a 5-nearest-neighbour classifier on the sbert1 embeddings.
scores_sbert1 = []
for i, fold in enumerate(folds):
    # Positional row numbers rather than index labels: both the embedding
    # matrix and `.iloc` are position-based.
    in_fold = data.ids.isin(fold).to_numpy()
    train_indices = np.flatnonzero(~in_fold)
    test_indices = np.flatnonzero(in_fold)

    X_train, X_test = sbert1_X[train_indices], sbert1_X[test_indices]
    Y_train, Y_test = data['1'].iloc[train_indices], data['1'].iloc[test_indices]

    knn = KNeighborsClassifier(n_neighbors=5)
    knn.fit(X_train, Y_train)

    Y_pred = knn.predict(X_test)
    Y_prob = knn.predict_proba(X_test)

    # Encode once and reuse for both MCC variants.
    Y_test_onehot = one_hot_encoding(Y_test)
    Y_pred_onehot = one_hot_encoding(Y_pred)

    acc = metrics.accuracy_score(Y_test, Y_pred)
    f1s = metrics.f1_score(Y_test, Y_pred, average='weighted')
    auc = metrics.roc_auc_score(Y_test, Y_prob, average='weighted', multi_class='ovr')
    mcc = computeMCC(Y_test_onehot, Y_pred_onehot)
    mccc = computeMCCclass(Y_test_onehot, Y_pred_onehot)
    print(mccc)

    scores_sbert1.append({'ACC':acc, 'F1':f1s, 'AUC':auc, 'MCC': mcc, 'MCCC':mccc})

    print(f'For fold {i}:', ' - '.join(f'{m}: {s:.4}' for m,s in scores_sbert1[-1].items()))

scores_sbert1_knn = pd.DataFrame(scores_sbert1)

In [None]:
scaled_sbert1_X = preprocessing.StandardScaler().fit_transform(sbert1_X)

In [None]:
sum(scaled_sbert1_X[0])

In [None]:
# 5-fold evaluation of a 30-nearest-neighbour classifier on standardised sbert1 embeddings.
scores_sbert1 = []
for i, fold in enumerate(folds):
    # Positional row numbers rather than index labels: both the embedding
    # matrix and `.iloc` are position-based.
    in_fold = data.ids.isin(fold).to_numpy()
    train_indices = np.flatnonzero(~in_fold)
    test_indices = np.flatnonzero(in_fold)

    # Fit the scaler on the training rows only. Scaling the full matrix before
    # splitting would let test-fold statistics leak into the training features.
    scaler = preprocessing.StandardScaler().fit(sbert1_X[train_indices])
    X_train = scaler.transform(sbert1_X[train_indices])
    X_test = scaler.transform(sbert1_X[test_indices])
    Y_train, Y_test = data['1'].iloc[train_indices], data['1'].iloc[test_indices]

    knn = KNeighborsClassifier(n_neighbors=30)
    knn.fit(X_train, Y_train)

    Y_pred = knn.predict(X_test)
    Y_prob = knn.predict_proba(X_test)

    # Encode once and reuse for both MCC variants.
    Y_test_onehot = one_hot_encoding(Y_test)
    Y_pred_onehot = one_hot_encoding(Y_pred)

    acc = metrics.accuracy_score(Y_test, Y_pred)
    f1s = metrics.f1_score(Y_test, Y_pred, average='weighted')
    auc = metrics.roc_auc_score(Y_test, Y_prob, average='weighted', multi_class='ovr')
    mcc = computeMCC(Y_test_onehot, Y_pred_onehot)
    mccc = computeMCCclass(Y_test_onehot, Y_pred_onehot)
    print(mccc)

    scores_sbert1.append({'ACC':acc, 'F1':f1s, 'AUC':auc, 'MCC': mcc, 'MCCC':mccc})

    print(f'For fold {i}:', ' - '.join(f'{m}: {s:.4}' for m,s in scores_sbert1[-1].items()))

scores_sbert1_scaled_knn = pd.DataFrame(scores_sbert1)

In [None]:
(0.4116 + 0.4791 + 0.4164 + 0.3468 + 0.3971)/5

# Topic Modeling

In [None]:
docs = [s.replace('&amp; ', '').split() for s in data.tweet.str.lower()]

In [None]:
# Train one LDA model per (number of topics, seed) pair and store the
# document-topic matrix of each run in `T`, keyed by 'N=<topics>, seed=<seed>'.
T = {}

# The corpus depends on neither the topic count nor the seed, so build the
# stop-word-filtered documents, the dictionary and the bag-of-words once.
stopword_set = set(en_stopwords)  # set lookup instead of scanning the list per word
docs_filtered = [[w for w in d if w not in stopword_set] for d in docs]

# Create Dictionary
id2word = corpora.Dictionary(docs_filtered)

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in docs_filtered]

for n_topics in tqdm([20, 50, 100]):
    for random_state in tqdm([0, 1]):
        print(f'N = {n_topics}, seed = {random_state}')

        lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                   id2word=id2word,
                                                   num_topics=n_topics,
                                                   random_state=random_state,
                                                   passes=10, # 30 is too good
                                                   alpha='auto',
                                                   eta='auto',
                                                   per_word_topics=True,
                                                   minimum_probability=0.05)

        # Print the top 8 words of (at most) 40 topics as a quick sanity check.
        for t in lda_model.show_topics(40, num_words=8):
            print('Topic', t[0], end=': ')
            for w in t[1].split(' + '):
                print(w.split('*')[1], end=', ')
            print('')

        preds = [lda_model[p] for p in corpus]
        topic_docs = np.zeros((len(corpus), n_topics))

        # With per_word_topics=True each prediction is a tuple; its first
        # element is iterated as (topic_id, probability) pairs. Topics that
        # are not returned for a document keep a weight of zero.
        for doc_idx, pred in enumerate(preds):
            for (topic_id, prob) in pred[0]:
                topic_docs[doc_idx][topic_id] = prob

        T[f'N={n_topics}, seed={random_state}'] = topic_docs

In [None]:
T.keys()

In [None]:
# 5-fold evaluation of a logistic-regression SGD classifier on the 20-topic LDA features.
# NOTE(review): the variable name ends in `_0` but the features come from the
# seed=1 run ('N=20, seed=1') — confirm which seed was intended.
scores_tm20_0 = []
for i, fold in enumerate(folds):
    # Positional row numbers rather than index labels: both the topic matrix
    # and `.iloc` are position-based.
    in_fold = data.ids.isin(fold).to_numpy()
    train_indices = np.flatnonzero(~in_fold)
    test_indices = np.flatnonzero(in_fold)

    X_train, X_test = T['N=20, seed=1'][train_indices], T['N=20, seed=1'][test_indices]
    Y_train, Y_test = data['1'].iloc[train_indices], data['1'].iloc[test_indices]

    # Fixed seed so the fold scores are reproducible across kernel restarts.
    sgd = SGDClassifier(loss='log', random_state=0)
    sgd.fit(X_train, Y_train)

    Y_pred = sgd.predict(X_test)
    Y_prob = sgd.predict_proba(X_test)

    # Encode once and reuse for both MCC variants.
    Y_test_onehot = one_hot_encoding(Y_test)
    Y_pred_onehot = one_hot_encoding(Y_pred)

    acc = metrics.accuracy_score(Y_test, Y_pred)
    f1s = metrics.f1_score(Y_test, Y_pred, average='weighted')
    auc = metrics.roc_auc_score(Y_test, Y_prob, average='weighted', multi_class='ovr')
    mcc = computeMCC(Y_test_onehot, Y_pred_onehot)
    mccc = computeMCCclass(Y_test_onehot, Y_pred_onehot)
    print(mccc)

    scores_tm20_0.append({'ACC':acc, 'F1':f1s, 'AUC':auc, 'MCC': mcc, 'MCCC':mccc})

    print(f'For fold {i}:', ' - '.join(f'{m}: {s:.4}' for m,s in scores_tm20_0[-1].items()))

scores_tm20_0 = pd.DataFrame(scores_tm20_0)

In [None]:
scores_tm20_0

# NLI

In [None]:
sim = util.pytorch_cos_sim(sbert2_X, sbert2_X).numpy()

In [None]:
sim_df = pd.DataFrame(sim)
sim_df['label'] = data['1']

In [None]:
sim_df[data['1'] == 1][0].mean()

In [None]:
sim_df[data['1'] == 2][0].mean()

In [None]:
sim_df[data['1'] == 3][0].mean()

In [None]:
sim_df

In [None]:
# 5-fold evaluation of a 1-nearest-neighbour rule on the precomputed
# similarity matrix: each test tweet takes the class of its most similar
# training tweet.
scores_nli_1nn = []

# The fold counter is `k` so the per-tweet loop below cannot overwrite it
# (the fold number printed at the end of each iteration relies on it).
for k, fold in enumerate(folds):
    train_indices = data[~data.ids.isin(fold)].index
    test_indices = data[data.ids.isin(fold)].index

    # X_train / X_test are not used by the neighbour search below.
    X_train, X_test = sbert1_X[train_indices], sbert1_X[test_indices]
    Y_train, Y_test = data.iloc[train_indices]['1'], data.iloc[test_indices]['1']

    Y_pred = []
    Y_prob = []
    for i in test_indices:
        # The row also carries the trailing 'label' column of sim_df; its
        # position is never a member of train_indices, so it is skipped.
        sim_scores = sim_df.iloc[i]
        sorted_scores_idx = np.argsort(-sim_scores)
        for j in sorted_scores_idx:
            if j in train_indices:
                cls = data['1'].iloc[j]
                Y_pred.append(cls)
                # Hard one-hot "probabilities" in class order 1, 2, 3.
                Y_prob.append([[1, 0, 0], [0, 1, 0], [0, 0, 1]][cls - 1])
                break

    acc = metrics.accuracy_score(Y_test, Y_pred)
    f1s = metrics.f1_score(Y_test, Y_pred, average='weighted')
    auc = metrics.roc_auc_score(Y_test, Y_prob, average='weighted', multi_class='ovr')
    mcc = computeMCC(one_hot_encoding(Y_test), one_hot_encoding(Y_pred))
    mccc = computeMCCclass(one_hot_encoding(Y_test), one_hot_encoding(Y_pred))
    print(mccc)

    scores_nli_1nn.append({'ACC':acc, 'F1':f1s, 'AUC':auc, 'MCC': mcc, 'MCCC':mccc})

    print(f'For fold {k}:', ' - '.join(f'{m}: {s:.4}' for m,s in scores_nli_1nn[-1].items()))

# Wrap the list built above (previously this referenced an undefined name).
scores_nli_1nn = pd.DataFrame(scores_nli_1nn)

In [None]:
# 5-fold evaluation of a majority vote over the most similar training tweets,
# using the precomputed similarity matrix.
scores_nli_knn = []

for k, fold in enumerate(folds):
    train_indices = data[~data.ids.isin(fold)].index
    test_indices = data[data.ids.isin(fold)].index

    # X_train / X_test are not used by the neighbour search below.
    X_train, X_test = sbert1_X[train_indices], sbert1_X[test_indices]
    Y_train, Y_test = data.iloc[train_indices]['1'], data.iloc[test_indices]['1']

    Y_pred = []
    Y_prob = []
    for i in test_indices:
        sim_scores = sim_df.iloc[i]
        # Most-similar-first positions, restricted to the training rows.
        sorted_scores_idx = [j for j in np.argsort(-sim_scores) if j in train_indices]
        top_classes = [data['1'].iloc[j] for j in sorted_scores_idx[:25]]
        cls = Counter(top_classes).most_common()[0][0]
        Y_pred.append(cls)
        # Hard one-hot "probabilities" in class order 1, 2, 3.
        Y_prob.append([[1, 0, 0], [0, 1, 0], [0, 0, 1]][cls - 1])

    acc = metrics.accuracy_score(Y_test, Y_pred)
    f1s = metrics.f1_score(Y_test, Y_pred, average='weighted')
    auc = metrics.roc_auc_score(Y_test, Y_prob, average='weighted', multi_class='ovr')
    mcc = computeMCC(one_hot_encoding(Y_test), one_hot_encoding(Y_pred))
    mccc = computeMCCclass(one_hot_encoding(Y_test), one_hot_encoding(Y_pred))

    scores_nli_knn.append({'ACC':acc, 'F1':f1s, 'AUC':auc, 'MCC': mcc, 'MCCC':mccc})

    print(f'For fold {k}:', ' - '.join(f'{m}: {s:.4}' for m,s in scores_nli_knn[-1].items()))

# Wrap the list built in this cell (previously this referenced a name that is
# only created by a later cell).
scores_nli_knn = pd.DataFrame(scores_nli_knn)

In [None]:
# NOTE(review): `scores_nli_avg` is first created by the class-centroid
# evaluation cell further down, so this cell fails on a fresh top-to-bottom run.
scores_nli_avg.mean()

In [None]:
def softmax(vector):
    """Return the softmax of ``vector``, normalised over all of its elements.

    The maximum is subtracted before exponentiating so that large scores do
    not overflow to ``inf`` (which would turn the result into NaN). The shift
    cancels in the ratio and does not change the mathematical result.

    Args:
        vector: array-like of scores.

    Returns:
        An array of the same shape whose entries are positive and sum to 1.
        An empty input is returned unchanged as an empty array.
    """
    scores = np.asarray(vector)
    if scores.size == 0:
        return scores
    e = np.exp(scores - scores.max())
    return e / e.sum()

In [None]:
# 5-fold evaluation of a nearest-centroid rule: every class is represented by
# the mean sbert1 embedding of its training tweets, and each test tweet takes
# the class of the most cosine-similar centroid.
scores_nli_avg = []

for k, fold in enumerate(folds):
    # Positional row numbers rather than index labels: both the embedding
    # matrix and `.iloc` are position-based.
    in_fold = data.ids.isin(fold).to_numpy()
    train_indices = np.flatnonzero(~in_fold)
    test_indices = np.flatnonzero(in_fold)

    X_train, X_test = sbert1_X[train_indices], sbert1_X[test_indices]
    Y_train, Y_test = data['1'].iloc[train_indices], data['1'].iloc[test_indices]

    # One centroid per class; rows are ordered as classes 1, 2, 3.
    train_labels = Y_train.to_numpy()
    class_embeddings = np.array([X_train[train_labels == c].mean(axis=0) for c in [1, 2, 3]])

    # Score all test embeddings against the centroids in a single call;
    # the result has one row per test tweet and one column per class.
    sim_scores = util.pytorch_cos_sim(X_test, class_embeddings).numpy()
    Y_pred = sim_scores.argmax(axis=1) + 1  # column 0 is class 1
    Y_prob = np.array([softmax(row) for row in sim_scores])

    # Encode once and reuse for both MCC variants.
    Y_test_onehot = one_hot_encoding(Y_test)
    Y_pred_onehot = one_hot_encoding(Y_pred)

    acc = metrics.accuracy_score(Y_test, Y_pred)
    f1s = metrics.f1_score(Y_test, Y_pred, average='weighted')
    auc = metrics.roc_auc_score(Y_test, Y_prob, average='weighted', multi_class='ovr')
    mcc = computeMCC(Y_test_onehot, Y_pred_onehot)
    mccc = computeMCCclass(Y_test_onehot, Y_pred_onehot)

    scores_nli_avg.append({'ACC':acc, 'F1':f1s, 'AUC':auc, 'MCC': mcc, 'MCCC':mccc})

    print(f'For fold {k}:', ' - '.join(f'{m}: {s:.4}' for m,s in scores_nli_avg[-1].items()))

scores_nli_avg = pd.DataFrame(scores_nli_avg)

# Entailment

In [None]:

# Checkpoint name used for the entailment experiment; the commented-out lines
# are alternative checkpoints that can be swapped in.
hg_model_hub_name = "ynie/roberta-large-snli_mnli_fever_anli_R1_R2_R3-nli"
# hg_model_hub_name = "ynie/albert-xxlarge-v2-snli_mnli_fever_anli_R1_R2_R3-nli"
# hg_model_hub_name = "ynie/bart-large-snli_mnli_fever_anli_R1_R2_R3-nli"
# hg_model_hub_name = "ynie/electra-large-discriminator-snli_mnli_fever_anli_R1_R2_R3-nli"
# hg_model_hub_name = "ynie/xlnet-large-cased-snli_mnli_fever_anli_R1_R2_R3-nli"

# NOTE(review): AutoTokenizer and AutoModelForSequenceClassification are not
# imported in the notebook's import cell, so this raises NameError as written —
# presumably they come from the `transformers` package; confirm and import.
tokenizer = AutoTokenizer.from_pretrained(hg_model_hub_name)
model = AutoModelForSequenceClassification.from_pretrained(hg_model_hub_name)

In [None]:
# Scores one premise/hypothesis pair with the NLI model and prints the three
# class probabilities, once per fold.
# NOTE(review): this cell looks unfinished and cannot run as written:
#   * `premise` and `hypothesis` are never assigned in this notebook;
#   * `torch` is not imported in the notebook's import cell;
#   * the text column is called `tweet` everywhere else, not `tweets`;
#   * nothing computed from `fold` is used after the split, so every
#     iteration would score the same pair.
max_length = 512


for k, fold in enumerate(folds):
    train_indices = data[~data.ids.isin(fold)].index
    test_indices = data[data.ids.isin(fold)].index
    
    X_train, X_test = data.tweets[train_indices], data.tweets[test_indices]
    Y_train, Y_test = data.iloc[train_indices]['1'], data.iloc[test_indices]['1']

    # Encode the pair as a single sequence, truncated to max_length tokens.
    tokenized_input_seq_pair = tokenizer.encode_plus(premise, hypothesis,
                                                     max_length=max_length,
                                                     return_token_type_ids=True, truncation=True)


    # unsqueeze(0) adds a leading batch dimension of size one.
    input_ids = torch.Tensor(tokenized_input_seq_pair['input_ids']).long().unsqueeze(0)
    # remember bart doesn't have 'token_type_ids', remove the line below if you are using bart.
    token_type_ids = torch.Tensor(tokenized_input_seq_pair['token_type_ids']).long().unsqueeze(0)
    attention_mask = torch.Tensor(tokenized_input_seq_pair['attention_mask']).long().unsqueeze(0)

    outputs = model(input_ids,
                    attention_mask=attention_mask,
                    token_type_ids=token_type_ids,
                    labels=None)
    # Note:
    # "id2label": {
    #     "0": "entailment",
    #     "1": "neutral",
    #     "2": "contradiction"
    # },

    # Softmax over the class dimension, then take the single item of the batch.
    predicted_probability = torch.softmax(outputs[0], dim=1)[0].tolist()  # batch_size only one

    print("Premise:", premise)
    print("Hypothesis:", hypothesis)
    print("Entailment:", predicted_probability[0])
    print("Neutral:", predicted_probability[1])
    print("Contradiction:", predicted_probability[2])


# BEST TWEET

In [None]:
sbert_X = sbert2.encode(data.tweet.values)

In [None]:
# For every category, score each tweet of that category as a "prototype":
# tweets whose cosine similarity to the prototype is >= 0.5 are predicted
# positive, and the resulting MCC is averaged over the folds. The best
# prototypes are then joined into a "supertweet" and scored the same way.
#
# NOTE(review): this cell cannot run against the rest of the notebook as written:
#   * `data.label.unique` is missing its call parentheses;
#   * `mccs` is never initialised;
#   * `sbert` is not defined (only `sbert1` and `sbert2` are);
#   * `id2label` is defined earlier as a callable but is subscripted here;
#   * `data[...][tweet_category]` expects one column per category, while the
#     `label` column built earlier holds descriptive strings — presumably this
#     was written for a different data layout; confirm.
for tweet_category in data.label.unique:
    mccs[tweet_category] = []
    relevant_tweets = data[data.label == tweet_category].tweet.values
    
    for i, tweet in enumerate(relevant_tweets):
        avg_mcc = 0
        hypo = sbert2.encode([tweet])[0]
        # Similarity of this candidate tweet to every tweet in the dataset.
        sim_matrix_tweet = util.pytorch_cos_sim(hypo, sbert_X).numpy()[0]

        for k, fold in enumerate(folds):
            train_indices = data[~data.ids.isin(fold)].index
            test_indices = data[data.ids.isin(fold)].index

            Y_train, Y_test = data.iloc[train_indices][tweet_category], data.iloc[test_indices][tweet_category]

            Y_prob = sim_matrix_tweet[test_indices]
            Y_pred = np.array(Y_prob >= 0.5).astype(int)

            # Folds without any positive prediction contribute 0 to the average.
            if sum(Y_pred) > 0:
                mcc = metrics.matthews_corrcoef(Y_test, Y_pred)
                avg_mcc += mcc

        avg_mcc /= len(folds)
        mccs[tweet_category].append((avg_mcc, i))

    print('For', id2label[int(tweet_category) - 1], ': ')
    # Highest average MCC first; each entry is (avg_mcc, index into relevant_tweets).
    best_mccs = sorted(mccs[tweet_category], key=lambda x: -x[0])
    print(best_mccs[:5])
    for supertweet_size in range(1, 2):
        avg_mcc = 0
        # print('MCC for supertweet_size =', supertweet_size, ': ', end='')
        # [:-1] drops the last character of each selected tweet before joining.
        supertweet = ' and '.join(relevant_tweets[j[1]][:-1].lower() for j in best_mccs[:supertweet_size])
        hypo = sbert.encode([supertweet])[0]
        sim_matrix_tweet = util.pytorch_cos_sim(hypo, sbert_X).numpy()[0]
        
        for k, fold in enumerate(folds):
            train_indices = data[~data.ids.isin(fold)].index
            test_indices = data[data.ids.isin(fold)].index

            Y_train, Y_test = data.iloc[train_indices][tweet_category], data.iloc[test_indices][tweet_category]

            Y_prob = sim_matrix_tweet[test_indices]
            Y_pred = np.array(Y_prob >= 0.5).astype(int)

            # NOTE(review): this guard checks Y_test, whereas the per-tweet loop
            # above checks Y_pred — confirm which condition is intended.
            if sum(Y_test) > 0:
                mcc = metrics.matthews_corrcoef(Y_test, Y_pred)
                avg_mcc += mcc
            
        avg_mcc /= len(folds)
        print('supertweet_size', supertweet_size, avg_mcc)