In [None]:
import numpy as np
import pandas as pd
import torch

import utils
import embeddings
from experiments import Experiments
from vectors import vectors
from cosine_sim import BiweeklyCosineDissimilarity
from cumulative_cosine_sim import BiweeklyCumulativeDissimilarity
from mmd_calculation import BiweeklyMMD
from change_point_detection import ChangePointDetector

In [None]:
# Path to the raw spreadsheet consumed by the formatting step below.
path_to_dataset = 'fakerecogna_abstrativo.xlsx'

# Format the dataset. The return value is discarded, so this call is used for
# its side effects only — presumably it persists a formatted copy that
# utils.load() reads back; TODO confirm against utils.
utils.FakeRecogna2(path_to_dataset)

# Load the dataset used by the embedding-generation cell.
# NOTE(review): utils.load is called here with no arguments and bound to a
# single name, whereas a later cell calls utils.load(models) and unpacks two
# values — confirm the helper supports both call forms.
dataset = utils.load()

In [None]:
# Generate the document vectors with both embedding models and persist each
# set as a .npy file.
# NOTE(review): the Doc2Vec output is saved as "BERT.npy" and later cells key
# it as 'BERT' — confirm embeddings.Doc2Vec really is the BERT-based encoder.
doc2vec = embeddings.Doc2Vec()
word2vec = embeddings.Word2Vec()

# Slot 0 holds the Doc2Vec vectors, slot 1 the Word2Vec vectors. The list is
# filled directly with the computed arrays; no zero-filled placeholders are
# allocated first, since they would be overwritten immediately.
word_embeddings = []

# Each vector set is saved right after it is computed, so a failure in the
# second model does not discard the first model's file.
# NOTE(review): the save paths use Windows-style separators and are kept
# verbatim so they keep matching whatever location utils.load reads from.
word_embeddings.append(doc2vec.getVector(dataset))
np.save('.\\fakerecogna2\\word_embeddings\\BERT.npy', word_embeddings[0])

word_embeddings.append(word2vec.getVector(dataset))
np.save('.\\fakerecogna2\\word_embeddings\\word2vec.npy', word_embeddings[1])

In [None]:
# Load the dataset together with the embeddings of every model listed in
# `models`. This rebinds both `dataset` and `word_embeddings`; later cells
# index `word_embeddings` by model name (word_embeddings['BERT'],
# word_embeddings['word2vec']).
models = ['BERT','word2vec']
dataset, word_embeddings = utils.load(models)

In [None]:
# Prepare the experiments: keep only the rows dated within 2020-2021
# (both bounds inclusive) and hand the filtered frame to Experiments.
# NOTE(review): `word_embeddings` is not filtered here — confirm that
# Experiments aligns the embedding rows with the filtered dataset itself.
in_study_period = dataset.date.between('2020-01-01', '2021-12-31')
dataset = dataset.loc[in_study_period]

exp = Experiments(dataset, size=40, N=5)

In [None]:
# Tests on chronological data, one run per embedding model.
# The results are stored under dedicated names because the random-data cell
# rebinds `results_BERT` / `results_word2vec`; without them the chronological
# results would be lost as soon as that cell runs.
results_BERT_chronological = exp.different_distribution(word_embeddings['BERT'])
results_word2vec_chronological = exp.different_distribution(word_embeddings['word2vec'])

# Original names kept as aliases so existing code that reads them still works.
results_BERT = results_BERT_chronological
results_word2vec = results_word2vec_chronological

In [None]:
# Tests on random data, one run per embedding model.
# The results are stored under dedicated names so they can be told apart from
# the chronological-data results, which use the same `results_BERT` /
# `results_word2vec` names.
results_BERT_random = exp.same_distribution_test(word_embeddings['BERT'])
results_word2vec_random = exp.same_distribution_test(word_embeddings['word2vec'])

# Original names kept as aliases so existing code that reads them still works.
results_BERT = results_BERT_random
results_word2vec = results_word2vec_random

In [None]:
# WMD on chronological data.
# Builds a long-format table with columns ['label', 'week', 'mwmd'] from the
# array returned by exp.WMD().
WMD = exp.WMD()
weeks = pd.date_range('2020-01-01','2021-12-31', freq='2W').to_pydatetime()

wmd_columns = ['label', 'week', 'mwmd']
wmd_frames = []

# WMD[0] rows are tagged label=True and WMD[1] rows label=False. All True rows
# are emitted first, then all False rows, each in increasing order of i.
for group_idx, group_label in ((0, True), (1, False)):
    for i in range(1, WMD.shape[1]):
        # Average WMD[group][i] over its axis 1, then over axis 2 of the
        # reduced array, and keep row i-1 of the result.
        temp = WMD[group_idx][i].mean(axis=1).mean(axis=2)[i-1,:].T
        wmd_frames.append(
            pd.DataFrame({'mwmd':temp, 'label':group_label, 'week':weeks[i-1]})
        )

# Concatenate once instead of growing the frame inside the loop: this avoids
# the quadratic copying, and the columns keep their native dtypes because no
# empty object-dtype seed frame takes part in the concat.
if wmd_frames:
    dt_wmd_chronological = pd.concat(wmd_frames, ignore_index=True)[wmd_columns]
else:
    # Nothing to aggregate (WMD.shape[1] <= 1): fall back to an empty table.
    dt_wmd_chronological = pd.DataFrame(columns=wmd_columns)

# `dt_wmd` is rebound by the random-data WMD cell; the dedicated name above
# keeps the chronological table available afterwards.
dt_wmd = dt_wmd_chronological

In [None]:
# WMD on random data.
# Builds a long-format table with columns ['label', 'week', 'mwmd'] from the
# array returned by exp.same_distribution_WMD().
WMD = exp.same_distribution_WMD()
weeks = pd.date_range('2020-01-01','2021-12-31', freq='2W').to_pydatetime()

wmd_columns = ['label', 'week', 'mwmd']
wmd_frames = []

# WMD[0] rows are tagged label=True and WMD[1] rows label=False. All True rows
# are emitted first, then all False rows, each in increasing order of i.
for group_idx, group_label in ((0, True), (1, False)):
    for i in range(1, WMD.shape[1]):
        # Average WMD[group][i] over its axis 1, then over axis 2 of the
        # reduced array, and keep row i-1 of the result.
        temp = WMD[group_idx][i].mean(axis=1).mean(axis=2)[i-1,:].T
        wmd_frames.append(
            pd.DataFrame({'mwmd':temp, 'label':group_label, 'week':weeks[i-1]})
        )

# Concatenate once instead of growing the frame inside the loop: this avoids
# the quadratic copying, and the columns keep their native dtypes because no
# empty object-dtype seed frame takes part in the concat.
if wmd_frames:
    dt_wmd_random = pd.concat(wmd_frames, ignore_index=True)[wmd_columns]
else:
    # Nothing to aggregate (WMD.shape[1] <= 1): fall back to an empty table.
    dt_wmd_random = pd.DataFrame(columns=wmd_columns)

# `dt_wmd` is also the name used by the chronological WMD cell; the dedicated
# name above keeps the random-data table distinguishable from that one.
dt_wmd = dt_wmd_random

In [None]:
# Build the embedding wrapper around the Portuguese BERT-large checkpoint.
# The device is chosen at run time so the cell is not tied to a GPU machine:
# CUDA is used when available, otherwise the model runs on the CPU.
embedding_device = "cuda" if torch.cuda.is_available() else "cpu"

emb = vectors(
    model_name="neuralmind/bert-large-portuguese-cased",
    model_kwargs={"device": embedding_device},
    # Embeddings are left un-normalized.
    encode_kwargs={"normalize_embeddings": False},
)

# Embed the "text" column of the dataset. The resulting `df` is what the
# analysis cells below consume (they read its "date" and "embeddings" columns).
# NOTE(review): presumably attach() returns the dataset with an added
# "embeddings" column — confirm against the vectors module.
df = emb.attach(dataset, text_col="text")

In [None]:
# Cosine dissimilarity, computed from the "embeddings" column of `df` with a
# "2W" frequency on the "date" column.
cos_params = {
    "df": df,
    "date_col": "date",
    "emb_col": "embeddings",
    "freq": "2W",
}
cos = BiweeklyCosineDissimilarity(**cos_params)

# Left as a bare last expression so the notebook displays whatever
# compute() returns.
cos.compute()

In [None]:
# Cumulative cosine dissimilarity, computed from the "embeddings" column of
# `df` with a "2W" frequency on the "date" column.
cum_cos_params = {
    "df": df,
    "date_col": "date",
    "emb_col": "embeddings",
    "freq": "2W",
}
cum_cos = BiweeklyCumulativeDissimilarity(**cum_cos_params)

# Left as a bare last expression so the notebook displays whatever
# compute() returns.
cum_cos.compute()

In [None]:
# Change point detection on the "embeddings" column of `df`, configured with a
# daily ("D") frequency, the "rbf" model and the "binseg" algorithm.
# NOTE(review): this cell only constructs the detector; unlike the other
# analysis cells, no method is called on it here — confirm whether detection
# runs inside the constructor or a call is missing.
cpd_params = dict(
    df=df,
    date_col="date",
    emb_col="embeddings",
    freq="D",
    model="rbf",
    algo="binseg",
)
cpd = ChangePointDetector(**cpd_params)

In [None]:
# MMD, computed from the "embeddings" column of `df` with a "2W" frequency on
# the "date" column, using the "pytorch" backend and p_val=0.05.
mmd_params = {
    "df": df,
    "date_col": "date",
    "emb_col": "embeddings",
    "freq": "2W",
    "backend": "pytorch",
    "p_val": 0.05,
}
mmd = BiweeklyMMD(**mmd_params)

# Left as a bare last expression so the notebook displays whatever
# compute() returns.
mmd.compute()