In [1]:
import torch
import gensim
import pickle
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tqdm import tqdm_notebook as tqdm
from spacy.lang.fr import French
from src.models import SentenceClassifier
from src.dataset import Lexicon, tensorFromSentence, DocumentDataset
from src.util import padSentence, getOffsets, getBestSplit

In [2]:
MAX_LEN = 50
device = torch.device("cuda")
df = pd.read_csv("dataset_10k.csv", low_memory=False)
nlp = French()
sbd = nlp.create_pipe('sentencizer')
nlp.add_pipe(sbd)
embeddings_file = "../frWac_non_lem_no_postag_no_phrase_200_cbow_cut100.bin"
embeddings = gensim.models.KeyedVectors.load_word2vec_format(embeddings_file, binary=True, unicode_errors='ignore')
embeddings_tensor = torch.FloatTensor(embeddings.vectors)
lexicon = Lexicon(embeddings)
model = SentenceClassifier(embeddings_tensor).to(device)
model.load_state_dict(torch.load("models/sentence_clf.pt"))
dataset = DocumentDataset(ds_loc="data/camemBERT_representations_64/")

In [26]:
df.iloc[1]["Facts"] == df.iloc[0]["Facts"]

True

In [28]:
offsets = []
acc = []
pre = []
rec = []
f1 = []
for i in tqdm(range(10360), desc="Gathering offsets"):
    doc = df.iloc[i]["Facts"] + df.iloc[i]["Analyses"]
    doc = doc.replace("- ",". ").replace(";",".").replace("\n"," ").replace(":", ".")
    doc = nlp(doc)
    doc = list(doc.sents)
    doc = [str(sentence) for sentence in doc]
    doc_len = len(doc)
#     print(doc_len)
    doc = [sentence for sentence in doc if len(sentence) > 3]
    sentences = []
    for sentence in doc:
        sentence = tensorFromSentence(lexicon, sentence).to(device)
        sentence = padSentence(sentence)
        output = model(sentence.unsqueeze(0), lengths=torch.tensor([len(sentence)]).to(device))
        output = output.item()
        output = round(output)
        sentences.append(output)
#         print(sentences)
    doc = len(getBestSplit(sentences, gamma=1))
    
    facts = df.iloc[i]["Facts"]
    facts = facts.replace("- ",". ").replace(";",".").replace("\n"," ").replace(":", ".")
    facts = nlp(facts)
    facts = list(facts.sents)
    facts = [str(sentence) for sentence in facts]
    facts = [sentence for sentence in facts if len(sentence) > 3]
    facts = len(facts)
    offsets.append(doc - facts)
    doc = torch.tensor(sentences)
    doc = torch.nn.functional.pad(doc, pad=(0, doc_len-len(doc)))
    facts = torch.ones(facts)
    facts = torch.nn.functional.pad(facts, pad=(0, doc_len-len(facts)))
#     print(doc.shape, facts.shape)
    acc.append(accuracy_score(doc, facts))
    pre.append(precision_score(doc, facts))
    rec.append(recall_score(doc, facts))
    f1.append(f1_score(doc, facts))
#     print("SENTS: {}".format(sentences))

acc = sum(acc)/len(acc)
pre = sum(pre)/len(pre)
rec = sum(rec)/len(rec)
f1 = sum(f1)/len(f1)
print("acc: {}".format(acc))
print("pre: {}".format(pre))
print("rec: {}".format(rec))
print("f1: {}".format(f1))

HBox(children=(IntProgress(value=0, description='Gathering offsets', max=10360, style=ProgressStyle(descriptio…

acc: 0.9259133293995797
pre: 0.919437758302076
rec: 0.9206487344421237
f1: 0.9113609863281781


In [29]:
getOffsets(offsets)

{'<-4': 2810,
 '-4': 135,
 '-3': 92,
 '-2': 127,
 '-1': 95,
 '0': 2291,
 '1': 4573,
 '2': 180,
 '3': 35,
 '4': 6,
 '>4': 16}