In [1]:
import pandas as pd
from os.path import join
from collections import Counter
from pathlib import Path
import random
from collections import defaultdict
from itertools import chain, groupby
from typing import Any, List, Optional, Union
import joblib
import numpy as np
import torch
from torch.utils.data import DataLoader
import lightgbm as lgb
from multiprocessing import Pool
from multiprocessing.pool import ThreadPool

from time import sleep

import sys
import json
from glob import glob
import matplotlib.pyplot as plt
import scikitplot as skplt
from sklearn.manifold import TSNE
from matplotlib import pyplot as plt
from tqdm.notebook import tqdm
StrOrPath = Union[Path, str]

ModuleNotFoundError: No module named 'scikitplot'

In [None]:
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
import pandas as pd
from glob import glob
import multiprocessing as mp
from multiprocessing.pool import ThreadPool

In [None]:
def read_file(file):
    """Load a single gzip-compressed CSV file.

    Args:
        file: Path (or file-like object) accepted by ``pd.read_csv``.

    Returns:
        pd.DataFrame: The parsed contents of ``file``.
    """
    frame = pd.read_csv(file, compression='gzip')
    return frame

def get_labeled_preds():
    """Load every labeled-sentence shard under ``Format_sents`` into one frame.

    Matches ``Format_sents/labeled_sent_*.csv`` and hands the matching paths
    to ``read_dfs_parallel``.

    Returns:
        pd.DataFrame: Whatever ``read_dfs_parallel`` returns for the matched
        files (the concatenation of all shards).
    """
    # glob yields paths in arbitrary filesystem order; sort for a stable file list.
    all_paths = sorted(glob(join('Format_sents', 'labeled_sent_*.csv')))
    return read_dfs_parallel(files = all_paths)


def read_dfs_sequence(all_paths, max_files=4):
    """Read gzip-compressed CSV files one after another and concatenate them.

    Args:
        all_paths: Iterable of paths to gzip-compressed CSV files.
        max_files: Stop after this many files have been read. The default of
            4 keeps the cap that was hard-coded here before; pass ``None`` to
            read every path.

    Returns:
        pd.DataFrame: Row-wise concatenation of the files read, with a fresh
        0..n-1 index.

    Raises:
        ValueError: Raised by ``pd.concat`` when no file was read.
    """
    # The accumulator must be local: it was previously referenced without
    # ever being defined, so every call failed with NameError.
    all_data = []
    for pth in tqdm(all_paths):
        all_data.append(read_file(pth))
        if max_files is not None and len(all_data) >= max_files:
            break
    return pd.concat(all_data, ignore_index = True)
def read_dfs_parallel(files, n_threads=20):
    """Read gzip-compressed CSV files with a thread pool and concatenate them.

    Args:
        files: Iterable of paths; each one is passed to ``read_file``.
        n_threads: Number of worker threads in the pool (default 20, the
            value that was previously hard-coded).

    Returns:
        pd.DataFrame: Row-wise concatenation of all files, in the order of
        ``files``, with a fresh 0..n-1 index.

    Raises:
        ValueError: Raised by ``pd.concat`` when ``files`` is empty.
    """
    files = list(files)  # allow generators; tqdm needs len() for the total
    with ThreadPool(n_threads) as pool:
        # imap (unlike imap_unordered) yields results in input order, so the
        # concatenated row order is reproducible between runs, while still
        # being lazy enough for tqdm to show progress as files finish.
        frames = list(tqdm(pool.imap(read_file, files), total=len(files)))
    return pd.concat(frames, ignore_index = True)
def get_origin_preds_index():
    """Build the table linking PREDICATION_ID to ORIGIN_ID and PREDICATION_AUX_ID.

    Reads ``all_triples/origin_predication.csv`` and
    ``PREDICATION_AUX/all_combo_index_v2.csv`` (both gzip-compressed) and
    inner-joins them on ``PREDICATION_ID``.

    Returns:
        pd.DataFrame: Columns ``PREDICATION_ID``, ``ORIGIN_ID``,
        ``PREDICATION_AUX_ID``; only PREDICATION_IDs present in both files
        are kept.
    """
    origin_cols = ['ORIGIN_ID', 'PREDICATION_ID']
    aux_cols = ['PREDICATION_AUX_ID', 'PREDICATION_ID']
    # usecols makes the parser skip every other column instead of loading it
    # and throwing it away. usecols keeps the file's column order, so the
    # explicit [cols] selection is retained to pin the order used below.
    df_1 = pd.read_csv(join('all_triples','origin_predication.csv'),
                       compression = 'gzip', usecols = origin_cols)[origin_cols]
    df_2 = pd.read_csv(join('PREDICATION_AUX','all_combo_index_v2.csv'),
                       compression = 'gzip', usecols = aux_cols)[aux_cols]
    return df_1.set_index('PREDICATION_ID').join(df_2.set_index('PREDICATION_ID'), how='inner').reset_index()

def get_origin_triples_labels():
    """Attach ORIGIN_ID / PREDICATION_ID to every labeled PREDICATION_AUX row.

    Loads ``all_triples/predications_aux_factuality.csv`` (gzip), fetches the
    ID map from ``get_origin_preds_index`` and inner-joins the two on
    ``PREDICATION_AUX_ID``. Progress messages are printed before each step.

    Returns:
        pd.DataFrame: The joined table with ``PREDICATION_AUX_ID`` restored
        as a regular column.
    """
    join_key = 'PREDICATION_AUX_ID'

    print('read predications factuality preds')
    factuality = pd.read_csv(
        join('all_triples','predications_aux_factuality.csv'),
        compression = 'gzip',
    )

    print('read triples to predications ID map')
    id_map = get_origin_preds_index()

    print('join and return')
    labeled = factuality.set_index(join_key)
    joined = labeled.join(id_map.set_index(join_key), how='inner')
    return joined.reset_index()
    
    

## Collect All Predication Labels into One File

In [3]:
sents = get_labeled_preds()

KeyboardInterrupt: 

In [5]:
sents

Unnamed: 0,PREDICATION_AUX_ID,label
0,122252909,Fact
1,122252982,Fact
2,122253198,Fact
3,122253344,Fact
4,122253423,Fact
...,...,...
113805272,178428481,Fact
113805273,178428482,Fact
113805274,178428483,Fact
113805275,178428484,Fact


In [6]:
sents.head()

Unnamed: 0,PREDICATION_AUX_ID,label
0,122252909,Fact
1,122252982,Fact
2,122253198,Fact
3,122253344,Fact
4,122253423,Fact


In [7]:
sents.to_csv(join('all_triples', 'predications_aux_factuality.csv'), index = False, compression = 'gzip')

## Origin Factuality Extraction

In [12]:
sents = get_origin_triples_labels()

read predications factuality preds
read triples to predications ID map
join and return


In [13]:
sents.head()

Unnamed: 0,PREDICATION_AUX_ID,label,PREDICATION_ID,ORIGIN_ID
0,122252909,Fact,122252917,556236
1,122252982,Fact,122252999,2023646
2,122253198,Fact,122253217,2023646
3,122253344,Fact,122253360,2273984
4,122253423,Fact,122253441,55837


In [14]:
sents['ORIGIN_ID'].unique().shape

(23601734,)

In [15]:
sents['PREDICATION_ID'].unique().shape

(113805277,)

In [16]:
sents[['label','ORIGIN_ID']].to_csv(join('all_triples', 'origin_factuality.csv'), index = False, compression = 'gzip')


## Origin Factuality Count

In [2]:
def get_evidence_count(or_df):
    """Count, per ORIGIN_ID, how many rows carry each factuality label.

    Args:
        or_df: DataFrame with at least the columns ``ORIGIN_ID`` and
            ``label``, one row per piece of evidence. It is not modified.

    Returns:
        pd.DataFrame: One row per ORIGIN_ID with the column ``ORIGIN_ID``,
        one integer count column per distinct label (sorted by label name,
        0 where a label never occurs for that origin) and a ``total`` column
        holding the sum of the label counts.
    """
    # size() counts rows per (ORIGIN_ID, label) directly, so no helper
    # column has to be written into the caller's frame and unrelated
    # columns are never summed along the way.
    counts = (
        or_df.groupby(['ORIGIN_ID', 'label'])
        .size()
        .unstack('label', fill_value=0)
    )
    counts['total'] = counts.sum(axis=1)
    # Drop the leftover 'label' name on the column axis so the result is a
    # plain flat table.
    return counts.reset_index().rename_axis(None, axis=1)

def get_origin_proba_fact_count():
    """Join the per-origin label counts with the triple probabilities.

    Reads ``all_triples/triples_probabilities.csv`` and
    ``all_triples/origin_fact_count.csv`` (both gzip) and inner-joins them on
    ``ORIGIN_ID``, with the count columns on the left. Progress messages are
    printed before each step.

    Returns:
        pd.DataFrame: The joined table with ``ORIGIN_ID`` as a regular column.
    """
    data_dir = 'all_triples'
    key = 'ORIGIN_ID'

    print('Origin probas')
    triple_probas = pd.read_csv(
        join(data_dir, 'triples_probabilities.csv'), compression = 'gzip')

    print('Canonized triples')
    fact_counts = pd.read_csv(
        join(data_dir, 'origin_fact_count.csv'), compression = 'gzip')

    print('Join and Return')
    counts_by_origin = fact_counts.set_index(key)
    probas_by_origin = triple_probas.set_index(key)
    merged = counts_by_origin.join(probas_by_origin, how='inner')
    return merged.reset_index()

def get_fact_percentage():
    """Turn the per-label counts into percentages of each origin's total.

    Reads ``all_triples/origin_probabilities_with_count.csv`` (gzip). For each
    of the six label columns a ``<label>%`` column holding
    ``count / total * 100`` is appended and the raw count column is removed.

    Returns:
        pd.DataFrame: The table with percentage columns, passed through
        ``round(2)`` (which rounds every numeric column, not only the
        percentages).
    """
    label_names = ('Counterfact', 'Doubtful', 'Fact',
                   'Possible', 'Probable', 'Uncommitted')
    table = pd.read_csv(
        join('all_triples', 'origin_probabilities_with_count.csv'),
        compression = 'gzip',
    )
    for name in label_names:
        # pop() removes the raw count column and hands it back for the ratio.
        raw_counts = table.pop(name)
        table[name + '%'] = raw_counts / table['total'] * 100
    return table.round(2)

In [33]:
triple_facts = pd.read_csv(join('all_triples', 'origin_factuality.csv'), compression = 'gzip')

In [34]:
triple_facts.head()

Unnamed: 0,label,ORIGIN_ID
0,Fact,556236
1,Fact,2023646
2,Fact,2023646
3,Fact,2273984
4,Fact,55837


In [35]:
triple_facts.shape

(113805277, 2)

In [38]:
origin_count = get_evidence_count(or_df = triple_facts.copy())

In [39]:
origin_count.to_csv(join('all_triples', 'origin_fact_count.csv'), index = False, compression = 'gzip')

In [40]:
origin_count

Unnamed: 0,ORIGIN_ID,Counterfact,Doubtful,Fact,Possible,Probable,Uncommitted,total
0,0,0,0,1,0,0,0,1
1,1,0,0,2,0,0,0,2
2,2,0,0,7,0,0,0,7
3,3,0,0,4,0,0,0,4
4,4,71,2,53226,31,67,487,53884
...,...,...,...,...,...,...,...,...
23601729,23857432,0,0,1,0,0,0,1
23601730,23857433,0,0,1,0,0,0,1
23601731,23857434,0,0,1,0,0,0,1
23601732,23857435,0,0,1,0,0,0,1


In [42]:
# del origin_count

In [46]:
probas = get_origin_proba_fact_count()

Origin probas
Canonized triples
Join and Return


In [48]:
probas

Unnamed: 0,ORIGIN_ID,Counterfact,Doubtful,Fact,Possible,Probable,Uncommitted,total,PREDICATE,SUBJECT_CUI,OBJECT_CUI,label_proba
0,0,0,0,1,0,0,0,1,PROCESS_OF,C0003725,C0999630,1.000000
1,1,0,0,2,0,0,0,2,ISA,C0039258,C0446169,1.000000
2,2,0,0,7,0,0,0,7,ISA,C0318627,C0206590,1.000000
3,3,0,0,4,0,0,0,4,ISA,C0446169,C0003725,1.000000
4,4,71,2,53226,31,67,487,53884,PROCESS_OF,C0012634,C0020114,0.989018
...,...,...,...,...,...,...,...,...,...,...,...,...
28416912,23857434,0,0,1,0,0,0,1,COEXISTS_WITH,C1413909,C1413914,1.000000
28416913,23857435,0,0,1,0,0,0,1,PROCESS_OF,C4023614,C1413909,1.000000
28416914,23857435,0,0,1,0,0,0,1,PROCESS_OF,C4023614,C1413909,1.000000
28416915,23857436,0,0,1,0,0,0,1,PROCESS_OF,C4023614,C1413914,1.000000


In [49]:
probas.to_csv(join('all_triples', 'origin_probabilities_with_count.csv'), index = False, compression = 'gzip')

In [50]:
del probas

In [3]:
probas = get_fact_percentage()

In [4]:
probas.to_csv(join('all_triples', 'origin_probabilities_with_perc.csv'), index = False, compression = 'gzip')

## Origin Factuality Aggregation

In [15]:
def gen_proba(origin_facts_df):
    """Average a numeric factuality score per ORIGIN_ID.

    Each row's ``label`` is mapped to a score (Fact=1, Probable=0.75,
    Possible=0.5, Doubtful=0.25, Counterfact/Uncommitted/Conditional=0) and
    the scores are averaged within each ``ORIGIN_ID``.

    Args:
        origin_facts_df: DataFrame with at least the columns ``label`` and
            ``ORIGIN_ID``. It is not modified.

    Returns:
        pd.DataFrame: Indexed by ``ORIGIN_ID`` with a single float column
        ``label_proba``. Labels missing from the score table map to NaN and
        are ignored by the mean.
    """
    fact_proba_scores = {'Fact':1,'Probable':0.75,
                   'Possible':0.5,   'Doubtful':0.25,
                   'Counterfact':0, 'Uncommitted':0, 'Conditional':0}
    # Keep the scores in a standalone Series rather than writing a new
    # column into the caller's frame.
    scores = origin_facts_df['label'].map(fact_proba_scores).astype(float)
    # The built-in grouped mean gives the same result as a per-group Python
    # lambda, without one Python call per ORIGIN_ID.
    return (
        scores.groupby(origin_facts_df['ORIGIN_ID'])
        .mean()
        .to_frame('label_proba')
    )


In [7]:
sents = pd.read_csv(join('all_triples', 'origin_factuality.csv'), compression = 'gzip')

In [8]:
sents.head()

Unnamed: 0,label,ORIGIN_ID
0,Fact,556236
1,Fact,2023646
2,Fact,2023646
3,Fact,2273984
4,Fact,55837


In [9]:
sents.shape

(113805277, 2)

In [12]:
sents['label'].unique()

array(['Fact', 'Uncommitted', 'Possible', 'Counterfact', 'Probable',
       'Doubtful'], dtype=object)

In [16]:
probas = gen_proba(origin_facts_df = sents)

In [23]:
probas.head()

Unnamed: 0_level_0,label_proba
ORIGIN_ID,Unnamed: 1_level_1
0,1.0
1,1.0
2,1.0
3,1.0
4,0.989018


In [18]:
probas['label_proba'].min()

0.0

In [19]:
probas['label_proba'].max()

1.0

In [20]:
probas['label_proba'].mean()

0.9110480193979422

In [22]:
np.sort(probas['label_proba'].unique())

array([0.        , 0.00342466, 0.00884956, ..., 0.99986552, 0.99986821,
       1.        ])

In [25]:
probas.reset_index().to_csv(join('all_triples', 'origin_probabilities.csv'), index = False, compression = 'gzip')

## Triple Probabilities

In [31]:
def get_triple_probas():
    """Attach each origin's aggregated probability to the canonized triples.

    Reads ``all_triples/origin_probabilities.csv`` and
    ``all_triples/all_data_triples_can.csv`` (both gzip) and inner-joins them
    on ``ORIGIN_ID``, with the triple columns on the left. Progress messages
    are printed before each step.

    Returns:
        pd.DataFrame: The joined table with ``ORIGIN_ID`` as a regular column.
    """
    data_dir = 'all_triples'
    key = 'ORIGIN_ID'

    print('Origin probas')
    origin_probas = pd.read_csv(
        join(data_dir, 'origin_probabilities.csv'), compression = 'gzip')

    print('Canonized triples')
    triples = pd.read_csv(
        join(data_dir, 'all_data_triples_can.csv'), compression = 'gzip')

    print('Join and Return')
    triples_by_origin = triples.set_index(key)
    probas_by_origin = origin_probas.set_index(key)
    merged = triples_by_origin.join(probas_by_origin, how='inner')
    return merged.reset_index()


In [32]:
probas = get_triple_probas()

Origin probas
Canonized triples
Join and Return


In [33]:
probas.head()

Unnamed: 0,ORIGIN_ID,PREDICATE,SUBJECT_CUI,OBJECT_CUI,label_proba
0,0,PROCESS_OF,C0003725,C0999630,1.0
1,1,ISA,C0039258,C0446169,1.0
2,2,ISA,C0318627,C0206590,1.0
3,3,ISA,C0446169,C0003725,1.0
4,4,PROCESS_OF,C0012634,C0020114,0.989018


In [36]:
np.sort(probas['label_proba'].unique())

array([0.        , 0.00342466, 0.00884956, ..., 0.99986552, 0.99986821,
       1.        ])

In [37]:
probas.to_csv(join('all_triples', 'triples_probabilities.csv'), index = False, compression = 'gzip')

## Tests

In [3]:
sents = pd.read_csv(join('all_triples','sentence_index.csv'), compression = 'gzip')

In [4]:
sents.head()

Unnamed: 0,f_name,SENTENCE_ID
0,split_00,6
1,split_00,7
2,split_00,8
3,split_00,9
4,split_00,10


In [5]:
sents = pd.read_csv(join('all_triples','origin_predication.csv'), compression = 'gzip')

In [6]:
sents.head()

Unnamed: 0,ORIGIN_ID,PREDICATION_ID,SENTENCE_ID
0,1775772,76299252,10378470
1,1302567,74851732,24206336
2,1737760,76177799,26954710
3,380736,72444077,19247628
4,29711,15350637,84375


In [7]:
sents = pd.read_csv(join('PREDICATION_AUX','all_combo_index_v2.csv'), compression = 'gzip')

In [8]:
sents.head()

Unnamed: 0,PREDICATION_AUX_ID,PREDICATION_ID,SENTENCE_ID,file_name
0,10592600,10592604,16,SENTENCE/split_00.csv.gz
1,10592679,10592697,17,SENTENCE/split_00.csv.gz
2,10592713,10592728,17,SENTENCE/split_00.csv.gz
3,10592749,10592759,17,SENTENCE/split_00.csv.gz
4,10592816,10592832,18,SENTENCE/split_00.csv.gz


In [4]:
sents = pd.read_csv(join('all_triples','all_data_triples_can.csv'), compression = 'gzip')

In [5]:
sents.head()

Unnamed: 0,PREDICATE,ORIGIN_ID,SUBJECT_CUI,OBJECT_CUI
0,PROCESS_OF,0,C0003725,C0999630
1,ISA,1,C0039258,C0446169
2,ISA,2,C0318627,C0206590
3,ISA,3,C0446169,C0003725
4,PROCESS_OF,4,C0012634,C0020114
