# Helper functions and imports

## IO Utils

In [None]:
"""
Utility functions for I/O operations (filesystem, JSON, pickle, CSV and HTTP).
"""

import json
import os
import pickle
import csv
import requests


def path_exits(path):
    """Tell whether ``path`` refers to an existing filesystem entry."""
    exists = os.path.exists(path)
    return exists


def mkdir(path):
    """Create the single directory ``path`` unless something already exists there.

    ``os.mkdir`` is not recursive, so the parent directory must already exist.
    An entry that is already present at ``path`` is left untouched.
    """
    # EAFP: attempt the creation and tolerate "already exists" instead of
    # checking first, so a directory created by someone else between the check
    # and the call no longer makes this function raise.
    try:
        os.mkdir(path)
    except FileExistsError:
        pass


def makedirs(path):
    """Create ``path`` and any missing parent directories.

    Nothing happens when an entry already exists at ``path``.
    """
    # EAFP: attempt the creation and tolerate "already exists" instead of
    # checking first, which removes the check-then-create race.
    try:
        os.makedirs(path)
    except FileExistsError:
        pass


def list_files_in_dir(dir):
    """Return the names (not full paths) of the files located directly in ``dir``."""
    file_names = []
    for entry in os.listdir(dir):
        if is_file(join(dir, entry)):
            file_names.append(entry)
    return file_names


def list_directories(dir):
    """Return the names (not full paths) of the sub-directories located directly in ``dir``."""
    directory_names = []
    for entry in os.listdir(dir):
        if os.path.isdir(join(dir, entry)):
            directory_names.append(entry)
    return directory_names


def is_file(path):
    """Tell whether ``path`` is an existing regular file."""
    result = os.path.isfile(path)
    return result


def is_dir(path):
    """Tell whether ``path`` is an existing directory."""
    result = os.path.isdir(path)
    return result


def join(path1, path2, *paths):
    """Join two or more path components with the platform separator.

    Thin wrapper around ``os.path.join``. The two-argument form behaves
    exactly as before; any further components are appended in order, so
    callers no longer need to pre-assemble nested parts by hand.
    """
    return os.path.join(path1, path2, *paths)


def write_json(path, dict):
    """Serialize ``dict`` as JSON (2-space indentation) into the file at ``path``.

    Any existing file is overwritten. The parameter keeps its original name
    for keyword-call compatibility even though it shadows the builtin.
    """
    # The with-block closes the file; no explicit close() is required.
    # JSON text is UTF-8 by specification, so do not rely on the locale default.
    with open(path, 'w', encoding='utf-8') as outfile:
        json.dump(dict, outfile, indent=2)


def read_json(path):
    """Load and return the JSON document stored in the file at ``path``."""
    # Decode as UTF-8 explicitly so the result does not depend on the locale;
    # the with-block closes the file, so no explicit close() is required.
    with open(path, "r", encoding="utf-8") as infile:
        return json.load(infile)


def read_file_into_list(input_file):
    """Return the lines of ``input_file`` with surrounding whitespace stripped.

    Blank lines are kept as empty strings.
    """
    # Iterate the file object directly instead of materializing readlines();
    # the with-block closes the file, so no explicit close() is required.
    with open(input_file, "r") as infile_fp:
        return [line.strip() for line in infile_fp]


def write_list_to_file(output_file, list):
    """Write every string in ``list`` to ``output_file``, each terminated by CRLF.

    Any existing file is overwritten. The parameter keeps its original name
    for keyword-call compatibility even though it shadows the builtin.
    """
    # newline="" disables newline translation so the explicit "\r\n" is written
    # verbatim on every platform; with the default translation the "\n" would
    # be expanded again on Windows, producing "\r\r\n".
    with open(output_file, "w", newline="") as outfile_fp:
        outfile_fp.writelines(line + "\r\n" for line in list)


def write_text_to_file(output_file, text):
    """Write ``text`` to ``output_file``, replacing any existing content."""
    # The with-block closes the file; the former explicit close() was redundant.
    with open(output_file, "w") as output_fp:
        output_fp.write(text)


def write_pickle(data, file_path):
    """Pickle ``data`` into the file at ``file_path``, overwriting it."""
    # Use a context manager so the handle is flushed and closed
    # deterministically instead of being left to the garbage collector.
    with open(file_path, "wb") as pickle_fp:
        pickle.dump(data, pickle_fp)


def read_pickle(file_path):
    """Load and return the object pickled in the file at ``file_path``.

    NOTE(security): unpickling can execute arbitrary code; only call this on
    files from a trusted source.
    """
    # Use a context manager so the handle is closed deterministically.
    with open(file_path, 'rb') as pickle_fp:
        return pickle.load(pickle_fp)


def write_to_csv(filepath, header, rows, delimiter=','):
    """Write ``header`` followed by ``rows`` to a UTF-8 CSV file at ``filepath``.

    ``header`` is a single sequence of column names, ``rows`` an iterable of
    row sequences, and ``delimiter`` the field separator.
    """
    # newline='' lets the csv module control line endings itself; the
    # with-block closes the file, so no explicit close() is required.
    with open(filepath, 'w', encoding='UTF8', newline='') as output_fp:
        writer = csv.writer(output_fp, delimiter=delimiter)
        writer.writerow(header)
        writer.writerows(rows)

def get_request_content(end_point, params):
    """Send a GET request to ``end_point`` with ``params`` and return the decoded JSON body."""
    response = requests.get(end_point, params=params)
    payload = response.json()
    return payload


def download_file(url, download_path):
    """Stream the resource at ``url`` into the file at ``download_path``.

    Redirects are followed and a browser-like User-Agent header is sent.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status, so an
            error page is never saved as if it were the requested file.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:55.0) Gecko/20100101 Firefox/55.0',
    }
    # With stream=True the connection stays open until the response is closed,
    # so hold it in a context manager.
    with requests.get(url, stream=True, allow_redirects=True, headers=headers) as response:
        response.raise_for_status()
        with open(download_path, 'wb') as f:
            # Iterating the response object directly yields 128-byte pieces;
            # request larger chunks to avoid thousands of tiny writes.
            for chunk in response.iter_content(chunk_size=64 * 1024):
                f.write(chunk)

def write_dict_to_csv(filepath, fields, dict):
    """Write an iterable of row dictionaries to a CSV file with a header row.

    ``fields`` gives the column names and their order. ``dict`` is the
    iterable of row mappings passed to ``DictWriter.writerows``; the parameter
    keeps its original name for keyword-call compatibility even though it
    shadows the builtin.
    """
    # newline='' is required by the csv module; without it every row is
    # followed by an empty line on platforms that translate "\n" to "\r\n".
    # The UTF-8 encoding matches write_to_csv.
    with open(filepath, 'w', encoding='UTF8', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fields)
        writer.writeheader()
        writer.writerows(dict)

# Input and output information
Include any paths required for input or output

In [None]:
# Root directory of a local checkout of the multivers repository:
# https://github.com/dwadden/multivers
# The notebook later changes into this directory before installing the
# requirements and running the predictions.
# TODO: set the correct path
multivers_root_dir = "/path/to/multivers"

## Manually Collected papers

### Interviews

In [None]:
# Base directory of the interviews data inside a local copy of nkd_llm_2024.
# /path/to/nkd_llm_2024/: https://anonymous.4open.science/r/nkd_llm_2024-07E6
# TODO: set the correct path
interviews_papers_base_dir = "/path/to/nkd_llm_2024/data/interviews"

In [None]:
# Root of the interview claims; traversed below as <gpt_model>/<prompt_version>/<run>.
interviews_claims_base_path = join(interviews_papers_base_dir, 'claims')

In [None]:
# Directory holding the interview corpus files (the run below reads 'difference.jsonl' from it).
interviews_corpus_dir = join(interviews_papers_base_dir, 'corpus')

In [None]:
# Root directory for the interview predictions; created up front if it is missing.
interviews_predictions_base_path = join(interviews_papers_base_dir, 'predictions')
makedirs(interviews_predictions_base_path)

### Surveys

In [None]:
# Base directory of the surveys data inside a local copy of nkd_llm_2024.
# /path/to/nkd_llm_2024/: https://anonymous.4open.science/r/nkd_llm_2024-07E6
# TODO: set the correct path
surveys_papers_base_dir = "/path/to/nkd_llm_2024/data/surveys"

In [None]:
# Root of the survey claims; traversed below as <gpt_model>/<prompt_version>/<run>.
surveys_claims_base_path = join(surveys_papers_base_dir, 'claims')

In [None]:
# Directory holding the survey corpus files (the run below reads 'difference.jsonl' from it).
surveys_corpus_dir = join(surveys_papers_base_dir, 'corpus')

In [None]:
# Root directory for the survey predictions; created up front if it is missing.
surveys_predictions_base_path = join(surveys_papers_base_dir, 'predictions')
makedirs(surveys_predictions_base_path)

# The prediction commands

## Install the libraries
This needs to be done every time we reconnect, because the installed libraries are lost after a disconnect.

In [None]:
# Install the Python 3.8 interpreter with apt.
!apt-get install python3.8

In [None]:
# Install the distutils package for Python 3.8.
!apt-get install python3.8-distutils

In [None]:
# Register /usr/bin/python3.8 as an alternative for /usr/bin/python3 (priority 1).
!sudo update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 1

In [None]:
# Choose which registered alternative the `python3` command resolves to.
!update-alternatives --config python3

In [None]:
# Print the version that `python3` now resolves to.
!python3 --version

In [None]:
# Install pip for python3 with apt.
!sudo apt install python3-pip

In [None]:
# Change into the multivers checkout; the commands that follow use paths
# relative to it (requirements.txt, multivers/predict.py, checkpoints/).
%cd $multivers_root_dir

/content/drive/MyDrive/CSED/PhD/Implementation/Scientific_Articles_KD/fact_checking/multivers


In [None]:
# Install the packages listed in requirements.txt, with pip's network timeout raised to 100.
!pip install -r requirements.txt --default-timeout=100

Collecting Cython==0.29.21
  Downloading Cython-0.29.21-cp38-cp38-manylinux1_x86_64.whl (1.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m30.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets==1.1.3
  Downloading datasets-1.1.3-py3-none-any.whl (153 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.7/153.7 KB[0m [31m23.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting numpy==1.19.4
  Downloading numpy-1.19.4-cp38-cp38-manylinux2010_x86_64.whl (14.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.5/14.5 MB[0m [31m71.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pandas==1.1.5
  Downloading pandas-1.1.5-cp38-cp38-manylinux1_x86_64.whl (9.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.3/9.3 MB[0m [31m100.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pytorch-lightning==1.2.1
  Downloading pytorch_lightning-1.2.1-py3-none-any.whl (814 kB)
[2K     [90m━━━━━━━━━━

## Predictions

### Run

In [None]:
def split_corpus(corpus_file_path, splited_corpus_path, max_words=500):
    """Split every corpus document into chunks of roughly ``max_words`` words.

    The corpus is read as JSONL: one object per line with the keys ``doc_id``,
    ``title`` and ``abstract``, where ``abstract`` is a list of sentence
    strings. Sentences are accumulated until the running word count reaches
    ``max_words`` or the last sentence is reached; each chunk is then written
    to ``splited_corpus_path`` as its own JSONL document that keeps the title
    and gets the id ``doc_id * 10 + chunk_index``.

    Args:
        corpus_file_path: path of the JSONL corpus to read.
        splited_corpus_path: path of the JSONL file to write the chunks to.
        max_words: word count at which the current chunk is closed.

    Returns:
        A pair ``(doc_ids__splits__map, splits__doc_ids__map)``: original doc
        id -> list of chunk ids, and chunk id -> original doc id.

    Raises:
        ValueError: if a chunk id is already owned by a different document.
            The id scheme leaves room for only 10 chunks per document, so a
            longer document can spill into the id range of ``doc_id + 1``.
    """
    doc_ids__splits__map = {}
    splits__doc_ids__map = {}
    splited_lines = []
    for raw_line in read_file_into_list(corpus_file_path):
        # Tolerate blank lines (e.g. a trailing newline) instead of failing
        # inside json.loads.
        if not raw_line:
            continue
        document = json.loads(raw_line)
        doc_id = document['doc_id']
        title = document['title']
        sentences = document['abstract']
        doc_ids__splits__map[doc_id] = []

        last_index = len(sentences) - 1
        word_count = 0
        split_count = 0
        split = []
        for index, sent in enumerate(sentences):
            word_count += len(sent.split())
            split.append(sent)

            if word_count >= max_words or index == last_index:
                split_id = int(doc_id * 10 + split_count)
                # Refuse to hand the same id to chunks of two different
                # documents; that would silently corrupt both mappings.
                owner = splits__doc_ids__map.get(split_id, doc_id)
                if owner != doc_id:
                    raise ValueError(
                        f'split id {split_id} of document {doc_id!r} collides with '
                        f'document {owner!r}; the doc_id * 10 + index scheme only '
                        f'leaves room for 10 splits per document'
                    )
                splited_lines.append({"doc_id": split_id, "title": title, "abstract": split})
                doc_ids__splits__map[doc_id].append(split_id)
                splits__doc_ids__map[split_id] = doc_id
                word_count = 0
                split_count += 1
                split = []

    write_list_to_file(splited_corpus_path, [json.dumps(line) for line in splited_lines])
    return doc_ids__splits__map, splits__doc_ids__map


In [None]:
def convert_claims_docids_to_splitid(doc_ids__splits__map, claims_file_path, split_claims_file_path):
    """Rewrite a claims JSONL file so each claim references split ids instead of doc ids.

    Every id in a claim's ``doc_ids`` is replaced by the list of split ids that
    ``doc_ids__splits__map`` holds for it. Only the ``id``, ``claim`` and
    ``doc_ids`` keys are carried over to ``split_claims_file_path``.
    """
    converted_claims = []
    for raw_line in read_file_into_list(claims_file_path):
        claim = json.loads(raw_line)
        # Flatten the per-document split lists, preserving the order of doc_ids.
        expanded_ids = [
            split_id
            for doc_id in claim['doc_ids']
            for split_id in doc_ids__splits__map[doc_id]
        ]
        converted_claims.append(
            {"id": claim['id'], "claim": claim['claim'], "doc_ids": expanded_ids}
        )

    serialized_claims = [json.dumps(entry) for entry in converted_claims]
    write_list_to_file(split_claims_file_path, serialized_claims)

In [None]:
def run_prediction(claims_file_path, corpus_file_path, prediction_path):
  """Run multivers/predict.py with the scifact checkpoint through an IPython shell command.

  The claims file is passed as --input_file, the corpus as --corpus_file and
  the prediction path as --output_file.

  The script and checkpoint paths are relative, so the current working
  directory must be the multivers checkout (see the earlier `%cd` cell).

  NOTE(review): the `$name` placeholders are substituted into the command
  line unquoted, so a path containing whitespace would presumably be split by
  the shell; confirm before using such paths.
  """
  !python3 multivers/predict.py --checkpoint_path=checkpoints/scifact.ckpt --input_file=$claims_file_path --corpus_file=$corpus_file_path --output_file=$prediction_path

In [None]:
def run_abstract_prediction(claims_dir, abstract_corpus_file_path, prediction_dir):
    """Run the prediction against the abstract corpus for the three claim sets.

    For each claim set, the claims file is read from ``claims_dir`` and the
    result is written to the matching ``*_abstract.jsonl`` file in
    ``prediction_dir``.
    """
    # (claims file name, prediction file name), processed in this order.
    jobs = [
        ('datasets_claims.jsonl', 'datasets_abstract.jsonl'),
        ('approach_claims.jsonl', 'approach_abstract.jsonl'),
        ('split_keydifferences_claims.jsonl', 'keydifferences_abstract.jsonl'),
    ]

    print('running pridections for', prediction_dir)
    for claims_file_name, prediction_file_name in jobs:
        run_prediction(
            join(claims_dir, claims_file_name),
            abstract_corpus_file_path,
            join(prediction_dir, prediction_file_name),
        )

In [None]:
def _strip_jsonl_suffix(path):
    """Return ``path`` without a trailing '.jsonl'; any other path is returned unchanged."""
    suffix = '.jsonl'
    return path[:-len(suffix)] if path.endswith(suffix) else path


def run_long_prediction(claims_dir, corpus_file_path, prediction_dir, corpus_type):
    """Run the prediction against a corpus whose documents must be split first.

    The corpus is split once with ``split_corpus`` into a sibling
    ``*_split.jsonl`` file. For each of the three claim sets, the claims are
    then re-pointed at the split ids (written next to the original claims file
    as ``*_{corpus_type}_split.jsonl``) and the prediction is written to
    ``prediction_dir`` as ``<claim set>_{corpus_type}.jsonl``.

    Args:
        claims_dir: directory containing the three claims files.
        corpus_file_path: JSONL corpus to split and predict against.
        prediction_dir: directory receiving the prediction files.
        corpus_type: label embedded in the generated file names.
    """
    print('running pridections for', prediction_dir)

    # Derive the output name by dropping the '.jsonl' suffix explicitly rather
    # than slicing off the last six characters of whatever path was passed in.
    splited_corpus_path = _strip_jsonl_suffix(corpus_file_path) + '_split.jsonl'
    doc_ids__splits__map, _ = split_corpus(corpus_file_path, splited_corpus_path)

    # (claims file name, prediction file prefix), processed in this order.
    claim_sets = (
        ('datasets_claims.jsonl', 'datasets'),
        ('approach_claims.jsonl', 'approach'),
        ('split_keydifferences_claims.jsonl', 'keydifferences'),
    )
    for claims_file_name, prediction_prefix in claim_sets:
        claims_file_path = join(claims_dir, claims_file_name)
        split_claims_file_path = (
            _strip_jsonl_suffix(claims_file_path) + f'_{corpus_type}_split.jsonl'
        )
        prediction_path = join(prediction_dir, f'{prediction_prefix}_{corpus_type}.jsonl')

        convert_claims_docids_to_splitid(doc_ids__splits__map, claims_file_path, split_claims_file_path)
        run_prediction(split_claims_file_path, splited_corpus_path, prediction_path)

In [None]:
def run_key_differences_prediction(claims_dir, corpus_file_path, prediction_dir):
    """Run the prediction for the raw key-differences claims.

    Reads 'raw_keydifferences_claims.jsonl' from ``claims_dir`` and writes the
    result to 'raw_keydifferences.jsonl' in ``prediction_dir``.
    """
    run_prediction(
        join(claims_dir, 'raw_keydifferences_claims.jsonl'),
        corpus_file_path,
        join(prediction_dir, 'raw_keydifferences.jsonl'),
    )

#### Interviews

In [None]:
# Walk claims/<gpt_model>/<prompt_version>/<run> and run the key-differences
# prediction for every run, mirroring the layout under the predictions root.
interviews_difference_corpus_path = join(interviews_corpus_dir, 'difference.jsonl')
for gpt_model in list_directories(interviews_claims_base_path):
    gpt_model_claims_dir = join(interviews_claims_base_path, gpt_model)
    for prompt_version in list_directories(gpt_model_claims_dir):
        prompt_version_claims_dir = join(interviews_claims_base_path, f'{gpt_model}/{prompt_version}')
        for run in list_directories(prompt_version_claims_dir):
            print(f'running prediction for gpt {gpt_model}, prompt {prompt_version} and run {run}')
            run_sub_dir = f'{gpt_model}/{prompt_version}/{run}'
            run_claims_dir = join(interviews_claims_base_path, run_sub_dir)
            run_predictions_dir = join(interviews_predictions_base_path, run_sub_dir)
            makedirs(run_predictions_dir)
            run_key_differences_prediction(
                run_claims_dir,
                interviews_difference_corpus_path,
                run_predictions_dir,
            )

running prediction for prompt 3 and run 1
100% 26/26 [00:06<00:00,  3.80it/s]
running prediction for prompt 3 and run 2
100% 26/26 [00:06<00:00,  3.93it/s]
running prediction for prompt 3 and run 3
100% 26/26 [00:06<00:00,  4.14it/s]
running prediction for prompt 3 and run 4
100% 26/26 [00:06<00:00,  3.97it/s]
running prediction for prompt 3 and run 5
100% 26/26 [00:06<00:00,  3.91it/s]


#### Surveys

In [None]:
# Walk claims/<gpt_model>/<prompt_version>/<run> and run the key-differences
# prediction for every run, mirroring the layout under the predictions root.
surveys_difference_corpus_path = join(surveys_corpus_dir, 'difference.jsonl')
for gpt_model in list_directories(surveys_claims_base_path):
    gpt_model_claims_dir = join(surveys_claims_base_path, gpt_model)
    for prompt_version in list_directories(gpt_model_claims_dir):
        prompt_version_claims_dir = join(surveys_claims_base_path, f'{gpt_model}/{prompt_version}')
        for run in list_directories(prompt_version_claims_dir):
            print(f'running prediction for gpt {gpt_model}, prompt {prompt_version} and run {run}')
            run_sub_dir = f'{gpt_model}/{prompt_version}/{run}'
            run_claims_dir = join(surveys_claims_base_path, run_sub_dir)
            run_predictions_dir = join(surveys_predictions_base_path, run_sub_dir)
            makedirs(run_predictions_dir)
            run_key_differences_prediction(
                run_claims_dir,
                surveys_difference_corpus_path,
                run_predictions_dir,
            )