# Dataset creation for SFT and continued pretraining
From EAGE abstracts released for the Annual Hackathon in 2023

In [None]:
!pip install gdown --quiet

In [None]:

!pip install langchain --quiet
!pip install langchain_nvidia_ai_endpoints --quiet
!pip install pypdf --quiet

In [None]:
# Show the GPUs visible to this environment (a later cell loads the cudf.pandas extension).
!nvidia-smi

## Imports

In [None]:
# import the relevant libraries
import json
import os

import tqdm
from langchain_nvidia_ai_endpoints import ChatNVIDIA
from langchain.text_splitter import  RecursiveCharacterTextSplitter
from multiprocessing import Pool
from pypdf import PdfReader

In [None]:
# Load the cudf.pandas extension before pandas is imported; the extension has to be
# active at import time for pandas calls to be routed through cuDF.
%load_ext cudf.pandas
import pandas as pd

In [None]:

from pprint import PrettyPrinter
# Bind `pprint` to a pretty-printer method that indents nested structures by 4 spaces.
pprint = PrettyPrinter(indent=4).pprint
# NOTE(review): ChatNVIDIA presumably reads the API key from this environment variable;
# prefer setting it outside the notebook over hard-coding it here.
# os.environ['NVIDIA_API_KEY'] = "<YOUR NVIDIA API KEY HERE>"

## Download and extract documents

In [None]:
# Show the working directory: the CSV in the "Extract raw texts" section is read via a relative path.
!pwd

In [None]:
# List the filesystem root (presumably to check that the /workspace mount is present).
!ls /

In [None]:
# !gdown 1HmxAZerbIQfHo3evhys1nGX2lspda1RX -O /workspace/data/documents.zip

In [None]:
# !unzip -o /workspace/data/documents.zip -d /workspace/data/

## Extract raw texts

In [None]:
# Load the report table; the next cell uses its 'filename' and 'content' columns.
# NOTE(review): this path is relative (no leading '/'), unlike the absolute /workspace/...
# paths used everywhere else — it points at /workspace/... only when the working
# directory is '/'. Confirm the intended location.
df = pd.read_csv('workspace/Norway - Diskos reports.csv', sep=',')

Populate .jsonl from extracted .pdf files

In [None]:
# Write one JSONL file per source document: every row of `df` becomes a {"text": ...}
# record in /workspace/local_data/raw/<filename>.jsonl.
os.makedirs("/workspace/local_data/raw", exist_ok=True)

# groupby walks the frame once instead of re-filtering the whole frame for every
# filename; sort=False keeps the documents in order of first appearance.
for document, document_rows in df.groupby('filename', sort=False):
    with open(f"/workspace/local_data/raw/{document}.jsonl", "w") as f:
        # Missing content would be serialised as a bare NaN token, which is not valid JSON.
        for raw in document_rows['content'].dropna():
            f.write(json.dumps({"text": raw}) + "\n")

In [None]:
!head /workspace/data/raw/.jsonl

## Cleaning raw documents with NeMo Curator

In [None]:
# Show the options of the text_cleaning command used in the next cell.
!text_cleaning --help

In [None]:
# Clean the raw per-document JSONL files and write the results to /workspace/local_data/clean.
# NOTE(review): text_cleaning is presumably the NeMo Curator CLI, yet the notebook only
# pip-installs nemo-curator in a later cell — confirm it is preinstalled in this environment.
!text_cleaning --input-data-dir /workspace/local_data/raw --output-clean-dir /workspace/local_data/clean

In [None]:
!pip install --extra-index-url https://pypi.nvidia.com nemo-curator[cuda12x]

In [None]:
import nemo_curator as nc
from nemo_curator.datasets import DocumentDataset
from nemo_curator.utils.file_utils import get_all_files_paths_under
from nemo_curator.filters import WordCountFilter
from nemo_curator.modifiers import UnicodeReformatter

# Collect every file under the cleaned-text directory and load them as one dataset.
# add_filename=True presumably records each record's source file so that
# to_json(write_to_filename=True) below can write it back under the same name (verify in the NeMo Curator docs).
files = get_all_files_paths_under("/workspace/local_data/clean/")
documents = DocumentDataset.read_json(files, add_filename=True)

# Score every document's "text" field with WordCountFilter, store the score in the
# "word_count" field and filter on it; with min_words=80 this presumably drops
# documents shorter than 80 words.
filter_step = nc.ScoreFilter(
                WordCountFilter(min_words=80),
                text_field="text",
                score_field="word_count",
            )

filtered_documents = filter_step(documents)

# Run UnicodeReformatter over the remaining documents (presumably repairs/normalises Unicode text).
cleaner = nc.Modify(UnicodeReformatter())
filtered_documents = cleaner(filtered_documents)

# NOTE(review): the result is written to .../curator/, but the later cells keep reading
# from /workspace/local_data/clean — confirm which directory is meant to feed them.
filtered_documents.to_json("/workspace/local_data/curator/", write_to_filename=True)

## Preparing data for continued pretraining
Creating `*.idx` and `*.bin` files

In [None]:
# Run NeMo's Megatron preprocessing script on the "text" field of the cleaned JSONL files,
# tokenising with the Llama-2 SentencePiece model; outputs use the prefix
# /workspace/local_data/clean/hackathon.
# NOTE(review): the output prefix sits inside the directory matched by the --input glob,
# so files produced here would be matched by that glob on a re-run — confirm this is intended.
!python /opt/NeMo/scripts/nlp_language_modeling/preprocess_data_for_megatron.py \
    --input /workspace/local_data/clean/* \
    --json-keys text \
    --tokenizer-library sentencepiece \
    --tokenizer-model /workspace/models/Llama-2-7b-chat-hf/tokenizer.model \
    --output-prefix /workspace/local_data/clean/hackathon

In [None]:
# Count the lines of the directory listing (note that `ls -l` usually prints a leading "total" line as well).
!ls -l /workspace/local_data/clean | wc -l

## Preparing data for LLM tuning

Split text into overlapping chunks

In [None]:
# Splitter producing chunks of up to 3000 characters with an overlap of up to 1500 characters.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=3000,
    chunk_overlap=1500,
)

# Load the "text" field of every record in the cleaned JSONL file.
raw_texts = []
with open("/workspace/local_data/clean/documents.jsonl", "r") as f:
    for record_line in f:
        raw_texts.append(json.loads(record_line)["text"])
documents = raw_texts

# Split each document, keeping both the per-document chunk lists and one flat list of all chunks.
document_chunks = []
document_chunks_flat = []
for document_text in documents:
    pieces = text_splitter.split_text(document_text)
    document_chunks.append(pieces)
    document_chunks_flat.extend(pieces)

print(f'{len(document_chunks_flat)} chunks extracted out of {len(documents)} pdf documents')

Given the paragraph after <INPUT_START> tag, create a very good geoscience-related question and answer pair. Your output should be in a .json format containing the following fields: ['question', 'answer']
Restrict the question to the context information provided. The questions should use information from passage, but should not refer to the originating text implicitly (you can not use 'according to', 'based on', and similar).
Respond only with .json output, add no other comments. If generating a good question and answer pair is not possible, output <skip> instead.
<INPUT_START>

Define LLM and prompt

In [None]:
# Instruction sent with every text chunk; the (commented-out) request cell below joins
# this prompt and the chunk with a newline, so the chunk follows the <INPUT_START> tag.
INSTRUCTION_PROMPT = """Given the paragraph after <INPUT_START> tag, create a very good geoscience-related question and answer pair. Your output should be in a .json format containing the following fields: ['question', 'answer']
Restrict the question to the context information provided. The questions should use information from passage, but should not refer to the originating text implicitly (you can not use 'according to', 'based on', and similar).
Respond only with .json output, add no other comments. If generating a good question and answer pair is not possible, output <skip> instead.
<INPUT_START>"""
# Upper bound for the slice of document_chunks_flat sent to the LLM; use a small number for a quick trial run.
# CHUNKS_TO_PROCESS = 10
CHUNKS_TO_PROCESS = None # means all

# Chat model used to generate the QA pairs.
# NOTE(review): a response longer than max_tokens would presumably be cut off and fail
# JSON parsing later — confirm 256 tokens is enough for a question/answer pair.
llm = ChatNVIDIA(
    model="ai-llama3-70b",
    temperature=0.2,
    max_tokens=256
)

Submit batched requests to the LLM

**WARNING! It will take about 30 min to generate QA pairs**

In [None]:
# NOTE(review): `qa_pairs`, which the following cells read, is only defined by the
# commented-out code below. On a fresh kernel those cells raise NameError unless this
# code is uncommented and run first (it calls the LLM, so it presumably needs NVIDIA_API_KEY).
# Responses whose content is exactly "<skip>" are dropped.
# qa_pairs = await llm.abatch(['\n'.join([INSTRUCTION_PROMPT, chunk]) for chunk in document_chunks_flat[:CHUNKS_TO_PROCESS]], 
#                             config={"max_concurrency": 10})
# qa_pairs = [qa_pair.content for qa_pair in qa_pairs if qa_pair.content != "<skip>"]

In [None]:
# Inspect the first raw LLM response before parsing (requires a non-empty `qa_pairs`).
pprint(qa_pairs[0])

Parse generated QA pairs into a valid JSONL document

In [None]:
# Convert the raw LLM responses in `qa_pairs` into SFT records, one JSON object per line,
# renaming the keys "question" -> "input" and "answer" -> "output".
SFT_KEY_MAP = {"question": "input", "answer": "output"}
sft_path = "/workspace/data/clean/documents_sft.jsonl"

failed_count = 0
warning_count = 0
# No other cell creates this directory, so make sure it exists before writing.
os.makedirs(os.path.dirname(sft_path), exist_ok=True)
with open(sft_path, "w") as f:
    for qa_pair in qa_pairs:
        # Checking if json is correct
        try:
            qa_obj = json.loads(qa_pair)
        except json.JSONDecodeError:
            print(f'Failed to read {qa_pair} as a valid JSON')
            failed_count += 1
            continue
        if isinstance(qa_obj, list) and qa_obj:
            # A list response is reduced to its first element and counted as a warning.
            print(f'WARNING: {qa_pair}')
            qa_obj = qa_obj[0]
            warning_count += 1
        # Anything that is not an object holding both fields (including an empty list)
        # cannot become a training record.
        if not isinstance(qa_obj, dict) or not SFT_KEY_MAP.keys() <= qa_obj.keys():
            print(f'Failed to find question/answer fields in {qa_pair}')
            failed_count += 1
            continue
        # Rename keys on the parsed object rather than on the raw text, so spacing such as
        # '"question" :' is handled and text inside the values is never touched.
        jsonl_line = json.dumps({SFT_KEY_MAP.get(key, key): value for key, value in qa_obj.items()})
        f.write(jsonl_line + "\n")

print('Done')
print(f'Failed\t{failed_count} / {len(qa_pairs)}')
print(f'Warnings\t{warning_count} / {len(qa_pairs)}')

In [None]:
# Show the last record produced by the parsing loop above; `jsonl_line` is the leaked
# loop variable, so it is only defined if at least one QA pair was parsed.
pprint(jsonl_line)

## Split dataset into train / val / test

In [None]:
import os
import json


def read_and_split(fname: str, out_dir: str):
    """Split a JSONL file into test / validation / train partitions.

    The first 10% of the records are written to ``data_test.jsonl``, the next
    10% to ``data_val.jsonl`` and the remainder to ``data_train.jsonl`` inside
    ``out_dir``. Records keep their original order; nothing is shuffled.

    Args:
        fname: Path of the source JSONL file (one JSON document per line).
        out_dir: Directory receiving the three partition files; created if missing.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON. Validation
            happens before any output file is opened, so existing partition
            files are not truncated by a bad input.
    """
    # Blank lines carry no record and would otherwise fail JSON validation.
    with open(fname, 'r') as original_file:
        lines = [line for line in original_file if line.strip()]

    # Validate every record up front, before touching the output files.
    for line in lines:
        json.loads(line)

    # Calculate partition sizes; the rest goes to the train partition.
    total_lines = len(lines)
    test_size = int(total_lines * 0.1)
    val_size = int(total_lines * 0.1)

    print(f'There are {total_lines}--> {test_size}, {val_size}, {total_lines - test_size - val_size}')
    print(f'Iterate over {len(lines)} lines in {fname}')

    os.makedirs(out_dir, exist_ok=True)

    with open(os.path.join(out_dir, 'data_test.jsonl'), 'w') as test_file, \
         open(os.path.join(out_dir, 'data_val.jsonl'), 'w') as val_file, \
         open(os.path.join(out_dir, 'data_train.jsonl'), 'w') as train_file:

        for i, line in enumerate(lines):
            # The final line of the source may lack a newline; normalise so every
            # record is newline-terminated.
            json_line = line if line.endswith('\n') else line + '\n'

            # Write to the appropriate file based on the record index.
            if i < test_size:
                test_file.write(json_line)
            elif i < test_size + val_size:
                val_file.write(json_line)
            else:
                train_file.write(json_line)

In [None]:
# Split the SFT file into data_test.jsonl / data_val.jsonl / data_train.jsonl under /workspace/data/.
read_and_split('/workspace/data/clean/documents_sft.jsonl', '/workspace/data/')

In [None]:
# Preview the first records of the training partition.
!head /workspace/data/data_train.jsonl