https://www.fahdmirza.com/2024/08/free-llm-dataset-creation-with-ollama.html

## Prepare llama model to use

In [1]:
import ollama

In [2]:
def make_llama_3_prompt(user, system="", assistant=""):
    """Assemble a raw Llama 3 chat prompt from its role messages.

    Args:
        user: Content of the user turn (always included).
        system: Optional system message; the system turn is left out
            entirely when this is empty.
        assistant: Optional assistant content. When given, a closed
            assistant turn is appended; otherwise the prompt ends with an
            open assistant header.

    Returns:
        The prompt string, starting with ``<|begin_of_text|>``.
    """
    def header(role):
        # Every turn opens with the same role header followed by a blank line.
        return f"<|start_header_id|>{role}<|end_header_id|>\n\n"

    pieces = ["<|begin_of_text|>"]
    if system:
        pieces.append(f"{header('system')}{system}<|eot_id|>")
    pieces.append(f"{header('user')}{user}<|eot_id|>")
    pieces.append(header("assistant"))
    if assistant:
        pieces.append(f"{assistant}<|eot_id|>")
    return "".join(pieces)

## Prepare sentences to paraphrase

In [3]:
import json
import os
import nltk

In [None]:
path = "..//data//external//cadec//text"

# List the input files and order them by the part of the name before the
# first dot, then by the integer value of the second dot-separated field
# (so "X.10.txt" sorts after "X.9.txt").
dir_list = os.listdir(path)
dir_list.sort(key=lambda name: (name.split('.')[0], int(name.split('.')[1])))
len(dir_list)

1250

In [5]:
def long_sentence(s, min_words=0):
    """Tell whether a sentence has at least ``min_words`` words.

    Words are the whitespace-separated tokens of ``s``.

    Args:
        s: The sentence to check.
        min_words: Minimum number of words required. The default of 0
            accepts every sentence (including the empty string), which is
            what the previously hard-coded threshold did; pass a larger
            value to actually filter out short sentences.

    Returns:
        True when ``s`` contains at least ``min_words`` words.
    """
    return len(s.split()) >= min_words

In [6]:
def process_text_from_file(path, file_name):
    """Read one text file and split its content into sentences.

    Newlines in the file are replaced by spaces before the text is handed
    to ``nltk.sent_tokenize``.

    Args:
        path: Directory that contains the file.
        file_name: Name of the file inside ``path``.

    Returns:
        A list with the sentences produced by ``nltk.sent_tokenize``.
    """
    with open(f'{path}//{file_name}', 'r') as handle:
        flattened = handle.read().replace("\n", " ")

    return list(nltk.sent_tokenize(flattened))

### NLTK sentence tokenizer

In [8]:
import nltk

In [None]:
# Sentence-split the first 10 files with NLTK, reusing the
# process_text_from_file helper instead of repeating its body here.
files_text = []
for f in dir_list[:10]:
    files_text.extend(process_text_from_file(path, f))

sentences = [text for text in files_text if long_sentence(text)]
print(f"Number of sentences: {len(files_text)}")
print(f"Number of filtered sentences: {len(sentences)}\n")

for s in sentences:
    print(s)

Number of sentences: 42
Number of filtered sentences: 38

I feel a bit drowsy & have a little blurred vision, so far no gastric problems.
I've been on Arthrotec 50 for over 10 years on and off, only taking it when I needed it.
Due to my arthritis getting progressively worse, to the point where I am in tears with the agony, gp's started me on 75 twice a day and I have to take it.
every day for the next month to see how I get on, here goes.
So far its been very good, pains almost gone, but I feel a bit weird, didn't have that when on 50.
Brilliant, I have a new lease of life, i walk up & down steps properly, no longer sideways like a toddler, hip pain as gone other than if i jar it.
no side effects for the first two months .
then vaginal bleeding 2 wks after menstral cycle.
canker sores in my mouth.
been off for 1 week still have bleeding.
helped my pain alot .
too scared to take this drug again.
1st pill taken with food, a few hours after i experienced shortness of breath, a sense of de

### spaCy sentence tokenizer

In [9]:
import spacy
from spacy.lang.en import English

In [None]:
# Build the blank English pipeline with the rule-based sentencizer once.
# It does not depend on the file being processed, so there is no reason to
# reconstruct it on every loop iteration.
nlp = English()
nlp.add_pipe("sentencizer")

files_text = []

for f in dir_list[:10]:
    with open(f'{path}//{f}', 'r') as file:
        # Flatten the file to a single line before splitting into sentences.
        one_line_file = file.read().replace("\n", " ")

    files_text.extend(str(sentence) for sentence in nlp(one_line_file).sents)

sentences = list(filter(long_sentence, files_text))

print(f"Number of sentences: {len(files_text)}")
print(f"Number of filtered sentences: {len(sentences)}\n")

for s in sentences:
    print(s)

Number of sentences: 43
Number of filtered sentences: 38

I feel a bit drowsy & have a little blurred vision, so far no gastric problems.
I've been on Arthrotec 50 for over 10 years on and off, only taking it when I needed it.
Due to my arthritis getting progressively worse, to the point where I am in tears with the agony, gp's started me on 75 twice a day and I have to take it.
every day for the next month to see how I get on, here goes.
So far its been very good, pains almost gone, but I feel a bit weird, didn't have that when on 50.
Brilliant, I have a new lease of life, i walk up & down steps properly, no longer sideways like a toddler, hip pain as gone other than if i jar it.
no side effects for the first two months .
then vaginal bleeding 2 wks after menstral cycle.
canker sores in my mouth.
been off for 1 week still have bleeding.
helped my pain alot .
too scared to take this drug again.
1st pill taken with food, a few hours after i experienced shortness of breath, a sense of de

### LLM sentence tokenizer

In [11]:
def tokenize_sentences(text):
    """Ask the ``llama3.2`` model (through ollama) to split text into sentences.

    The prompt instructs the model to return the text unchanged except for
    a newline added after each sentence. Note that ``text`` is sent twice:
    inside the user message and appended to the end of the system message.

    Args:
        text: The text to split.

    Returns:
        The ``response`` string of the ollama result, unmodified.
    """
    role_description = "You are an english text editor with 10 years of experience in splitting text into sentences. \n"

    request = "Break following text into sentences: " + text + "\n"
    output_format = (
        "After each sentence add new line symbol: '\\n' i.e. \n"
        " This is a first sentence.\\nThis is a second sentence.\\n \n"
    )
    # The indentation inside this literal is part of the prompt text.
    constraints = """\
            Make sure that the only change in original text you make is adding new line symbols in places when each sentence ends. \
            Don't give any comments. Just return sentences with added new line symbol at the end of each sentence. \
            Make sure returned text differs from the original text only by new line symbols. \
            Keep in mind that original text is written by someone who makes mistakes. Not every sentence in original text starts with capital letter and ends with dot.\
            Keep in mind that original text is written by someone who makes mistakes. In original text there might be some extra dots, that do not mean end of the sentence.\
            Make sure the only change to the original text you did is adding new line symbols.
            """

    user_message = request + output_format + constraints
    system_message = role_description + text

    prompt = make_llama_3_prompt(user_message, system_message)

    # Only the generated text is of interest to callers.
    return ollama.generate(model='llama3.2', prompt=prompt)['response']

In [None]:
# Sentence-split the first 10 files with the LLM: each line of the model's
# reply is treated as one sentence.
files_text = []

for f in dir_list[:10]:
    with open(f'{path}//{f}', 'r') as file:
        one_line_file = file.read().replace("\n", " ")

    files_text.extend(tokenize_sentences(one_line_file).splitlines())

sentences = [text for text in files_text if long_sentence(text)]
print(f"Number of sentences: {len(files_text)}")
print(f"Number of filtered sentences: {len(sentences)}\n")

for s in sentences:
    print(s)

Number of sentences: 59
Number of filtered sentences: 39

I feel a bit drowsy & have a little blurred vision, so far no gastric problems.
I've been on Arthrotec 50 for over 10 years on and off, only taking it when I needed it.
Due to my arthritis getting progressively worse, to the point where I am in tears with the agony, gp's started me on 75 twice a day and I have to take it. every day for the next month to see how I get on, here goes.
So far its been very good, pains almost gone, but I feel a bit weird, didn't have that when on 50.
Brilliant, I have a new lease of life, 
I walk up & down steps properly, no longer sideways like a toddler,
Hip pain as gone other than if i jar it.
no side effects for the first two months . 
then vaginal bleeding 2 wks after menstral cycle. 
canker sores in my mouth. 
been off for 1 week still have bleeding. 
helped my pain alot . 
too scared to take this drug again.
1st pill taken with food, a few hours after i experienced shortness of breath, a sense

### Final choice for sentence tokenization

In [9]:
# Sentence-split every file with NLTK (the tokenizer chosen above),
# reusing the process_text_from_file helper.
files_text = []
for f in dir_list:
    files_text.extend(process_text_from_file(path, f))

sentences = [text for text in files_text if long_sentence(text)]
print(f"Number of sentences: {len(files_text)}")
print(f"Number of filtered sentences: {len(sentences)}\n")

KeyboardInterrupt: 

## Write prompt

In [7]:
def generate_paraphrase(sentence):
    """Ask the ``llama3.2`` model (through ollama) to paraphrase a sentence.

    The system message carries three example sentence/paraphrase pairs and
    ends with ``sentence`` itself; the user message repeats the sentence
    and asks for a JSON object with ``sentence`` and ``paraphrase`` keys.

    Args:
        sentence: The sentence to paraphrase.

    Returns:
        The result of ``ollama.generate`` as returned by the library; the
        generated text is not parsed or validated here.
    """
    # Few-shot examples shown to the model as (sentence, paraphrase) pairs.
    examples = [
        (
            "I was prescribed a medication for bone and joint pain associated with Lupus.",
            "I was prescribed a drug to treat the joint and bone ache caused by lupus.",
        ),
        (
            "I have had no recognizable side effects.",
            "I have not experienced any noticeable side effects.",
        ),
        (
            "The drug almost completely wiped out the pain from the herniated discs, and has replaced patients' usual narcotic pain relief for back.",
            "The drug has almost completely eliminated the pain caused by the herniated discs, providing an alternative to the usual narcotic pain relief for back pain.",
        ),
    ]

    system_parts = [
        "You are an english text editor with 10 years of experience in paraphrasing sentences. \n",
        "Consider the following sentences and their paraphrases:\n",
    ]
    for example_sentence, example_paraphrase in examples:
        system_parts.append("Sentence: " + example_sentence + "\n")
        system_parts.append("Paraphrase: " + example_paraphrase + "\n")

    task = "Based on the above examples, write a paraphrase of the following sentence: " + sentence + "\n"
    output_format = (
        "Format the sentence and paraphrase as a JSON object, i.e.\n"
        '{"sentence" : str, "paraphrase": str }.\n'
    )
    # The indentation inside this literal is part of the prompt text.
    constraints = """\
            Make sure to only return a well-written paraphrase of the given sentence. \
            Don't give any comments. Just return a sentence and a paraphrase as JSON objects. \
            Make sure paraphrase has the same meaning as the sentence. \
            Make sure all medical entities from original sentence are also present in the paraphrase (in original or paraphrased form e.g 'ache' instead of 'pain'). \
            Make sure there are no medical entities in paraphrase, that are not present in the original sentence.
            """

    user = task + output_format + constraints
    # The sentence is also appended after the examples in the system message.
    system = "".join(system_parts) + sentence

    prompt = make_llama_3_prompt(user, system)

    return ollama.generate(model='llama3.2', prompt=prompt)

## Run prompt

In [8]:
def save_to_jsonl(data, file_path):
    """Write each entry of ``data`` as one JSON document per line.

    The file is opened in ``'w+'`` mode, so any existing content at
    ``file_path`` is replaced.

    Args:
        data: Iterable of JSON-serializable entries.
        file_path: Destination file.
    """
    with open(file_path, 'w+') as out:
        out.writelines(json.dumps(record) + '\n' for record in data)

In [9]:
def save_to_text(data, file_path):
    """Write the ``'paraphrase'`` value of each entry on its own line.

    The file is opened in ``'w+'`` mode, so any existing content at
    ``file_path`` is replaced.

    Args:
        data: Iterable of mappings that each have a ``'paraphrase'`` string.
        file_path: Destination file.
    """
    with open(file_path, 'w+') as out:
        out.writelines(record['paraphrase'] + '\n' for record in data)

In [10]:
# Files whose position in dir_list is below this index are skipped.
# NOTE(review): presumably the resume point of an interrupted run - set to 0
# to process every file.
START_INDEX = 1063

for n, file_name in enumerate(dir_list):

    if n < START_INDEX: continue

    print(f'Processing file {file_name}\t [{n}/{len(dir_list)}]')
    sentences = process_text_from_file(path, file_name)
    sentence_paraphrases = []   # Keep all paraphrases of sentences from one file

    for s in sentences:

        response_dict = {}
        # Keep asking the model until it returns a JSON object that carries a
        # string "paraphrase". Accepting any truthy JSON value (a list, a bare
        # string, a dict without that key) would only fail later in
        # save_to_text, after all sentences of the file had been generated.
        # There is deliberately no retry limit, as in the original loop.
        while not response_dict:
            result = generate_paraphrase(s)

            # Inspect and parse the result['response']
            response_str = result['response']
            try:
                parsed = json.loads(response_str)
            except json.JSONDecodeError as e:
                print("Failed to parse response as JSON:", e)
                continue

            if isinstance(parsed, dict) and isinstance(parsed.get('paraphrase'), str):
                response_dict = parsed
            else:
                print("Failed to parse response as JSON: missing string 'paraphrase' field")

        sentence_paraphrases.append(response_dict)

    output_json_file_path = f'../data/processed/cadec/combined/{file_name}.jsonl'
    output_text_file_path = f'../data/processed/cadec/text/{file_name}.txt'

    save_to_jsonl(sentence_paraphrases, output_json_file_path)
    save_to_text(sentence_paraphrases, output_text_file_path)

Processing file LIPITOR.894.txt	 [1063/1250]
Processing file LIPITOR.895.txt	 [1064/1250]
Processing file LIPITOR.896.txt	 [1065/1250]
Processing file LIPITOR.897.txt	 [1066/1250]
Processing file LIPITOR.898.txt	 [1067/1250]
Processing file LIPITOR.899.txt	 [1068/1250]
Processing file LIPITOR.900.txt	 [1069/1250]
Processing file LIPITOR.901.txt	 [1070/1250]
Processing file LIPITOR.902.txt	 [1071/1250]
Processing file LIPITOR.903.txt	 [1072/1250]
Processing file LIPITOR.904.txt	 [1073/1250]
Processing file LIPITOR.905.txt	 [1074/1250]
Processing file LIPITOR.906.txt	 [1075/1250]
Processing file LIPITOR.907.txt	 [1076/1250]
Processing file LIPITOR.908.txt	 [1077/1250]
Processing file LIPITOR.909.txt	 [1078/1250]
Processing file LIPITOR.910.txt	 [1079/1250]
Processing file LIPITOR.911.txt	 [1080/1250]
Processing file LIPITOR.912.txt	 [1081/1250]
Processing file LIPITOR.913.txt	 [1082/1250]
Processing file LIPITOR.914.txt	 [1083/1250]
Processing file LIPITOR.915.txt	 [1084/1250]
Processing

In [None]:
# Collect one parsed {"sentence": ..., "paraphrase": ...} object per sentence.
# generate_paraphrase returns the raw ollama result, so the JSON text in
# result['response'] has to be parsed here; generation is retried until the
# response is valid, non-empty JSON.
all_paraphrases = []

for s in sentences:

    response_dict = {}
    while not response_dict:
        result = generate_paraphrase(s)
        try:
            response_dict = json.loads(result['response'])
        except json.JSONDecodeError as e:
            print("Failed to parse response as JSON:", e)

    all_paraphrases.append(response_dict)

Failed to parse response as JSON: Expecting ',' delimiter: line 1 column 95 (char 94)
Failed to parse response as JSON: Invalid \escape: line 1 column 166 (char 165)
Failed to parse response as JSON: Extra data: line 2 column 1 (char 180)
Failed to parse response as JSON: Extra data: line 2 column 1 (char 128)
Failed to parse response as JSON: Expecting property name enclosed in double quotes: line 1 column 96 (char 95)
Failed to parse response as JSON: Extra data: line 1 column 148 (char 147)
Failed to parse response as JSON: Invalid \escape: line 1 column 61 (char 60)
Failed to parse response as JSON: Invalid \escape: line 1 column 32 (char 31)
Failed to parse response as JSON: Extra data: line 2 column 1 (char 161)
Failed to parse response as JSON: Expecting ':' delimiter: line 1 column 87 (char 86)
Failed to parse response as JSON: Extra data: line 2 column 1 (char 76)
Failed to parse response as JSON: Extra data: line 2 column 1 (char 148)
Failed to parse response as JSON: Invalid

In [None]:
# Display the collected results to inspect them by eye.
all_paraphrases

[{'sentence': 'I feel a bit drowsy & have a little blurred vision, so far no gastric problems.',
  'paraphrase': 'So far, I am experiencing some dizziness and slight double vision, with no signs of stomach issues.'},
 {'sentence': "I've been on Arthrotec 50 for over 10 years on and off, only taking it when I needed it.",
  'paraphrase': 'I have relied on Arthrotec 50 sporadically for over a decade.'},
 {'sentence': "Due to my arthritis getting progressively worse, to the point where I am in tears with the agony, gp's started me on 75 twice a day and I have to take it.",
  'paraphrase': 'Because of worsening arthritis, which is causing me extreme distress, doctors prescribed me 75 mg twice a day, which I must now ingest regularly.'},
 {'sentence': 'every day for the next month to see how I get on, here goes.',
  'paraphrase': 'I will commit to checking my progress every day for the next month and proceed as planned.'},
 {'sentence': "So far its been very good, pains almost gone, but I f

## Save output to JSON Lines

In [None]:
# NOTE: this redefines save_to_jsonl from the "Run prompt" section. That
# earlier version opens the file with 'w+' and replaces its content; this
# one appends, and it is the one in effect once this cell has run.
def save_to_jsonl(data, file_path):
    """Append each entry of ``data`` to ``file_path`` as one JSON document per line.

    Args:
        data: Iterable of JSON-serializable entries.
        file_path: Destination file; existing content is kept.
    """
    with open(file_path, 'a') as out:
        out.writelines(json.dumps(record) + '\n' for record in data)

In [None]:
output_file_path = '../data/processed/paraphrases_06_25.jsonl'

save_to_jsonl(all_paraphrases, output_file_path)
# The entries are sentence/paraphrase pairs, not "questions and queries".
print(f"Saved {len(all_paraphrases)} sentence/paraphrase pairs to {output_file_path}")

Saved 40 questions and queries to ../data/processed/paraphrases.jsonl
