In [None]:
! pip install --upgrade predictionguard

In [None]:
import os
from getpass import getpass

# Prompt for the access token without echoing it to the notebook output, then
# publish it through the PREDICTIONGUARD_TOKEN environment variable
# (presumably where the predictionguard client looks for it — confirm).
pg_access_token = getpass('Enter your Prediction Guard access token: ')
os.environ.update(PREDICTIONGUARD_TOKEN=pg_access_token)




In [None]:
import predictionguard as pg
!pip install ijson

In [None]:
# Query the Completion API for its model list; as the last expression in the
# cell, the return value is rendered as the cell output.
pg.Completion.list_models()

In [None]:
import json

# Request a free-form completion for a simple prompt.
result = pg.Completion.create(model="Nous-Hermes-Llama2-13B", prompt="Tell me a joke")

# Dump the whole response as indented JSON with sorted keys so its structure
# is easy to inspect.
print(json.dumps(result, sort_keys=True, indent=4, separators=(',', ': ')))

In [None]:
# Run the factuality check on a statement against a reference text.
result = pg.Factuality.check(text="The sky is blue", reference="The sky is green")

# Dump the whole response as indented JSON with sorted keys.
print(json.dumps(result, sort_keys=True, indent=4, separators=(',', ': ')))

In [None]:
# Same completion endpoint, but with an `output` spec of type "categorical"
# listing the three sentiment labels.
result = pg.Completion.create(
    model="Nous-Hermes-Llama2-13B",
    prompt="What is the sentiment of this statement: This is great!",
    output={"type": "categorical", "categories": ["POS", "NEU", "NEG"]},
)

# Dump the whole response as indented JSON with sorted keys.
print(json.dumps(result, sort_keys=True, indent=4, separators=(',', ': ')))

In [None]:
# Mount Google Drive into the Colab filesystem. Run this cell as is.
from google.colab import drive
drive.mount('/content/drive')

# Make the Drive root the working directory so relative file names resolve
# there. Run this cell as is.
os.chdir("/content/drive/My Drive")

# Absolute paths of the input files. Change them to match where the files
# live on your system/Colab (e.g. create a folder under My Drive and upload
# the training data there).
FilePath="/content/drive/My Drive/transcripts.json"
FilePath2="/content/drive/My Drive/test.csv"

Mounted at /content/drive


In [None]:
# Load the transcripts from the JSON file. The path comes from FilePath (set
# in the Drive-mount cell) rather than a hard-coded relative name, so editing
# FilePath really changes which file is read.
with open(FilePath, 'r', encoding='utf-8') as file:
    transcripts = json.load(file)

In [None]:
import pandas as pd
# Path of the test CSV, taken from FilePath2 (set in the Drive-mount cell) so
# the location is configured in one place instead of being hard-coded here.
csv_file_path = FilePath2

# Create a DataFrame named 'test' from the CSV file
test = pd.read_csv(csv_file_path)

In [None]:
pip install matplotlib

In [None]:
pip install wordcloud

In [None]:
!pip install googletrans==4.0.0-rc1

In [None]:
import matplotlib.pyplot as plt
import json
from wordcloud import WordCloud

def visualize_json(json_file):
    """Render a word cloud built from the text of a JSON file.

    The file is parsed, re-serialized into one indented string, and that
    string is fed to WordCloud; the resulting image is shown with matplotlib.

    Args:
        json_file: Path to a UTF-8 encoded JSON file.

    Returns:
        None. The figure is displayed via ``plt.show()``.
    """
    # Read as UTF-8 explicitly (the platform default encoding may differ),
    # matching how the transcripts file is opened elsewhere in this notebook.
    with open(json_file, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # ensure_ascii=False keeps non-ASCII characters as real text; the default
    # would emit \uXXXX escapes, which would then be tokenized as escape
    # fragments rather than as the original words.
    json_text = json.dumps(data, indent=2, ensure_ascii=False)

    # Create a WordCloud from the JSON text
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(json_text)

    # Show the cloud as an image with the axes hidden
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()

# Example usage: build the word cloud from the transcripts file on Drive.
json_file_path = '/content/drive/My Drive/transcripts.json'
visualize_json(json_file_path)

In [None]:
# Pick one transcript to experiment with (keys are strings; this uses '1').
selected_transcript = transcripts['1']

# Build the extraction prompt: instructions, then the transcript as input,
# then an empty response section for the model to complete.
prompt = f"""### Instruction:
Your task is to parse an JSON containing the medical transcript. This should consist of the following information:
Patient's Name:Look for any mention of the patient's name in the provided text in English, if no prediction return None.
Patient's Age: Extract information about the age of the patient if available in English, if no prediction return None.
Medical Condition:Identify and extract details about the patient's medical condition or health issue in English, if no prediction return None.
Symptoms:Find and list any symptoms mentioned by the patient in English.
Precautions:Look for recommendations or precautions given to the patient for managing their health in English.
Medication/Drug: Identify and extract information about any medication or drug mentioned in the text in English.

### Input:
{selected_transcript}

### Response:
"""

# Send the prompt to the Llama 2 based model.
result = pg.Completion.create(model="Nous-Hermes-Llama2-13B", prompt=prompt)

# Dump the whole response as indented JSON with sorted keys.
print(json.dumps(result, sort_keys=True, indent=4, separators=(',', ': ')))






In [None]:
import csv
from concurrent.futures import ThreadPoolExecutor
import uuid  # Import the uuid module
from googletrans import Translator  # Import the Translator module

def process_batch(transcripts, prompt_template):
    """Run the extraction prompt over each transcript and collect output rows.

    Args:
        transcripts: Mapping of transcript key -> transcript text.
        prompt_template: Format string with a ``{transcripts}`` placeholder
            that receives the transcript text.

    Returns:
        A list of ``{'Id': <uuid4 string>, 'Text': <str>}`` dicts. Each line
        of the translated model output becomes one row; for any transcript
        whose processing raises, a placeholder row whose Text is the string
        ``'None'`` is appended.
    """
    results = []

    for transcript_key, transcript_text in transcripts.items():
        # Fill the template with this transcript's text.
        prompt = prompt_template.format(transcripts=transcript_text)

        try:
            # Ask the model for the extraction, capped at 200 tokens.
            completion = pg.Completion.create(
                model="Nous-Hermes-Llama2-13B",
                prompt=prompt,
                max_tokens=200,  # Adjust max_tokens as needed
            )
            generated_text = completion['choices'][0]['text']

            # Translate the model output to English (source auto-detected).
            translated = Translator().translate(generated_text, src='auto', dest='en')

            # One output row per line of translated text, each with a fresh UUID.
            results.extend(
                {'Id': str(uuid.uuid4()), 'Text': line.strip()}
                for line in translated.text.split('\n')
            )

            print(f"Processed transcript {transcript_key}")

        except Exception as err:
            # Best-effort: report the failure and record a placeholder row so
            # the run continues with the next transcript.
            print(f"Error processing transcript {transcript_key}: {err}")
            results.append({'Id': str(uuid.uuid4()), 'Text': 'None'})

    return results




# Upper bound on how many transcripts are processed, taken in file order.
# Lower this (e.g. to 100) for a quick trial run.
MAX_TRANSCRIPTS = 4917
subset_transcripts = {key: transcripts[key] for key in list(transcripts)[:MAX_TRANSCRIPTS]}

# Define the LLM prompt template. `{transcripts}` is the only placeholder; it
# is filled with a single transcript's text via str.format in process_batch.
prompt_template = """### Instruction:
Your task is to parse a JSON containing the medical transcript. This should consist of the following information:
Patient's Name: Look for any mention of the patient's name in the provided text in English.
Patient's Age: Extract information about the age of the patient if available in English.
Medical Condition: Identify and extract details about the patient's medical condition or health issue in English.
Symptoms: Find and list any symptoms mentioned by the patient in English.
Precautions: Look for recommendations or precautions given to the patient for managing their health in English.
Medication/Drug: Identify and extract information about any medication or drug mentioned in the text in English.

### Input:
{transcripts}

### Response:
"""

def _process_single(transcript_key):
    """Run process_batch on a one-item batch so each transcript is its own task."""
    return process_batch({transcript_key: subset_transcripts[transcript_key]}, prompt_template)


# Fan the transcripts out across a thread pool (presumably worthwhile because
# each task mostly waits on remote calls — confirm).
with ThreadPoolExecutor() as executor:
    # executor.map yields results in input order, so rows stay grouped in
    # transcript order.
    results = list(executor.map(_process_single, subset_transcripts))

# Flatten the per-transcript row lists into one list of rows.
all_results = [row for batch_result in results for row in batch_result]

# Save all the results to a CSV file (written after the pool has shut down;
# nothing here needs the executor).
csv_file_path = "set_results.csv"
with open(csv_file_path, 'w', newline='', encoding='utf-8') as csv_file:
    fieldnames = ['Id', 'Text']
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(all_results)

print(f"Results have been written to {csv_file_path}")


In [None]:
import csv
import uuid
from googletrans import Translator

# NOTE(review): a function with this name and the same behavior is also
# defined in the thread-pool cell; whichever cell ran last provides the
# definition in use.
def process_batch(transcripts, prompt_template):
    """Extract information from each transcript and return the output rows.

    Args:
        transcripts: Mapping of transcript key -> transcript text.
        prompt_template: Format string with a ``{transcripts}`` placeholder
            that receives the transcript text.

    Returns:
        A list of ``{'Id': <uuid4 string>, 'Text': <str>}`` dicts. Each line
        of the translated model output becomes one row; for any transcript
        whose processing raises, a placeholder row whose Text is the string
        ``'None'`` is appended.
    """
    results = []

    for transcript_key, transcript_text in transcripts.items():
        # Fill the template with this transcript's text.
        prompt = prompt_template.format(transcripts=transcript_text)

        try:
            # Ask the model for the extraction, capped at 200 tokens.
            completion = pg.Completion.create(
                model="Nous-Hermes-Llama2-13B",
                prompt=prompt,
                max_tokens=200,  # Adjust max_tokens as needed
            )
            generated_text = completion['choices'][0]['text']

            # Translate the model output to English (source auto-detected).
            translated = Translator().translate(generated_text, src='auto', dest='en')

            # One output row per line of translated text, each with a fresh UUID.
            results.extend(
                {'Id': str(uuid.uuid4()), 'Text': line.strip()}
                for line in translated.text.split('\n')
            )

            print(f"Processed transcript {transcript_key}")

        except Exception as err:
            # Best-effort: report the failure and record a placeholder row so
            # the run continues with the next transcript.
            print(f"Error processing transcript {transcript_key}: {err}")
            results.append({'Id': str(uuid.uuid4()), 'Text': 'None'})

    return results



# Upper bound on how many transcripts are processed, taken in file order.
# Lower this (e.g. to 100) for a quick trial run.
MAX_TRANSCRIPTS = 4917
subset_transcripts = {key: transcripts[key] for key in list(transcripts)[:MAX_TRANSCRIPTS]}

# Define the LLM prompt template. `{transcripts}` is the only placeholder; it
# is filled with a single transcript's text via str.format in process_batch.
prompt_template = """### Instruction:
Your task is to parse a JSON containing the medical transcript. This should consist of the following information:
Patient's Name: Look for any mention of the patient's name in the provided text in English.
Patient's Age: Extract information about the age of the patient if available in English.
Medical Condition: Identify and extract details about the patient's medical condition or health issue in English.
Symptoms: Find and list any symptoms mentioned by the patient in English.
Precautions: Look for recommendations or precautions given to the patient for managing their health in English.
Medication/Drug: Identify and extract information about any medication or drug mentioned in the text in English.

### Input:
{transcripts}

### Response:
"""

# process_batch already walks a dict of transcripts in order and accumulates
# the rows, so the whole subset is handed over in a single call instead of
# wrapping every transcript in its own one-item dict.
all_results = process_batch(subset_transcripts, prompt_template)

# Save all the results to a CSV file.
# NOTE(review): the thread-pool cell writes to the same file name, so running
# both cells overwrites the earlier output.
csv_file_path = "set_results.csv"
with open(csv_file_path, 'w', newline='', encoding='utf-8') as csv_file:
    fieldnames = ['Id', 'Text']
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(all_results)

print(f"Results have been written to {csv_file_path}")


In [None]:
from google.colab import files

# Name of the results CSV produced by the processing cells, resolved relative
# to the current working directory.
filename = 'set_results.csv'

# Ask Colab to download the file to the local machine.
files.download(filename)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>