In [1]:
%pip install astrapy python-dotenv pandas datasets

Note: you may need to restart the kernel to use updated packages.


In [2]:
import json
from astrapy.db import AstraDBCollection, AstraDB
import os
from dotenv import load_dotenv
import pandas as pd
from datasets import Dataset
from pprint import pprint
import random
from IPython.display import clear_output
import time

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
load_dotenv()
# AstraDB connection information
token = os.getenv("token")
endpoint = os.getenv("endpoint")

collection_name = "test_instructions"
astra_db = AstraDB(token=token, api_endpoint=endpoint)
collection = AstraDBCollection(collection_name=collection_name, astra_db=astra_db)

In [4]:
nextPageState = ""
raw_dataset = []
expected_columns = ['_id','instruction', 'input', 'output', 'original_llm_response', 'fine_tuned_llm_response']

def check_expected_columns(raw_instruction):
  if all(column in raw_instruction for column in expected_columns):
    return True
  else:
    return False

while nextPageState != None:
  if nextPageState == "":
    data = collection.find()
    nextPageState = data['data']['nextPageState']
    raw_instructions = [instruction for instruction in data['data']['documents'] if check_expected_columns(instruction)]
    raw_dataset.extend(raw_instructions)
  else:
    data = collection.find(options={"pageState":nextPageState}, sort = None)
    nextPageState = data['data']['nextPageState']
    raw_instructions = [instruction for instruction in data['data']['documents'] if check_expected_columns(instruction)]
    raw_dataset.extend(raw_instructions)

dataframe = pd.DataFrame(data=raw_dataset, dtype='string')
#dataframe.info()
dataset = Dataset.from_pandas(dataframe)
#pprint(dataset[0])

idx_min = 0
idx_max = 100
partial_dataset = dataset.filter(lambda example, idx: idx >= idx_min and idx < idx_max, with_indices=True)
shuffled_dataset = partial_dataset.shuffle().flatten_indices()

Filter: 100%|██████████| 95/95 [00:00<00:00, 20759.55 examples/s]
Flattening the indices: 100%|██████████| 95/95 [00:00<00:00, 18417.33 examples/s]


In [5]:
def split_long_string(string, width):
    if len(string) > width:
        return [string[0:width]]+split_long_string(string[width:], width)
    else:
        return [string]
        
def print_two_columns(string1, string2, width):
    i = 0
    list1 = [ split_long_string(line, width) for line in string1.split("\n") ]
    list2 = [ split_long_string(line, width) for line in string2.split("\n") ]
    lines1 = [item for row in list1 for item in row]
    lines2 = [item for row in list2 for item in row]
    max_len = max(len(lines1), len(lines2))
    table = []
    padding = 10
    col_width = width + padding
    for i in range(max_len):
        column1 = ""
        column2 = ""
        if i < len(lines1):
            column1 = lines1[i]
        if i < len(lines2):
            column2 = lines2[i]
        row = [column1, column2]
        table.append(row)
    for row in table:
        format_string = "{:>"+str(col_width)+"} {:>"+str(col_width)+"}"
        print(format_string.format(*row))



In [6]:
user_name = "oby"

In [8]:
eval_responses = []

for row in shuffled_dataset:
    order = random.randint(0,1)
    if order == 0:
        responses = ["Response #1: \n"+row['original_llm_response'], "Response #2: \n"+row['fine_tuned_llm_response']]
    else:
        responses = ["Response #1: \n"+row['fine_tuned_llm_response'], "Response #2: \n"+row['original_llm_response']]
    print("Rate each of the following responses to the input prompt out of ten.")
    print("Instruction: ")
    pprint(row['instruction'])
    print("Context:")
    pprint(row['input'])
    print("\n")
    print_two_columns(responses[0], responses[1], 70)
    time.sleep(1)
    row_result = input("Please enter your ratings as two numbers separated by a comma.")
    if "," in row_result:
        row_result = [int(rating) for rating in row_result.split(",")]
    if "." in row_result:
        row_result = [int(rating) for rating in row_result.split(".")]
    if " " in row_result:
        row_result = [int(rating) for rating in row_result.split(" ")]
    original_rating = 0
    fine_tuned_rating = 0
    if order == 0:
        original_rating = row_result[0]
        fine_tuned_rating = row_result[1]
    else:
        original_rating = row_result[1]
        fine_tuned_rating = row_result[0]

    output_line = {
        "_id": row['_id'],
        "original_rating": original_rating,
        "fine_tuned_rating": fine_tuned_rating
    }

    eval_responses.append(output_line)
    clear_output(wait=True)

with open(user_name+'_eval_out_of_ten.json', 'w', encoding='utf-8') as f:
    json.dump(eval_responses, f, ensure_ascii=False, indent=4)

    

Rate each of the following responses to the input prompt out of ten.
Instruction: 
('Explain the features of NoSQL technologies, particularly focusing on '
 "Cassandra's architecture and benefits.")
Context:
('NoSQL databases like Apache Cassandra are optimized for modern data '
 'applications that require large data volume, low latency, and flexible data '
 'models. Cassandra is an obvious choice with its high throughput and ability '
 'to support globally distributed and always-on apps. In addition, Apache '
 'Pulsar is highlighted as an advanced, open-source streaming and messaging '
 'technology ideal for handling real-time data. Finally, Stargate, an '
 'open-source data API layer, empowers developers to build apps with freedom '
 'of choice and without operational distractions.')


                                                                   Response #1:                                                                     Response #2: 
                                       

IndexError: string index out of range