In [1]:
import datasets
import openai
import pandas as pd
import time
import os
from datasets import Dataset
from openai import OpenAI
from tqdm import tqdm

In [2]:
# Credentials are read from the environment so no secret is stored in the notebook.
hf_token = os.environ.get("HUGGING_FACE_TOKEN")
openai_api_key = os.environ.get("OPENAI_API_KEY")

# Hand the key to the client explicitly instead of leaving `openai_api_key`
# unused and relying on the SDK's own environment lookup.
client = OpenAI(api_key=openai_api_key)

In [3]:
# Fetch the evaluation dataset, authenticating with the token read from the environment.
# NOTE(review): newer `datasets` releases appear to deprecate `use_auth_token`
# in favour of `token` — confirm against the installed version before changing it.
evaluation_data = datasets.load_dataset(
    "israel/JOPUjJHxWmI5xXf",
    use_auth_token=hf_token,
)



In [4]:
def query_gpt4(instruction, input_text):
    """Send one prompt to GPT-4 and return the reply text.

    The prompt is a single user message made of ``instruction`` and
    ``input_text`` separated by a newline.

    Args:
        instruction: Task description placed at the start of the prompt.
        input_text: Text the instruction should be applied to.

    Returns:
        The content of the first returned choice with surrounding whitespace
        removed, or an empty string when the reply carries no text content.

    Raises:
        Any exception raised by ``client.chat.completions.create`` propagates
        unchanged to the caller.
    """
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "user", "content": f"{instruction}\n{input_text}"}
        ]
    )
    # The message content is nullable in the API response; guard against None
    # so a content-less reply does not raise AttributeError on .strip().
    content = response.choices[0].message.content
    return (content or "").strip()

In [5]:
def save_to_csv(data, filename):
    """Write the collected rows to ``filename`` as CSV.

    Args:
        data: Sequence of 5-item rows ordered as instruction, input, output,
            datasource, response.
        filename: Destination path; an existing file is replaced.

    The file gets a header row and no index column.
    """
    column_names = ['instruction', 'input', 'output', 'datasource', 'response']
    pd.DataFrame(data, columns=column_names).to_csv(filename, index=False)

In [6]:
def evaluate(test_data, data_sources, output_dir='model_evaluation', sleep_duration=2, resume=False):
    """Query GPT-4 for each selected test example and checkpoint the responses to CSV.

    Examples whose 'datasource' is in ``data_sources`` are sent to
    ``query_gpt4`` one at a time. After every successful response the full
    list of rows is rewritten to ``<output_dir>/gpt4_responses.csv``, so an
    interrupted run loses at most the request in flight. The first exception
    inside the loop is printed and stops the run.

    Args:
        test_data: Dataset supporting ``filter`` and slice indexing, with
            'instruction', 'input', 'output' and 'datasource' columns
            (presumably a Hugging Face ``Dataset`` split — confirm with caller).
        data_sources: Container of 'datasource' values to evaluate.
        output_dir: Directory that holds the CSV; created when missing.
        sleep_duration: Seconds to wait after each successful request.
        resume: When True and the CSV already exists, keep its rows and skip
            that many examples of the filtered data. The skip count is simply
            the number of stored rows, so the same ``data_sources`` must be
            used as in the run being resumed.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    os.makedirs(output_dir, exist_ok=True)
    output_filename = os.path.join(output_dir, "gpt4_responses.csv")

    data = []
    start_index = 0

    if resume and os.path.exists(output_filename):
        # Read the checkpoint back as raw strings. With pandas' defaults,
        # stored text such as "NA", "null" or an empty field becomes NaN and
        # would be written out blank on the next save, silently altering
        # rows that were already collected.
        df = pd.read_csv(output_filename, dtype=str, keep_default_na=False)
        start_index = len(df)
        data.extend(df.values.tolist())
        print(f"Resuming from Test Case {start_index + 1}")

    datasource_data = test_data.filter(lambda example: example['datasource'] in data_sources)

    # The slice is indexed by column name below, i.e. it is used as a
    # column -> list-of-values mapping rather than as a sequence of rows.
    resume_data = datasource_data[start_index:]

    examples = zip(
        resume_data['instruction'],
        resume_data['input'],
        resume_data['output'],
        resume_data['datasource']
    )

    for i, (instruction, input_text, output_text, data_source) in enumerate(
        tqdm(examples, total=len(resume_data['input']))
    ):
        try:
            response = query_gpt4(instruction, input_text)
            data.append([instruction, input_text, output_text, data_source, response])
            # Rewrite the whole file each time so the checkpoint is always complete.
            save_to_csv(data, output_filename)
        except Exception as e:
            print(f"API Error for {data_source} - Test Case {start_index + i + 1}: {str(e)}")
            break

        time.sleep(sleep_duration)  # Add a sleep to avoid API rate limits

    print(f"Saved responses to {output_filename}")


In [7]:
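# Optional smoke test: uncomment to build a two-example subset of the test split
# before running the full evaluation.
# sample_test = Dataset.from_dict(evaluation_data['test'][:2])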

In [8]:
# List the distinct 'datasource' labels in the test split; evaluate() filters on
# this column, so these are the values its `data_sources` argument can select.
evaluation_data['test'].unique('datasource')

['afrisent',
 'masakhanews',
 'masakhaner',
 'xlsum',
 'amharic_spellcheck',
 'amharic_poem',
 'amharic_zefen',
 'amharic_story_generation',
 'amharic_mezmur_completion',
 'amharic_mezmur_generation',
 'amharic_mt']

In [12]:
# Run the GPT-4 evaluation on the two selected data sources from scratch:
# with resume=False no previous rows are kept, so an existing responses file
# is replaced as soon as the first new row is saved.
evaluate(
    test_data=evaluation_data['test'],
    data_sources=['afrisent', 'masakhanews'],
    resume=False,
)

In [10]:
# Load the saved GPT-4 responses; the path is evaluate()'s default output_dir
# combined with its fixed file name.
response_df = pd.read_csv('model_evaluation/gpt4_responses.csv')

In [11]:
# Display the loaded responses for a quick visual check.
response_df

Unnamed: 0,instruction,input,output,datasource,response
0,"የተሰጠው ጽሑፍ አስተያየት ምን አይነት ነው? ""አዎንታዊ""፣ ""አሉታዊ"" ወ...",'@user ክብር እና ምስጋና ለዓለማት ፈጣሪ ይሁን',አዎንታዊ,afrisent,"ይህ ጽሑፍ ""አዎንታዊ"" ማለት ነው. ምክንያቱም ሰውየው ያለውን ግለሰቦች ..."
1,"የተሰጠው ጽሑፍ አስተያየት ምን አይነት ነው? ""አዎንታዊ""፣ ""አሉታዊ"" ወ...",'ከህወሓት ጋር ድርድር ማለት ኢትዮጲያን ማፍረስ ዕቁብ መጣል ነው። #No...,አሉታዊ,afrisent,"ይህ አስተያየት ""አዎንታዊ"" ነው። ተናግሪው አስተያየቱ ማለት ህወሓት የኢ..."
2,"የተሰጠው ጽሑፍ አስተያየት ምን አይነት ነው? ""አዎንታዊ""፣ ""አሉታዊ"" ወ...",'እግዚአብሔር የተመሰገነ ይሁን ኢትዮጵያ አሸነፈች ። እንኳን ደስአለኝ ።',አዎንታዊ,afrisent,"ይህ ጽሑፍ ""አዎንታዊ"" አስተያየት ነው።"
