### Imports

In [1]:
from definitions import *

### Make quiz prompt

In [2]:
with open("../../metrics/knowledge_tests/obrazovaka/oblomov.json", "r") as f:
    quiz = json.load(f)

In [3]:
part = quiz[0]
part

{'question': 'К какому литературному направлению относится роман Гончарова «Обломов»?',
 'answers': [{'answer': 'Романтизм;', 'is_correct': 0},
  {'answer': 'Реализм;', 'is_correct': 1},
  {'answer': 'Классицизм;', 'is_correct': 0},
  {'answer': 'Сентиментализм.', 'is_correct': 0}]}

In [4]:
def build_prompt(quiz_part):
    processed_answers = ""
    for i, answer in enumerate(quiz_part["answers"]):
        processed_answers += f"{i + 1}. {answer['answer']}\n"
    # return f"Выбери один верный вариант из предложенных.\nВопрос: {quiz_part['question']}\nВарианты ответа:\n{processed_answers}"
    return f"Выбери один верный вариант из предложенных. В качестве ответа напиши только номер, без дополнительного текста. Например: '1', '2', '3', '4'.\nВопрос: {quiz_part['question']}\nВарианты ответа:\n{processed_answers}"

In [5]:
print(build_prompt(part))

Выбери один верный вариант из предложенных. В качестве ответа напиши только номер, без дополнительного текста. Например: '1', '2', '3', '4'.
Вопрос: К какому литературному направлению относится роман Гончарова «Обломов»?
Варианты ответа:
1. Романтизм;
2. Реализм;
3. Классицизм;
4. Сентиментализм.



### Learn to make requests to ollama

In [47]:
! ollama list

NAME                    	ID          	SIZE  	MODIFIED     
dolphin-phi:latest      	c5761fc77240	1.6 GB	14 hours ago	
gemma:2b                	b50d6c999e59	1.7 GB	7 hours ago 	
gemma:7b                	a72c7f4d0a15	5.0 GB	7 hours ago 	
llama2:13b              	d475bf4c50bc	7.4 GB	7 hours ago 	
llama2:70b              	e7f6c06ffef4	38 GB 	5 hours ago 	
llama2:latest           	78e26419b446	3.8 GB	14 hours ago	
llama2-uncensored:latest	44040b922233	3.8 GB	13 hours ago	
llama3:70b              	bcfb190ca3a7	39 GB 	6 hours ago 	
llama3:latest           	71a106a91016	4.7 GB	16 hours ago	
llava:latest            	8dd30f6b0cb1	4.7 GB	7 hours ago 	
mistral:latest          	61e88e884507	4.1 GB	14 hours ago	
neural-chat:latest      	89fa737d3b85	4.1 GB	14 hours ago	
orca-mini:latest        	2dbd9f439647	2.0 GB	7 hours ago 	
phi:latest              	e2fd6321a5fe	1.6 GB	14 hours ago	
solar:latest            	059fdabbe6e6	6.1 GB	7 hours ago 	
starling-lm:latest      	39153f619be6	4.1 GB	14 hours ag

In [13]:
def get_model_response(model, prompt):
    headers = {
        'Content-Type': 'application/json',
    }
    json_data = {
        'model': model,
        'prompt': prompt,
        'stream': False,
        "options": {
            "seed": 123,
            "temperature": 0
        }
    }

    response = requests.post('http://localhost:11434/api/generate', headers=headers, json=json_data)
    payload = json.loads(response.content)
    return payload["response"]

In [48]:
get_model_response("llama3", "say hi")

"Hi! How's it going?"

### Try to pass quiz with a model

In [16]:
prompt = build_prompt(part)
print(prompt)
get_model_response(model, prompt)

Выбери один верный вариант из предложенных. В качестве ответа напиши только номер, без дополнительного текста. Например: '1', '2', '3', '4'.
Вопрос: В чем автор видит причины «обломовщины»?
Варианты ответа:
1. В деградации и застое чистой, нерасчетливой, нежной, но ленивой души героя;
2. В окружении героя, которое не давало ему развиваться;
3. В недостаточно хорошем образовании героя;
4. В отсутствии достойных стимулов к развитию.



'1.'

In [17]:
len(quiz)

19

In [49]:
def eval_model_on_quiz(model, quiz):
    num_ok = 0
    num_all = 0
    for part in tqdm(quiz):
        prompt = build_prompt(part)
        answer = get_model_response(model, prompt)
        first_part = answer.split()[0]
        if first_part.endswith("."):
            first_part = first_part[:-1]
        ok_answer = -1
        for i, x in enumerate(part["answers"]):
            if x["is_correct"] == 1:
                ok_answer = i + 1
        try:
            first_part = int(first_part)
        except Exception:
            print(f"Cannot convert model first part answer to int: '{answer}'")
        if ok_answer == first_part:
            num_ok += 1
        num_all += 1
    print(f"result is {num_ok} out of {num_all}. {num_ok / num_all * 100:.2f}%")
    return num_ok, num_all

In [50]:
model = "llama3"
tests_dir = Path("../../metrics/knowledge_tests/obrazovaka/")
rows = []
for test_path in tests_dir.glob("*.json"):
    with open(test_path, "r") as f:
        quiz = json.load(f)
    num_ok, num_all = eval_model_on_quiz(model, quiz)
    rows.append({
        "model": model,
        "test_path": test_path,
        "num_ok": num_ok,
        "num_all": num_all,
        "percent": float(f"{(num_ok / num_all * 100):.2f}"),
    })

100%|██████████| 16/16 [00:08<00:00,  1.82it/s]


result is 3 out of 16. 18.75%


100%|██████████| 10/10 [00:03<00:00,  2.71it/s]


result is 5 out of 10. 50.00%


100%|██████████| 18/18 [00:08<00:00,  2.02it/s]


result is 6 out of 18. 33.33%


100%|██████████| 19/19 [00:10<00:00,  1.80it/s]


result is 12 out of 19. 63.16%


100%|██████████| 10/10 [00:03<00:00,  2.72it/s]


result is 2 out of 10. 20.00%


100%|██████████| 13/13 [00:05<00:00,  2.39it/s]


result is 2 out of 13. 15.38%


100%|██████████| 70/70 [00:35<00:00,  1.96it/s]


result is 38 out of 70. 54.29%


100%|██████████| 16/16 [00:07<00:00,  2.09it/s]


result is 5 out of 16. 31.25%


100%|██████████| 19/19 [00:08<00:00,  2.24it/s]


result is 9 out of 19. 47.37%


100%|██████████| 10/10 [00:03<00:00,  2.71it/s]


result is 7 out of 10. 70.00%


100%|██████████| 10/10 [00:03<00:00,  2.82it/s]


result is 2 out of 10. 20.00%


100%|██████████| 30/30 [00:15<00:00,  1.89it/s]


result is 12 out of 30. 40.00%


100%|██████████| 12/12 [00:05<00:00,  2.16it/s]


result is 4 out of 12. 33.33%


100%|██████████| 16/16 [00:08<00:00,  1.91it/s]


result is 7 out of 16. 43.75%


100%|██████████| 10/10 [00:04<00:00,  2.47it/s]

result is 4 out of 10. 40.00%





In [43]:
# for row in rows:
    # row["percent"] = float(f'{(row["num_ok"] / row["num_all"] * 100):.2f}')

In [51]:
df = pd.DataFrame(rows)
df

Unnamed: 0,model,test_path,num_ok,num_all,percent
0,llama3,../../metrics/knowledge_tests/obrazovaka/the_d...,3,16,18.75
1,llama3,../../metrics/knowledge_tests/obrazovaka/the_b...,5,10,50.0
2,llama3,../../metrics/knowledge_tests/obrazovaka/and_q...,6,18,33.33
3,llama3,../../metrics/knowledge_tests/obrazovaka/oblom...,12,19,63.16
4,llama3,../../metrics/knowledge_tests/obrazovaka/eveni...,2,10,20.0
5,llama3,../../metrics/knowledge_tests/obrazovaka/docto...,2,13,15.38
6,llama3,../../metrics/knowledge_tests/obrazovaka/war_a...,38,70,54.29
7,llama3,../../metrics/knowledge_tests/obrazovaka/the_m...,5,16,31.25
8,llama3,../../metrics/knowledge_tests/obrazovaka/dead_...,9,19,47.37
9,llama3,../../metrics/knowledge_tests/obrazovaka/the_g...,7,10,70.0


In [52]:
init_df = pd.read_csv("../../metrics/knowledge_tests/obrazovaka/full_results.csv")
full_df = pd.concat((init_df, df))
full_df

Unnamed: 0,model,test_path,num_ok,num_all,percent
0,llama2,../../metrics/knowledge_tests/obrazovaka/the_d...,3,16,18.75
1,llama2,../../metrics/knowledge_tests/obrazovaka/the_b...,1,10,10.0
2,llama2,../../metrics/knowledge_tests/obrazovaka/and_q...,6,18,33.33
3,llama2,../../metrics/knowledge_tests/obrazovaka/oblom...,7,19,36.84
4,llama2,../../metrics/knowledge_tests/obrazovaka/eveni...,1,10,10.0
5,llama2,../../metrics/knowledge_tests/obrazovaka/docto...,3,13,23.08
6,llama2,../../metrics/knowledge_tests/obrazovaka/war_a...,17,70,24.29
7,llama2,../../metrics/knowledge_tests/obrazovaka/the_m...,4,16,25.0
8,llama2,../../metrics/knowledge_tests/obrazovaka/dead_...,4,19,21.05
9,llama2,../../metrics/knowledge_tests/obrazovaka/the_g...,2,10,20.0


In [53]:
full_df.to_csv("../../metrics/knowledge_tests/obrazovaka/full_results.csv", index=False)