In [1]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    GenerationConfig,
    LlamaForCausalLM,
    LlamaTokenizer,
    pipeline,
)
import os
import logging
import torch
from datasets import load_dataset
from evaluate import evaluator
from datasets import Dataset
from evaluate import load

os.environ['HUGGINGFACEHUB_API_TOKEN'] = "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"
!huggingface-cli login --token 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

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [2]:
model_name = "meta-llama/Llama-2-7b-chat-hf"
# model_name = "atom92/medical_lama_2_all"
# model_name = "atom92/medical_lama_ultra"
# model_name = "atom92/medical-token-llama-2-healthwa-3"

# dataset_name = "atom92/medical_healthwa_all_2.0"
dataset_name = "atom92/medical_healthwa_2.0"

eval_set_size = 0.1
device_map = {"": 0}

In [3]:
my_dataset = load_dataset(dataset_name, split="train")
splitted_dataset = my_dataset.train_test_split(test_size=eval_set_size, shuffle=True, seed=42)
eval_dataset = splitted_dataset["test"].shuffle(seed=42).select(range(500))

In [4]:
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
logging.info("Tokenizer loaded")

In [5]:
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=device_map,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    trust_remote_code=True,
    max_memory={0: "18GiB", 1: "18GiB"},
    offload_folder="/tmp/offload"
)
model.tie_weights()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=254,
    do_sample=True,
    temperature=0.9,
    top_p=0.5,
    top_k=50,
    repetition_penalty=1.1,
)

In [7]:
task_evaluator = evaluator("text-generation")

In [8]:
def extract_question_and_context(text):
    # Assuming the structure is <START_Q> Question <END_Q><START_A> Answer <END_A>
    split_text = text.split("<END_Q>")
    question = split_text[0].replace("<START_Q>", "").strip()
    answer = split_text[1].replace("<START_A>", "").split("<END_A>")[0].strip()
    return question, answer

In [9]:
for a in eval_dataset:
    print(len(a["text"]))

202
188
219
291
228
178
215
152
166
190
148
190
170
194
176
258
189
294
203
231
237
204
223
152
267
140
156
173
271
221
221
148
251
167
268
168
291
238
323
200
184
262
231
228
258
189
228
151
201
196
227
219
211
307
241
201
153
148
153
296
193
260
273
133
259
189
230
320
206
297
168
137
197
204
152
122
161
181
199
262
183
189
162
190
328
174
209
168
170
257
234
197
190
234
158
243
180
223
214
193
164
147
221
206
122
282
194
141
212
286
186
212
218
234
144
161
200
119
219
254
234
188
250
157
201
115
186
184
167
133
218
245
176
192
229
175
300
206
206
203
207
180
297
252
231
270
112
203
230
219
224
161
224
99
242
226
218
180
203
264
181
123
145
210
168
182
212
204
257
158
173
214
178
219
190
220
269
158
307
215
221
244
123
115
159
242
176
208
302
198
247
258
231
252
204
148
212
242
234
192
221
237
256
274
191
192
185
286
202
293
228
147
204
231
255
250
225
245
226
258
154
186
207
215
235
182
197
168
250
190
116
210
352
191
178
302
221
195
165
282
137
248
194
183
230
202
242
138
179
160
3

In [10]:
predictions = []
references = []
i = 0
for example in eval_dataset:
    i += 1
    text = example["text"]
    if len(text) > 4000:
        continue
    question, answer = extract_question_and_context(text)
    output = pipe(question)[0]['generated_text']
    predictions.append(output)
    references.append(answer)
    print(i)

1
2
3
4
5
6
7
8
9
10




11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
2

In [38]:
with open('eval/small/predictions.txt', 'w', encoding='utf-8') as file:
    for string in predictions:
        file.write(string + '\n%%%%%%\n')
with open('eval/small/references.txt', 'w', encoding='utf-8') as file:
    for string in references:
        file.write(string + '\n%%%%%%\n')

In [39]:
def read_data(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
        entries = content.strip().split('\n%%%%%%\n')
    return entries

In [40]:
predictions1 = read_data('eval/original/predictions.txt')
references1 = read_data('eval/original/references.txt')

In [41]:
len(references1)

500

In [42]:
len(predictions1)

500

In [51]:
bleu = load("bleu")
results = bleu.compute(predictions=predictions, references=references)
print(results)

{'bleu': 0.024399983142780034, 'precisions': [0.07481244483671669, 0.030874195695584646, 0.016343150379294957, 0.009389724029616334], 'brevity_penalty': 1.0, 'length_ratio': 8.16503017746149, 'translation_length': 90640, 'reference_length': 11101}


In [52]:
rouge = load('rouge')
results_rouge = rouge.compute(predictions=predictions, references=references)
print(results_rouge)

{'rouge1': 0.14482519792654786, 'rouge2': 0.0645320223256749, 'rougeL': 0.11697601685379733, 'rougeLsum': 0.12521094367240043}


In [53]:
bertscore = load("bertscore")
results_bertscore = bertscore.compute(predictions=predictions, references=references, lang="en")
print(results_bertscore)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'precision': [0.8043017387390137, 0.8288896083831787, 0.8414512276649475, 0.840823769569397, 0.816332221031189, 0.8001759648323059, 0.8183832764625549, 0.791878342628479, 0.8111646175384521, 0.8161835670471191, 0.795813262462616, 0.8069126009941101, 0.7928669452667236, 0.8365243077278137, 0.8197404146194458, 0.8402665853500366, 0.8180775046348572, 0.8284139633178711, 0.8163959383964539, 0.8202522993087769, 0.8000416159629822, 0.8155336976051331, 0.8245928287506104, 0.8210825324058533, 0.8383650779724121, 0.7941462397575378, 0.8109501600265503, 0.8292733430862427, 0.8309017419815063, 0.8396703004837036, 0.8177458047866821, 0.8227636814117432, 0.8296363353729248, 0.8139446973800659, 0.8316292762756348, 0.8316047787666321, 0.8144471049308777, 0.8373436331748962, 0.8201019763946533, 0.8225778341293335, 0.816748857498169, 0.8163303136825562, 0.8194510340690613, 0.8258833885192871, 0.7979540824890137, 0.8389042615890503, 0.8107413053512573, 0.8044999837875366, 0.8312932848930359, 0.81612432

In [54]:
arr = results_bertscore["precision"]
print(sum(arr)/len(arr))

0.819754429936409


In [55]:
meteor_metric = load("meteor")
results_meteor = meteor_metric.compute(predictions=predictions, references=references)
print(results_meteor)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


{'meteor': 0.24556055593594978}


In [56]:
frugalscore = load("frugalscore")
results_frugalscore = frugalscore.compute(predictions=predictions, references=references)
print(results_frugalscore)

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'scores': [0.48242188, 0.6875, 0.61621094, 0.64746094, 0.56347656, 0.4765625, 0.5732422, 0.49243164, 0.5673828, 0.6020508, 0.5385742, 0.515625, 0.4802246, 0.60058594, 0.6118164, 0.6303711, 0.6142578, 0.5473633, 0.55322266, 0.59716797, 0.52001953, 0.6464844, 0.5756836, 0.5004883, 0.66503906, 0.5415039, 0.5288086, 0.6069336, 0.609375, 0.50390625, 0.50683594, 0.51171875, 0.5727539, 0.50097656, 0.61865234, 0.62646484, 0.64941406, 0.6333008, 0.4831543, 0.6069336, 0.52001953, 0.5678711, 0.6044922, 0.6386719, 0.4609375, 0.65722656, 0.5048828, 0.54785156, 0.6191406, 0.5620117, 0.63916016, 0.61816406, 0.5683594, 0.66748047, 0.47558594, 0.51904297, 0.46240234, 0.57910156, 0.6113281, 0.66845703, 0.5917969, 0.5888672, 0.54833984, 0.5209961, 0.64941406, 0.5390625, 0.57958984, 0.5620117, 0.6269531, 0.56689453, 0.5942383, 0.5234375, 0.6430664, 0.5761719, 0.5629883, 0.39648438, 0.5029297, 0.69189453, 0.5551758, 0.63183594, 0.59277344, 0.5991211, 0.6274414, 0.45629883, 0.64990234, 0.6123047, 0.4816894

In [None]:
arr = results_frugalscore["scores"]
print(sum(arr)/len(arr))

In [13]:
!nvidia-smi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Tue Dec 19 22:34:48 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.113.01             Driver Version: 535.113.01   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 4090        On  | 00000000:81:00.0 Off |                  Off |
| 30%   33C    P2              71W / 450W |  14785MiB / 24564MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [13]:
predictions

['Where can individuals seek help for anxiety-related issues?\nA: There are several resources available to individuals seeking help for anxiety-related issues. Here are some options:\n\n1. Mental Health Professionals: Psychologists, psychiatrists, and therapists are trained to diagnose and treat anxiety disorders. They can provide individual or group therapy, cognitive-behavioral therapy (CBT), and other forms of treatment. You can find mental health professionals in your area through your insurance provider or by searching online.\n2. Counseling Centers: Many colleges and universities have counseling centers that offer free or low-cost therapy sessions. These centers often have experienced therapists who specialize in treating anxiety disorders.\n3. Community Mental Health Centers: These centers offer affordable therapy sessions and may have sliding scale fees based on income. They also may offer group therapy and other forms of treatment.\n4. Online Therapy Platforms: There are many 

In [19]:
with open('eval/original/prediction.txt', 'w', encoding='utf-8') as file:
    for string in predictions:
        file.write(string + '\n')

In [20]:
with open('eval/original/prediction.txt', 'w', encoding='utf-8') as file:
    for string in predictions:
        file.write(string + '\n%%%%%%')
with open('eval/original/references.txt', 'w', encoding='utf-8') as file:
    for string in references:
        file.write(string + '\n%%%%%%')

In [24]:
!pip install bert_score

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m579.6 kB/s[0m eta [36m0:00:00[0m [36m0:00:01[0mm
Collecting transformers>=3.0.0
  Using cached transformers-4.36.2-py3-none-any.whl (8.2 MB)
Collecting matplotlib
  Downloading matplotlib-3.8.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.6/11.6 MB[0m [31m30.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting pandas>=1.0.1
  Using cached pandas-2.1.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.3 MB)
Collecting tzdata>=2022.1
  Using cached tzdata-2023.4-py2.py3-none-any.whl (346 kB)
Collecting safetensors>=0.3.1
  Using cached safetensors-0.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
Collecting regex!=2019.12.17
  Using cached regex-2023.12.25-cp310-cp310-manylinux_2_17_x86

In [26]:
!pip install bert_score

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[0m

In [29]:
with open('eval/original/references.txt', 'r', encoding='utf-8') as file:
    # Read each line and store it in a list
    a = [line.strip() for line in file]

In [30]:
a

['Individuals can seek help from their doctor, healthdirect, or the Mental Health Emergency Response Line (MHERL).</s>',
 "Mpox is not typically transmitted through urine, but close contact with an infected person's urine can pose a risk.",
 'The primary mode of transmission of rabies from infected animals to humans is through bites or scratches.</s>',
 "Doctors do not recommend using vaporizers in a child's room to help with croup symptoms, as there is no evidence that they are effective, and there is a risk of burns from the steam.</s>",
 'Lifestyle modifications may include reducing salt intake, regular exercise, and maintaining a healthy weight.</s>',
 'Parents can use paracetamol as directed to help alleviate discomfort.</s>',
 'A doctor can provide treatment and guidance for Salmonella infection, especially in cases of severe or prolonged symptoms.</s>',
 'Raw meats can contain STEC bacteria, increasing the risk of infection.</s>',
 'Normally, the body and vaginal bacteria preven

In [31]:
references

['Individuals can seek help from their doctor, healthdirect, or the Mental Health Emergency Response Line (MHERL).</s>',
 "Mpox is not typically transmitted through urine, but close contact with an infected person's urine can pose a risk.",
 'The primary mode of transmission of rabies from infected animals to humans is through bites or scratches.</s>',
 "Doctors do not recommend using vaporizers in a child's room to help with croup symptoms, as there is no evidence that they are effective, and there is a risk of burns from the steam.</s>",
 'Lifestyle modifications may include reducing salt intake, regular exercise, and maintaining a healthy weight.</s>',
 'Parents can use paracetamol as directed to help alleviate discomfort.</s>',
 'A doctor can provide treatment and guidance for Salmonella infection, especially in cases of severe or prolonged symptoms.</s>',
 'Raw meats can contain STEC bacteria, increasing the risk of infection.</s>',
 'Normally, the body and vaginal bacteria preven