In [1]:
import pickle
import pandas as pd

In [2]:
import os
from tqdm import tqdm

In [3]:
ROOT = "/gpfs/space/projects/stud_ml_22/NLP"
PATH_TO_CONVERTED_TOKENIZER = os.path.join(ROOT, "llama/7B_converted/")

In [4]:
with open(os.path.join(ROOT, "data/course_questions.pkl"), 'rb') as f:
    data = pickle.load(f, encoding='utf8')
data

Unnamed: 0,course_code,question,answer
0,OIEO.06.046,What is the title of the course OIEO.06.046?,The title of the course OIEO.06.046 is Private...
1,OIEO.06.046,What is the name of the course OIEO.06.046?,The name of the course OIEO.06.046 is Private ...
2,OIEO.06.046,What is the code of the course Private Interna...,The code for the course Private International ...
3,OIEO.06.046,How is the course OIEO.06.046 called?,The course OIEO.06.046 is called Private Inter...
4,OIEO.06.046,How many credits does the course OIEO.06.046 h...,The course OIEO.06.046 has 6 credits.
...,...,...,...
44,SVNC.00.273,Is SVNC.00.273 offered for bachelor's studies?,"Yes, SVNC.00.273 is offered for bachelor's stu..."
46,SVNC.00.273,Is SVNC.00.273 offered for master's studies?,"Yes, SVNC.00.273 is offered for master's studies."
48,SVNC.00.273,Is SVNC.00.273 offered for doctoral studies?,"No, SVNC.00.273 is not offered for doctoral st..."
50,SVNC.00.273,Is SVNC.00.273 offered for integrated bachelor...,"No, SVNC.00.273 is not offered for integrated ..."


In [5]:
data = data.reset_index()

In [7]:
val_data = data.sample(frac=0.05, random_state=42)
val_data.head()

Unnamed: 0,index,course_code,question,answer
19836,8,LOFY.05.051,How many credits does the course Master's Cour...,The course Master's Course in Biological Physi...
69651,13,P2PC.00.503,What is the structural unit of the course Fina...,The structural unit of the course Final Thesis...
101079,12,HVLC.03.030,What is the structural unit of the course HVLC...,The structural unit of the course HVLC.03.030 ...
43566,23,SHZU.01.015,Was the course Interviewing Techniques taught ...,The course Interviewing Techniques was tought ...
142557,3,HVLC.06.011,How is the course HVLC.06.011 called?,The course HVLC.06.011 is called Swedish for B...


In [14]:
len(train_data) / 8

17991.875

In [12]:
train_data = data.drop(val_data.index)
for i,r in train_data.iterrows():
    print(r['question'])
    print(r['answer'])
    
    if i > 200:
        break

What is the title of the course OIEO.06.046?
The title of the course OIEO.06.046 is Private International Law.
What is the name of the course OIEO.06.046?
The name of the course OIEO.06.046 is Private International Law.
What is the code of the course Private International Law?
The code for the course Private International Law is OIEO.06.046.
How is the course OIEO.06.046 called?
The course OIEO.06.046 is called Private International Law.
How many credits does the course OIEO.06.046 have?
The course OIEO.06.046 has 6 credits.
How many credits is the course OIEO.06.046 worth?
The course OIEO.06.046 is worth 6 credits.
How many credits is the course OIEO.06.046?
The course OIEO.06.046 is worth 6 credits.
How many credits can I get for the course OIEO.06.046?
You can get 6 credits for the course OIEO.06.046.
How many credits does the course Private International Law have?
The course Private International Law has 6 credits.
How many credits is the course Private International Law worth?
The

## Test dataset preparation from llama finetune tutorial

In [6]:
import torch
from torch.utils.data import IterableDataset
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
def chars_token_ratio(dataset, tokenizer, nb_examples=400):
    """
    Estimate the average number of characters per token in the dataset.
    """
    total_characters, total_tokens = 0, 0
    for _, example in tqdm(zip(range(nb_examples), dataset.iterrows()), total=nb_examples):
        text = prepare_sample_text(example[1])
        total_characters += len(text)
        if tokenizer.is_fast:
            total_tokens += len(tokenizer(text).tokens())
        else:
            total_tokens += len(tokenizer.tokenize(text))

    return total_characters / total_tokens


def prepare_sample_text(example):
    print(example)
    """Prepare the text from a sample of the dataset."""
    text = f"Question: {example['question']}\n\nAnswer: {example['answer']}"
    return text


class ConstantLengthDataset(IterableDataset):
    """
    Iterable dataset that returns constant length chunks of tokens from stream of text files.
        Args:
            tokenizer (Tokenizer): The processor used for proccessing the data.
            dataset (dataset.Dataset): Dataset with text files.
            infinite (bool): If True the iterator is reset after dataset reaches end else stops.
            seq_length (int): Length of token sequences to return.
            num_of_sequences (int): Number of token sequences to keep in buffer.
            chars_per_token (int): Number of characters per token used to estimate number of tokens in text buffer.
    """

    def __init__(
        self,
        tokenizer,
        dataset,
        infinite=False,
        seq_length=1024,
        num_of_sequences=1024,
        chars_per_token=3.6,
    ):
        self.tokenizer = tokenizer
        self.concat_token_id = tokenizer.eos_token_id if tokenizer.eos_token_id else args.eos_token_id
        self.dataset = dataset
        self.seq_length = seq_length
        self.infinite = infinite
        self.current_size = 0
        self.max_buffer_size = seq_length * chars_per_token * num_of_sequences

    def __iter__(self):
        iterator = self.dataset.iterrows()
        more_examples = True
        while more_examples:
            buffer, buffer_len = [], 0
            while True:
                if buffer_len >= self.max_buffer_size:
                    break
                try:
                    buffer.append(prepare_sample_text(next(iterator)[1]))
                    buffer_len += len(buffer[-1])
                except StopIteration:
                    if self.infinite:
                        iterator = iter(self.dataset)
                    else:
                        more_examples = False
                        break
            tokenized_inputs = self.tokenizer(buffer, truncation=False)["input_ids"]
            all_token_ids = []
            for tokenized_input in tokenized_inputs:
                all_token_ids.extend(tokenized_input + [self.concat_token_id])
            for i in range(0, len(all_token_ids), self.seq_length):
                input_ids = all_token_ids[i : i + self.seq_length]
                if len(input_ids) == self.seq_length:
                    self.current_size += 1
                    yield {
                        "input_ids": torch.LongTensor(input_ids),
                        "labels": torch.LongTensor(input_ids),
                    }


In [8]:
tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)

In [11]:
dataset = ConstantLengthDataset(tokenizer, data[:100], infinite=False)

In [12]:
for i in dataset:
    print(i)

index                                                          0
course_code                                          OIEO.06.046
question            What is the title of the course OIEO.06.046?
answer         The title of the course OIEO.06.046 is Private...
Name: 0, dtype: object
index                                                          1
course_code                                          OIEO.06.046
question             What is the name of the course OIEO.06.046?
answer         The name of the course OIEO.06.046 is Private ...
Name: 1, dtype: object
index                                                          2
course_code                                          OIEO.06.046
question       What is the code of the course Private Interna...
answer         The code for the course Private International ...
Name: 2, dtype: object
index                                                          3
course_code                                          OIEO.06.046
question             

In [32]:
chars_token_ratio(data[:100], tokenizer)

 25%|██▌       | 100/400 [00:00<00:00, 526.88it/s]

index                                                          0
course_code                                          OIEO.06.046
question            What is the title of the course OIEO.06.046?
answer         The title of the course OIEO.06.046 is Private...
Name: 0, dtype: object
index                                                          1
course_code                                          OIEO.06.046
question             What is the name of the course OIEO.06.046?
answer         The name of the course OIEO.06.046 is Private ...
Name: 1, dtype: object
index                                                          2
course_code                                          OIEO.06.046
question       What is the code of the course Private Interna...
answer         The code for the course Private International ...
Name: 2, dtype: object
index                                                          3
course_code                                          OIEO.06.046
question             




3.257748776508972