In [4]:
import pickle
import pandas as pd

In [5]:
import os
from tqdm import tqdm
import matplotlib.pyplot as plt

In [6]:
ROOT = "/gpfs/space/projects/stud_ml_22/NLP"
PATH_TO_CONVERTED_TOKENIZER = os.path.join(ROOT, "llama/7B_converted/")

In [4]:
with open(os.path.join(ROOT, "data/course_questions.pkl"), 'rb') as f:
    data = pickle.load(f, encoding='utf8')
data

Unnamed: 0,course_code,question,answer
0,OIEO.06.046,What is the title of the course OIEO.06.046?,The title of the course OIEO.06.046 is Private...
1,OIEO.06.046,What is the name of the course OIEO.06.046?,The name of the course OIEO.06.046 is Private ...
2,OIEO.06.046,What is the code of the course Private Interna...,The code for the course Private International ...
3,OIEO.06.046,How is the course OIEO.06.046 called?,The course OIEO.06.046 is called Private Inter...
4,OIEO.06.046,How many credits does the course OIEO.06.046 h...,The course OIEO.06.046 has 6 credits.
...,...,...,...
44,SVNC.00.273,Is SVNC.00.273 offered for bachelor's studies?,"Yes, SVNC.00.273 is offered for bachelor's stu..."
46,SVNC.00.273,Is SVNC.00.273 offered for master's studies?,"Yes, SVNC.00.273 is offered for master's studies."
48,SVNC.00.273,Is SVNC.00.273 offered for doctoral studies?,"No, SVNC.00.273 is not offered for doctoral st..."
50,SVNC.00.273,Is SVNC.00.273 offered for integrated bachelor...,"No, SVNC.00.273 is not offered for integrated ..."


In [5]:
data = data.reset_index()

In [7]:
val_data = data.sample(frac=0.05, random_state=42)
val_data.head()

Unnamed: 0,index,course_code,question,answer
19836,8,LOFY.05.051,How many credits does the course Master's Cour...,The course Master's Course in Biological Physi...
69651,13,P2PC.00.503,What is the structural unit of the course Fina...,The structural unit of the course Final Thesis...
101079,12,HVLC.03.030,What is the structural unit of the course HVLC...,The structural unit of the course HVLC.03.030 ...
43566,23,SHZU.01.015,Was the course Interviewing Techniques taught ...,The course Interviewing Techniques was tought ...
142557,3,HVLC.06.011,How is the course HVLC.06.011 called?,The course HVLC.06.011 is called Swedish for B...


In [14]:
len(train_data) / 8

17991.875

In [12]:
train_data = data.drop(val_data.index)
for i,r in train_data.iterrows():
    print(r['question'])
    print(r['answer'])
    
    if i > 200:
        break

What is the title of the course OIEO.06.046?
The title of the course OIEO.06.046 is Private International Law.
What is the name of the course OIEO.06.046?
The name of the course OIEO.06.046 is Private International Law.
What is the code of the course Private International Law?
The code for the course Private International Law is OIEO.06.046.
How is the course OIEO.06.046 called?
The course OIEO.06.046 is called Private International Law.
How many credits does the course OIEO.06.046 have?
The course OIEO.06.046 has 6 credits.
How many credits is the course OIEO.06.046 worth?
The course OIEO.06.046 is worth 6 credits.
How many credits is the course OIEO.06.046?
The course OIEO.06.046 is worth 6 credits.
How many credits can I get for the course OIEO.06.046?
You can get 6 credits for the course OIEO.06.046.
How many credits does the course Private International Law have?
The course Private International Law has 6 credits.
How many credits is the course Private International Law worth?
The

## Test dataset preparation from llama finetune tutorial

In [4]:
import torch
from torch.utils.data import IterableDataset
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
def chars_token_ratio(dataset, tokenizer, nb_examples=400):
    """
    Estimate the average number of characters per token in the dataset.
    """
    total_characters, total_tokens = 0, 0
    for _, example in tqdm(zip(range(nb_examples), dataset.iterrows()), total=nb_examples):
        text = prepare_sample_text(example[1])
        total_characters += len(text)
        if tokenizer.is_fast:
            total_tokens += len(tokenizer(text).tokens())
        else:
            total_tokens += len(tokenizer.tokenize(text))

    return total_characters / total_tokens


def prepare_sample_text(example):
    print(example)
    """Prepare the text from a sample of the dataset."""
    text = f"Question: {example['question']}\n\nAnswer: {example['answer']}"
    return text


class ConstantLengthDataset(IterableDataset):
    """
    Iterable dataset that returns constant length chunks of tokens from stream of text files.
        Args:
            tokenizer (Tokenizer): The processor used for proccessing the data.
            dataset (dataset.Dataset): Dataset with text files.
            infinite (bool): If True the iterator is reset after dataset reaches end else stops.
            seq_length (int): Length of token sequences to return.
            num_of_sequences (int): Number of token sequences to keep in buffer.
            chars_per_token (int): Number of characters per token used to estimate number of tokens in text buffer.
    """

    def __init__(
        self,
        tokenizer,
        dataset,
        infinite=False,
        seq_length=1024,
        num_of_sequences=1024,
        chars_per_token=3.6,
    ):
        self.tokenizer = tokenizer
        self.concat_token_id = tokenizer.eos_token_id if tokenizer.eos_token_id else args.eos_token_id
        self.dataset = dataset
        self.seq_length = seq_length
        self.infinite = infinite
        self.current_size = 0
        self.max_buffer_size = seq_length * chars_per_token * num_of_sequences

    def __iter__(self):
        iterator = self.dataset.iterrows()
        more_examples = True
        while more_examples:
            buffer, buffer_len = [], 0
            while True:
                if buffer_len >= self.max_buffer_size:
                    break
                try:
                    buffer.append(prepare_sample_text(next(iterator)[1]))
                    buffer_len += len(buffer[-1])
                except StopIteration:
                    if self.infinite:
                        iterator = iter(self.dataset)
                    else:
                        more_examples = False
                        break
            tokenized_inputs = self.tokenizer(buffer, truncation=False)["input_ids"]
            all_token_ids = []
            for tokenized_input in tokenized_inputs:
                all_token_ids.extend(tokenized_input + [self.concat_token_id])
            for i in range(0, len(all_token_ids), self.seq_length):
                input_ids = all_token_ids[i : i + self.seq_length]
                if len(input_ids) == self.seq_length:
                    self.current_size += 1
                    yield {
                        "input_ids": torch.LongTensor(input_ids),
                        "labels": torch.LongTensor(input_ids),
                    }


In [8]:
tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)

In [11]:
dataset = ConstantLengthDataset(tokenizer, data[:100], infinite=False)

In [12]:
for i in dataset:
    print(i)

index                                                          0
course_code                                          OIEO.06.046
question            What is the title of the course OIEO.06.046?
answer         The title of the course OIEO.06.046 is Private...
Name: 0, dtype: object
index                                                          1
course_code                                          OIEO.06.046
question             What is the name of the course OIEO.06.046?
answer         The name of the course OIEO.06.046 is Private ...
Name: 1, dtype: object
index                                                          2
course_code                                          OIEO.06.046
question       What is the code of the course Private Interna...
answer         The code for the course Private International ...
Name: 2, dtype: object
index                                                          3
course_code                                          OIEO.06.046
question             

In [32]:
chars_token_ratio(data[:100], tokenizer)

 25%|██▌       | 100/400 [00:00<00:00, 526.88it/s]

index                                                          0
course_code                                          OIEO.06.046
question            What is the title of the course OIEO.06.046?
answer         The title of the course OIEO.06.046 is Private...
Name: 0, dtype: object
index                                                          1
course_code                                          OIEO.06.046
question             What is the name of the course OIEO.06.046?
answer         The name of the course OIEO.06.046 is Private ...
Name: 1, dtype: object
index                                                          2
course_code                                          OIEO.06.046
question       What is the code of the course Private Interna...
answer         The code for the course Private International ...
Name: 2, dtype: object
index                                                          3
course_code                                          OIEO.06.046
question             




3.257748776508972

# Convert fields into sentences

In [5]:
tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)              

In [25]:
inputs = tokenizer("btw 512 here essentialy means that we include only 512 tokens from the document into the embedding, right?")
len(inputs['input_ids'])

29

In [26]:
df = pd.read_pickle(f'{ROOT}/data/course_info.pkl')
df.head()

Unnamed: 0,uuid,code,parent_uuid,parent_code,parent_credits,title_en,general_input_languages,general_structural_unit_shares,general_year.en,general_type.code,...,resources_mandatory_materials,resources_recommended_materials,resources_learning_environments,participants_lecturers,participants_assistants,schedule_entries,schedule_weeks.et,registration_info_min_students,registration_info_max_students,registration_info_audience.en
0,a198ed66-1fb5-4f7e-ee43-7fbbf5c09aca,sv-2023-spring-openuniv,b99c0bb1-efd4-9b0a-857a-3dc7114e5c19,OIEO.06.046,6.0,Private International Law,"[{'language_code': 'et', 'language_name': 'Est...","[{'code': 'SVOI04', 'name': 'Department of Pri...",2023/2024,regular,...,,,,[{'person_uuid': 'd7a3f19b-d7c7-fbe5-b41b-e5e3...,,"[{'course_week': 1, 'work_type': {'code': 'lec...",Nädalad,1.0,60.0,
1,6ee943ab-a839-a937-76c0-2e0e4daedb8b,,76162416-d608-f48f-ec5d-5c40ce9b320d,FLFI.00.016,15.0,Doctoral Seminar,"[{'language_code': 'et', 'language_name': 'Est...","[{'code': 'HVFI01', 'name': 'Department of Phi...",2023/2024,regular,...,Presentations.,Ask the supervisor.,,[{'person_uuid': '4b6a00ae-35fd-a00e-d38e-0e2f...,,"[{'work_type': {'code': 'colloquium', 'et': 'k...",24-40,1.0,,
2,7fc4f6cf-f011-91e8-4c42-abbd782a4a2a,,31c327d5-2b61-c764-b418-bda22c577265,SVNC.00.179,4.0,Pedagogical Practicum,"[{'language_code': 'et', 'language_name': 'Est...","[{'code': 'SVNC', 'name': 'Narva College', 'co...",2023/2024,practice,...,Põhikooli- ja gümnaasiumi Riiklik õppekava htt...,,,[{'person_uuid': 'e664d700-4a63-b159-794e-d0be...,,"[{'work_type': {'code': 'practice', 'et': 'pra...",24-43,15.0,46.0,
3,d72f2ef7-d264-eceb-b759-a9a66cc27593,,0e7d0b5d-83ea-f260-7e09-c3d59ea9c250,KKSB.05.092,3.0,Practice in the Work Environment,"[{'language_code': 'et', 'language_name': 'Est...","[{'code': 'MVSF', 'name': 'Institute of Sport ...",2022/2023,regular,...,,,,[{'person_uuid': '1ff846ac-79c1-ef64-3910-5131...,,"[{'time': '2023-06-20', 'work_type': {'code': ...",40-52,1.0,80.0,
4,2a69334b-ebec-b332-d5f2-984869620c04,,f465e112-e552-a3d1-5fa6-e26e661b288b,MTAT.03.242,12.0,Bioinformatics Seminar,"[{'language_code': 'et', 'language_name': 'Est...","[{'code': 'LTAT02', 'name': 'Chair of Data Sci...",2023/2024,regular,...,The readings for every topic could be found on...,The readings for every topic could be found on...,,[{'person_uuid': '643ca845-067b-f270-23fd-dafe...,,"[{'work_type': {'code': 'seminar', 'et': 'semi...",24.-39. õppenädalal,5.0,,


In [38]:
df[df["title_en"] == "Advanced Seminar in Ancient History"]["overview_description.en"]

2750    '
Name: overview_description.en, dtype: object

In [27]:
def get_len():
    lens = []
    for i, row in df.iterrows():
        d = get_document(row, column_names, column_func)
        if i == 0:
            print(d)
        lens.append(len(tokenizer(d)['input_ids']))
    return lens

In [28]:
def get_document(row, cnames=[], cfunc=[]):
    try:
        s = ''.join([cf(row[cn]) for cn, cf in zip(cnames, cfunc)])
    except Exception as e:
        print(e)
    return s

In [39]:
from collections.abc import Iterable

column_names = ["title_en", 
                "overview_objectives",
                "overview_learning_outcomes", 
                "overview_description.en",
                "general_input_languages",
                "general_structural_unit_shares",
                "general_year.en",
                'general_type.en',
                'additional_info_study_levels',
                'additional_info_hours.lecture',
                'additional_info_hours.practice',
                'additional_info_hours.seminar',
                'additional_info_hours.independent_work',
                'additional_info_hours.individual_work',
                'additional_info_hours.internship',
                'parent_credits',
                # 'participants_lecturers',
                'participants_lecturers',
                'participants_assistants'
               ]
column_func = [lambda x: f"The name of the course is {str(x)}. ", 
               lambda x: f"The purpose of the course is  {str(x[0])} ".replace('\n', '') if x and isinstance(x[0], str) and len(x[0]) > 5 else '', 
               lambda x: str(x[0]).replace('\n', '') if x and isinstance(x[0], str) and len(x[0]) > 5 else '',
               lambda x: str(x).strip("[").strip("]").replace('\n', '') if x and isinstance(x, str) else '',
               lambda x: f"Language of instruction is {str(x[0]['language_name'])}. " if x else '',
               lambda x: f"The course is offered by {str(x[0]['name'])}. " if x else '',
               lambda x: f"The course is taught in {str(x)} years. " if x else '',
               lambda x: f"The course is a {str(x)}. " if x else '',
               lambda x: f"The course is offered for the folowing study levels: {','.join([d['code']for d in x])}. " if x else '',
               lambda x: f"Number of lecture hours is {str(x)}. " if x and x != 0 else '',
               lambda x: f"Number of practice hours is {str(x)}. " if x and x != 0 else '',
               lambda x: f"Number of seminar hours is {str(x)}. " if x and x != 0 else '',
               lambda x: f"Number of independent work hours is {str(x)}. " if x and x != 0 else '',
               lambda x: f"Number of individual hours is {str(x)}. " if x and x != 0 else '',
               lambda x: f"Number of internship hours is {str(x)}. " if x and x != 0 else '',
               lambda x: f"Number of credits is {str(x)}. " if x and x != 0 else '',
               # lambda x: f"The person responsible for the course is {','.join([d['person_name'] for d in x if x['is_responsible']])}." if x and isinstance(x, Iterable) else '',
               lambda x: f"The course lecturers are: {', '.join([d['person_name'] for d in x])}. " if x and isinstance(x, Iterable) else '',
               lambda x: f"The course assistants are: {', '.join([d['person_name'] for d in x])}. " if x and isinstance(x, Iterable) and isinstance(x, dict) else '',
    ]

In [None]:
l = get_len()
plt.hist(l, bins=100)
plt.show()

In [40]:
import functools

df['all_course_info'] = df.apply(functools.partial(get_document, cnames=column_names, cfunc=column_func), axis=1)

df["title_en"] = df["title_en"].astype("str")
df[['title_en', 'all_course_info']].to_csv(os.path.join(ROOT, 'data/courses_info_all.csv'), index=False)

In [None]:
df[""]

In [None]:
df.head()

In [77]:
sum(df['overview_notes.en'].isna())

2934

In [123]:
df['participants_lecturers'][100]

[{'person_uuid': 'd4ea22ef-39bc-373e-9f54-23be334c1f5f',
  'person_name': 'Heli Nurme',
  'structural_unit': {'code': 'HVVK16',
   'name': {'en': 'Academic Affairs Specialists', 'et': 'Õppekorraldajad'},
   'address': 'Posti 1\n71004 Viljandi linn, \nViljandimaa \nEST',
   'city': 'Viljandi linn',
   'street': 'Posti 1',
   'zip': '71004',
   'level': 3,
   'academic': False,
   'supports_continuous_learning': False,
   'parent_code': 'HVVK'},
  'is_responsible': False,
  'is_teaching': True},
 {'person_uuid': '9d3d8dd7-3fbd-772f-9eb7-df07f0e58fdf',
  'person_name': 'Jorma Sarv',
  'structural_unit': {'code': 'HVVK02',
   'name': {'en': 'Academic Staff', 'et': 'Akadeemilised töötajad'},
   'address': 'Posti 1\n71004 Viljandi linn, \nViljandimaa \nEST',
   'city': 'Viljandi linn',
   'street': 'Posti 1',
   'zip': '71004',
   'level': 3,
   'academic': False,
   'supports_continuous_learning': False,
   'parent_code': 'HVVK'},
  'is_responsible': True,
  'is_teaching': False},
 {'person

In [126]:
df['participants_assistants'][900]

nan

In [156]:
df['resources_is_moodle_connected'].value_counts()

False    2069
True      870
Name: resources_is_moodle_connected, dtype: int64

In [86]:
get_d_len('additional_info_study_levels')

[1,
 1,
 2,
 1,
 2,
 1,
 2,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 5,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 0,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 2,
 4,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 1,
 2,
 1,
 1,
 1,
 5,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 2,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 2,
 2,
 1,
 1,
 5,
 2,
 2,
 1,
 1,
 0,
 1,
 3,
 1,
 1,
 1,
 1,
 0,
 0,
 2,
 1,
 1,
 1,
 3,
 1,
 2,
 1,
 1,
 1,
 3,
 0,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 2,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 2,
 4,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 2,
 4,
 1,
 1,
 0,
 1,
 1,
 1,
 2,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 3,
 1,
 1,
 1,
 1,
 2,
 1,
 3,
 1,
 1,
 1,
 2,
 2,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 3,
 2,
 3,
 3,
 1,
 5,
 1,
 1,
 5,
 5,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 1,
 1,
 1,
 1,
 1,
 4,
 2,
 1,
 1,
 2,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 5,
 1,


In [84]:
def get_d_len(cn):
    l = []
    for i, r in df.iterrows():
        l.append(len(r[cn]))
    return l

overview_notes.en - nan


In [9]:
for c, dt in zip(df.columns, df.dtypes):
    print(c + ": " + str(dt))

uuid: object
code: object
parent_uuid: object
parent_code: object
parent_credits: float64
title_en: object
general_input_languages: object
general_structural_unit_shares: object
general_year.en: object
general_type.code: object
general_type.en: object
additional_info_is_vota_course: bool
additional_info_is_continuous_learning_course: bool
additional_info_study_levels: object
additional_info_is_enlight_course: object
additional_info_hours.lecture: float64
additional_info_hours.practice: int64
additional_info_hours.seminar: int64
additional_info_hours.independent_work: float64
additional_info_hours.individual_work: float64
additional_info_hours.internship: float64
additional_info_language_training.language.code: object
additional_info_language_training.language.en: object
additional_info_language_training.start_level.code: object
additional_info_language_training.end_level.code: object
overview_study_languages: object
overview_other_languages: object
overview_objectives: object
overview_

In [12]:
df["general_input_languages"][60]

[{'language_code': 'et', 'language_name': 'Estonian'}]