In [205]:
import pandas as pd
from tqdm import tqdm
import numpy as np

In [206]:
# Load the course metadata table from a pickle file in the working directory.
# NOTE(review): unpickling can execute arbitrary code, so only load this file
# if it comes from a trusted source.
course_info_df = pd.read_pickle('course_info.pkl')

Questionable fields:
* general_input_languages


In [207]:
# Show the first row whose parent_code is LTAT.01.001 as a plain dict, to see
# which fields a course record carries.
course_info_df.loc[course_info_df['parent_code'].eq('LTAT.01.001')].iloc[0].to_dict()

{'uuid': '64a12f89-bbf0-532f-c768-3cee36b7c7d2',
 'code': nan,
 'parent_uuid': 'b68ce1ed-0947-b1f8-3543-28350e98a007',
 'parent_code': 'LTAT.01.001',
 'parent_credits': 6.0,
 'title_en': 'Natural Language Processing',
 'general_input_languages': [{'language_code': 'et',
   'language_name': 'Estonian'}],
 'general_structural_unit_shares': [{'code': 'LTAT01',
   'name': 'Chair of Natural Language Processing',
   'course_share_percent': 100,
   'coordinating': True}],
 'general_year.en': '2023/2024',
 'general_type.code': 'regular',
 'general_type.en': 'Regular course',
 'additional_info_is_vota_course': True,
 'additional_info_is_continuous_learning_course': False,
 'additional_info_study_levels': [{'code': 'master',
   'title': "master's studies"},
  {'code': 'doctoral', 'title': 'doctoral studies'}],
 'additional_info_is_enlight_course': nan,
 'additional_info_hours.lecture': 20.0,
 'additional_info_hours.practice': 28,
 'additional_info_hours.seminar': 12,
 'additional_info_hours.inde

In [208]:
# Count how often each value of the ENLIGHT flag occurs. value_counts() leaves
# missing (NaN) entries out by default, so the counts need not add up to the
# number of rows.
course_info_df['additional_info_is_enlight_course'].value_counts()

False    449
True      83
Name: additional_info_is_enlight_course, dtype: int64

In [209]:
class CourseQuestion:
    """One generated question/answer pair, tagged with the course it is about."""

    def __init__(self, course_code, question, answer):
        # Plain value holder: the three fields are stored exactly as passed in.
        self.course_code, self.question, self.answer = course_code, question, answer

    def to_dict(self):
        """Return the pair as a dict with keys course_code, question and answer."""
        return dict(
            course_code=self.course_code,
            question=self.question,
            answer=self.answer,
        )

In [210]:
def generate_title_code_questions(course_info):
    """Build the questions that map a course code to its title and back.

    Reads ``parent_code`` and ``title_en`` from ``course_info`` and returns a
    list of four ``CourseQuestion`` objects, each tagged with the course code.
    """
    code = course_info['parent_code']
    title = course_info['title_en']

    # (question, answer) pairs, listed in the order they are emitted.
    qa_pairs = (
        (f'What is the title of the course {code}?', f'The title of the course {code} is {title}.'),
        (f'What is the name of the course {code}?', f'The name of the course {code} is {title}.'),
        (f'What is the code of the course {title}?', f'The code for the course {title} is {code}.'),
        (f'How is the course {code} called?', f'The course {code} is called {title}.'),
    )
    return [CourseQuestion(code, question, answer) for question, answer in qa_pairs]

def generate_credits_questions(course_info):
    """Build question/answer pairs about how many credits a course is worth.

    Reads ``parent_code``, ``title_en`` and ``parent_credits`` from
    ``course_info``.

    Returns:
        A list of eight ``CourseQuestion`` objects: four phrasings asked by
        course code, followed by the same four asked by course title. An empty
        list is returned when the credit value is missing.
    """
    raw_credits = course_info['parent_credits']
    if pd.isna(raw_credits):
        # int(NaN) raises ValueError; without a credit value there is nothing to ask.
        return []

    credit_value = float(raw_credits)
    # Whole numbers are shown without a decimal part (6.0 -> 6). Fractional
    # values are kept as they are (4.5 stays 4.5) rather than being truncated
    # by int(), which would state the wrong number of credits.
    credit_num = int(credit_value) if credit_value.is_integer() else credit_value

    code = course_info['parent_code']
    title = course_info['title_en']

    questions = []
    # All code-based questions come first, then the title-based ones.
    for subject in (code, title):
        questions.extend([
            CourseQuestion(code, f'How many credits does the course {subject} have?', f'The course {subject} has {credit_num} credits.'),
            CourseQuestion(code, f'How many credits is the course {subject} worth?', f'The course {subject} is worth {credit_num} credits.'),
            CourseQuestion(code, f'How many credits is the course {subject}?', f'The course {subject} is worth {credit_num} credits.'),
            CourseQuestion(code, f'How many credits can I get for the course {subject}?', f'You can get {credit_num} credits for the course {subject}.'),
        ])

    return questions

def generate_structural_unit_questions(course_info):
    """Build question/answer pairs about the structural unit of a course.

    Uses the first entry of ``general_structural_unit_shares`` (its ``name``
    and ``code``) together with ``parent_code`` and ``title_en``.

    Returns:
        A list of six ``CourseQuestion`` objects: three phrasings, each asked
        by course code and then by course title. An empty list is returned
        when the course has no structural unit entries.
    """
    unit_shares = course_info['general_structural_unit_shares']
    try:
        has_units = len(unit_shares) > 0
    except TypeError:
        # A missing value (NaN / None) has no length.
        has_units = False
    if not has_units:
        # Indexing [0] would raise here; there is no unit to ask about.
        return []

    structural_unit_name = unit_shares[0]['name']
    structural_unit_code = unit_shares[0]['code']

    code = course_info['parent_code']
    title = course_info['title_en']

    def answer_for(subject):
        # Every phrasing of the question shares the same answer sentence.
        return f'The structural unit of the course {subject} is {structural_unit_name} ({structural_unit_code}).'

    questions = []
    for subject in (code, title):
        questions.append(CourseQuestion(code, f'What is the structural unit of the course {subject}?', answer_for(subject)))
    for subject in (code, title):
        questions.append(CourseQuestion(code, f'What structural unit does the course {subject} belong to?', answer_for(subject)))
    for subject in (code, title):
        # Spelling fixed: the original emitted "departament".
        questions.append(CourseQuestion(code, f'Which department does the course {subject} belong to?', answer_for(subject)))

    return questions

def generate_year_questions(course_info):
    """Build question/answer pairs about the academic year a course is taught in.

    Reads ``parent_code``, ``title_en`` and ``general_year.en`` from
    ``course_info``.

    Returns:
        A list of eight ``CourseQuestion`` objects: four phrasings, each asked
        by course code and then by course title.
    """
    code = course_info['parent_code']
    title = course_info['title_en']
    year = course_info['general_year.en']
    subjects = (code, title)

    # Spelling fixed in every answer: the original emitted "tought".
    questions = []
    for subject in subjects:
        questions.append(CourseQuestion(code, f'What year is the course {subject} taught?', f'The course {subject} is taught in the {year} academic year.'))
    for subject in subjects:
        questions.append(CourseQuestion(code, f'When was the course {subject} last taught?', f'The course {subject} was taught in the {year} academic year.'))
    for subject in subjects:
        questions.append(CourseQuestion(code, f'Was the course {subject} taught last year?', f'The course {subject} was taught in the {year} academic year.'))
    for subject in subjects:
        # The lower-case opening of this question is kept exactly as in the original.
        questions.append(CourseQuestion(code, f'was the course {subject} taught in the academic year of {year}?', f'Yes, the course {subject} was taught in the {year} academic year.'))

    return questions

def generate_course_type_question(course_info, course_types):
    """Build question/answer pairs about what type of course this is.

    Args:
        course_info: Course record; ``general_type.en``, ``parent_code`` and
            ``title_en`` are read from it.
        course_types: All course type labels; each one that differs
            (case-insensitively) from this course's type yields a pair of
            "No, ..." questions.

    Returns:
        A list of ``CourseQuestion`` objects: four positive questions followed
        by two negative questions per other course type.
    """
    given_course_type = course_info['general_type.en'].lower()

    # Every other type, lower-cased, in the order given by the caller.
    other_course_types = []
    for candidate in course_types:
        lowered = candidate.lower()
        if lowered != given_course_type:
            other_course_types.append(lowered)

    code = course_info['parent_code']
    title = course_info['title_en']
    subjects = (code, title)

    questions = [
        CourseQuestion(code, f'What kind of course is {subject}?', f'The course {subject} is a {given_course_type}.')
        for subject in subjects
    ]
    questions += [
        CourseQuestion(code, f'Is the course {subject} a {given_course_type}?', f'Yes, the course {subject} is a {given_course_type}.')
        for subject in subjects
    ]
    # Negative examples: ask about each type the course does not have.
    questions += [
        CourseQuestion(code, f'Is {subject} a {other_type}?', f'No, the course {subject} is a {given_course_type}.')
        for other_type in other_course_types
        for subject in subjects
    ]

    return questions

def generate_rpl_questions(course_info):
    """Build question/answer pairs about recognition of prior learning.

    The answer is driven by the ``additional_info_is_vota_course`` flag of
    ``course_info``; ``parent_code`` and ``title_en`` are used in the text.

    Returns:
        A list of four ``CourseQuestion`` objects: two phrasings, each asked
        by course code and then by course title. An empty list is returned
        when the flag is missing.
    """
    vota_flag = course_info['additional_info_is_vota_course']
    if pd.isna(vota_flag):
        # bool(NaN) is True, so a missing flag would otherwise be reported as
        # "Yes". With no data, emit no questions instead of a made-up answer.
        return []
    supports_rpl = bool(vota_flag)

    code = course_info['parent_code']
    title = course_info['title_en']
    subjects = (code, title)

    def answer_for(subject):
        # Both phrasings of the question share the same answer sentence.
        if supports_rpl:
            return f'Yes, prior learning can be recognised for the course {subject}.'
        return f'No, unfortunately prior learning cannot be recognised for the course {subject}.'

    questions = []
    for subject in subjects:
        questions.append(CourseQuestion(code, f'Can prior learning be recognised for the course {subject}?', answer_for(subject)))
    for subject in subjects:
        questions.append(CourseQuestion(code, f'Can my previous experience be recognized for the course {subject}?', answer_for(subject)))

    return questions
        
def generate_continuous_education_questions(course_info):
    """Build question/answer pairs about continuing-education availability.

    The answer is driven by the ``additional_info_is_continuous_learning_course``
    flag of ``course_info``; ``parent_code`` and ``title_en`` are used in the
    text.

    Returns:
        A list of two ``CourseQuestion`` objects (asked by course code, then
        by course title). An empty list is returned when the flag is missing.
    """
    continuous_flag = course_info['additional_info_is_continuous_learning_course']
    if pd.isna(continuous_flag):
        # bool(NaN) is True, so a missing flag would otherwise be reported as
        # "Yes". With no data, emit no questions instead of a made-up answer.
        return []
    supports_continuous_education = bool(continuous_flag)

    code = course_info['parent_code']
    title = course_info['title_en']

    questions = []
    for subject in (code, title):
        if supports_continuous_education:
            answer = f'Yes, the course {subject} can be taken as continuous education.'
        else:
            answer = f'No, unfortunately the course {subject} cannot be taken as continuous education.'
        questions.append(CourseQuestion(code, f'Can the course {subject} be taken by continuing education learners?', answer))

    return questions
        
def generate_study_levels_questions(course_info, all_study_levels):
    """Build question/answer pairs about the study levels a course is offered for.

    Args:
        course_info: Course record; the ``title`` of each entry in
            ``additional_info_study_levels`` is used, along with
            ``parent_code`` and ``title_en``.
        all_study_levels: Every study level title to ask about; the ones this
            course does not list yield "No, ..." questions.

    Returns:
        A list of ``CourseQuestion`` objects, or an empty list when the course
        lists no study levels.
    """
    given_study_levels = []
    for level in course_info['additional_info_study_levels']:
        given_study_levels.append(level['title'])
    if not given_study_levels:
        # No study levels data available
        return []

    missing_study_levels = []
    for candidate in all_study_levels:
        if candidate not in given_study_levels:
            missing_study_levels.append(candidate)

    code = course_info['parent_code']
    title = course_info['title_en']
    subjects = (code, title)

    # Overview questions listing every offered level at once.
    questions = [
        CourseQuestion(code, f'What levels of study is the course {subject} offered for?', f'The course {subject} is offered for the following levels of study: {", ".join(given_study_levels)}.')
        for subject in subjects
    ]
    # One yes/no question per level: positives first, then negatives.
    questions.extend(
        CourseQuestion(code, f'Is {subject} offered for {study_level}?', f'Yes, {subject} is offered for {study_level}.')
        for study_level in given_study_levels
        for subject in subjects
    )
    questions.extend(
        CourseQuestion(code, f'Is {subject} offered for {study_level}?', f'No, {subject} is not offered for {study_level}.')
        for study_level in missing_study_levels
        for subject in subjects
    )

    return questions
    

In [211]:
class CourseGenerationData:
    """Bundle of the course type and study level lists passed to the generators."""

    def __init__(self, course_types, study_levels):
        # Both arguments are stored as given, without copying or validation.
        self.course_types, self.study_levels = course_types, study_levels

In [212]:
# Distinct course type labels found in the data.
course_types = course_info_df['general_type.en'].unique().tolist()

# Collect the title of every study-level entry across all courses in one pass,
# then reduce to the distinct titles (np.unique also returns them sorted).
study_levels = np.unique([
    level['title']
    for course_levels in course_info_df['additional_info_study_levels']
    for level in course_levels
]).tolist()

generation_data = CourseGenerationData(course_types=course_types, study_levels=study_levels)

In [213]:
def generate_questions(course_info, generation_data):
    """Run every question generator for one course and collect the results.

    Args:
        course_info: A single course record.
        generation_data: Object providing ``course_types`` and ``study_levels``
            for the generators that need the dataset-wide value lists.

    Returns:
        A DataFrame built from each generated question's ``to_dict()`` output.
    """
    # The generators run in a fixed order, which is also the row order of the result.
    generated = (
        generate_title_code_questions(course_info)
        + generate_credits_questions(course_info)
        + generate_structural_unit_questions(course_info)
        + generate_year_questions(course_info)
        + generate_course_type_question(course_info, generation_data.course_types)
        + generate_rpl_questions(course_info)
        + generate_continuous_education_questions(course_info)
        + generate_study_levels_questions(course_info, generation_data.study_levels)
    )

    records = [item.to_dict() for item in generated]
    return pd.DataFrame.from_records(records)

In [214]:
# Generate one question frame per course row; tqdm shows progress.
question_dataframes = [
    generate_questions(course_info_df.iloc[row_position], generation_data)
    for row_position in tqdm(range(len(course_info_df)))
]

# Stack the per-course frames. The index is not reset, so row labels repeat
# across courses.
question_df = pd.concat(question_dataframes)
question_df.describe()


100%|██████████| 2939/2939 [00:07<00:00, 403.72it/s]


Unnamed: 0,course_code,question,answer
count,157710,157710,157710
unique,2939,149657,111994
top,OIEO.06.046,Is Master's Thesis offered for integrated bach...,"No, the course Master's Thesis is a course wit..."
freq,54,35,102


In [215]:
# Write the combined question/answer frame to a pickle file in the working directory.
question_df.to_pickle('course_questions.pkl')