In [1]:
import pandas as pd
from tqdm.notebook import tqdm
import numpy as np

In [2]:
shared_dir = '/gpfs/space/projects/stud_ml_22/NLP'

In [3]:
course_info_df = pd.read_pickle(f'{shared_dir}/data/course_info.pkl')

In [4]:
from collections.abc import Iterable

column_names = ["title_en", 
                'parent_code',
                "overview_objectives",
                "overview_learning_outcomes", 
                "overview_description.en",
                "overview_study_languages",
                "general_structural_unit_shares",
                "general_year.en",
                'general_type.en',
                'additional_info_study_levels',
                'additional_info_hours.lecture',
                'additional_info_hours.practice',
                'additional_info_hours.seminar',
                'additional_info_hours.independent_work',
                'additional_info_hours.individual_work',
                'additional_info_hours.internship',
                'parent_credits',
                'participants_lecturers',
                'participants_assistants',
                'additional_info_is_vota_course',
                'target_faculty.name.en',
                'target_faculty.address',
                'grading_independent_work_assessments',
                'grading_grade_evaluation.et',
                'grading_assessment_scale.en',
                'grading_independent_work.et',
               ]
column_func = [lambda x: f"The name of the course is {str(x)}. ", 
               lambda x: f'The code of it is {str(x)}.' if x else '',
               lambda x: f"The purpose of the course is  {str(x[0])} ".replace('\n', '') if x and isinstance(x[0], str) and len(x[0]) > 5 else '', 
               lambda x: str(x[0]).replace('\n', '') if x and isinstance(x[0], str) and len(x[0]) > 5 else '',
               lambda x: str(x).strip("[").strip("]").replace('\n', '') if x and isinstance(x, str) else '',
               lambda x: f"Language of instruction is {str(x[0]['name'])}. " if x else '',
               lambda x: f"The course is offered by {str(x[0]['name'])}. " if x else '',
               lambda x: f"The course is taught in {str(x)} years. " if x else '',
               lambda x: f"The course is a {str(x)}. " if x else '',
               lambda x: f"The course is offered for the folowing study levels: {','.join([d['code']for d in x])}. " if x else '',
               lambda x: f"Number of lecture hours is {str(x)}. " if x and x != 0 else '',
               lambda x: f"Number of practice hours is {str(x)}. " if x and x != 0 else '',
               lambda x: f"Number of seminar hours is {str(x)}. " if x and x != 0 else '',
               lambda x: f"Number of independent work hours is {str(x)}. " if x and x != 0 else '',
               lambda x: f"Number of individual hours is {str(x)}. " if x and x != 0 else '',
               lambda x: f"Number of internship hours is {str(x)}. " if x and x != 0 else '',
               lambda x: f"Number of credits is {str(x)}. " if x and x != 0 else '',
               lambda x: f"The course lecturers are: {', '.join([d['person_name'] for d in x])}. " if x and isinstance(x, Iterable) else '',
               lambda x: f"The course assistants are: {', '.join([d['person_name'] for d in x])}. " if x and isinstance(x, Iterable) and isinstance(x, dict) else '',
               lambda x: f'Prior learning can be recognised for this course. ' if x else 'Prior learning can\'t be recognised for this course. ',
               lambda x: f'The course is offered by {x}. ' if x else '',
               lambda x: f'The faculty is located in {x}. ' if x else '',
               lambda x: f'The graded tasks for the students include {", ".join([item["work_type_name"] for item in x])}.' if x else '',
               lambda x: f'{x}. ',
               lambda x: f'The grading is {x}. ' if x else '',
               lambda x: f'The individual work includes {x}. ' if x else '',
    ]

In [5]:
def get_document(row, cnames=[], cfunc=[]):
    try:
        s = ''.join([cf(row[cn]) for cn, cf in zip(cnames, cfunc)])
    except Exception as e:
        print(e)
    return s

In [6]:
import functools

course_info_df['all_course_info'] = course_info_df.apply(functools.partial(get_document, cnames=column_names, cfunc=column_func), axis=1)

course_info_df["title_en"] = course_info_df["title_en"].astype("str")

In [8]:
course_info_df[course_info_df['parent_code'] == 'LTAT.01.001'].iloc[0].to_dict()

{'uuid': '64a12f89-bbf0-532f-c768-3cee36b7c7d2',
 'code': nan,
 'parent_uuid': 'b68ce1ed-0947-b1f8-3543-28350e98a007',
 'parent_code': 'LTAT.01.001',
 'parent_credits': 6.0,
 'title_en': 'Natural Language Processing',
 'general_input_languages': [{'language_code': 'et',
   'language_name': 'Estonian'}],
 'general_structural_unit_shares': [{'code': 'LTAT01',
   'name': 'Chair of Natural Language Processing',
   'course_share_percent': 100,
   'coordinating': True}],
 'general_year.en': '2023/2024',
 'general_type.code': 'regular',
 'general_type.en': 'Regular course',
 'additional_info_is_vota_course': True,
 'additional_info_is_continuous_learning_course': False,
 'additional_info_study_levels': [{'code': 'master',
   'title': "master's studies"},
  {'code': 'doctoral', 'title': 'doctoral studies'}],
 'additional_info_is_enlight_course': nan,
 'additional_info_hours.lecture': 20.0,
 'additional_info_hours.practice': 28,
 'additional_info_hours.seminar': 12,
 'additional_info_hours.inde

In [98]:
course_info_df[['title_en', 'all_course_info']].to_csv(f'{shared_dir}/data/new_documents.csv', index=False)

In [45]:
documents = pd.DataFrame(course_info_df[['all_course_info']].values, columns=['text'])

In [46]:
documents

Unnamed: 0,text
0,The name of the course is Private Internationa...
1,The name of the course is Doctoral Seminar. Th...
2,The name of the course is Pedagogical Practicu...
3,The name of the course is Practice in the Work...
4,The name of the course is Bioinformatics Semin...
...,...
2934,The name of the course is Master's Thesis. The...
2935,The name of the course is Practical Russian. T...
2936,The name of the course is Security Perspective...
2937,The name of the course is Graduation Exam. The...


In [16]:
import json

In [None]:
exploded_lecturers = course_info_df.explode('participants_lecturers')
exploded_lecturers['participants_lecturers'] = exploded_lecturers['participants_lecturers'].apply(lambda x: x['person_name'] if type(x) == dict else '')
exploded_lecturers = exploded_lecturers.groupby('participants_lecturers').agg({'title_en': list}).reset_index()

In [None]:
exploded_lecturers[exploded_lecturers['participants_lecturers'] == 'Dmytro Fishman']

In [68]:
len(exploded_lecturers)

2779

In [69]:
exploded_lecturers = exploded_lecturers[exploded_lecturers['participants_lecturers'] != '']

In [70]:
len(exploded_lecturers)

2778

In [71]:
exploded_lecturers.head()

Unnamed: 0,participants_lecturers,title_en
1,Aare Abroi,[Virology]
2,Aare Luts,"[Advanced Course of Atmospheric Sciences, Prac..."
3,Aare Märtson,"[Traumatology and Orthopaedics, Traumatology a..."
4,Aare Ristikivi,[Basics and Methodology of Teaching Social Sci...
5,Aarne Kasikov,[Testing and Investigation Methods in Material...


In [72]:
exploded_lecturers['title_en'] = exploded_lecturers['title_en'].apply(lambda x: ', '.join(x))

In [73]:
exploded_lecturers['document'] = exploded_lecturers["participants_lecturers"].astype("str") + ' teaches ' + exploded_lecturers["title_en"].astype("str")

In [74]:
exploded_lecturers.head()

Unnamed: 0,participants_lecturers,title_en,document
1,Aare Abroi,Virology,Aare Abroi teaches Virology
2,Aare Luts,"Advanced Course of Atmospheric Sciences, Pract...",Aare Luts teaches Advanced Course of Atmospher...
3,Aare Märtson,"Traumatology and Orthopaedics, Traumatology an...",Aare Märtson teaches Traumatology and Orthopae...
4,Aare Ristikivi,Basics and Methodology of Teaching Social Scie...,Aare Ristikivi teaches Basics and Methodology ...
5,Aarne Kasikov,Testing and Investigation Methods in Materials...,Aarne Kasikov teaches Testing and Investigatio...


In [76]:
exploded_lecturers[exploded_lecturers['participants_lecturers'] == 'Dmytro Fishman']['document'].values[0]

'Dmytro Fishman teaches Didactic Practice, Special Course in Machine Learning'

In [77]:
documents = documents.append(pd.DataFrame(exploded_lecturers['document'].values, columns=['text']))

  documents = documents.append(pd.DataFrame(exploded_lecturers['document'].values, columns=['text']))


In [80]:
documents.to_csv(f'{shared_dir}/data/documents_with_professors.csv', index=False)