In [1]:
import os

# Make <project root>/model/src the working directory so that `libs.*` imports
# and the relative data paths used later in the notebook resolve.
#
# The project root is located by walking up from the current directory until a
# folder containing model/src is found. Unlike an unconditional "two levels up"
# chdir, this yields the same directory when the cell is re-run after the
# working directory has already been changed to model/src.
search_dir = os.getcwd()
model_dir = None
while True:
    candidate = os.path.join(search_dir, "model", "src")
    if os.path.isdir(candidate):
        model_dir = candidate
        break
    parent_dir = os.path.dirname(search_dir)
    if parent_dir == search_dir:  # reached the filesystem root without a match
        break
    search_dir = parent_dir

if model_dir is not None:
    os.chdir(model_dir)
else:
    # Fallback: keep the previous behaviour (move two levels up) when no
    # model/src folder exists in any parent directory.
    os.chdir(os.path.abspath(os.path.join("..", "..")))
    model_dir = os.path.join(os.getcwd(), "model", "src")

print("Current Directory:", os.getcwd())

Current Directory: c:\Users\1176153\Downloads\github\Thesis\model\src


In [2]:
import pandas as pd
import re
import libs.data_understanding as du
import libs.data_preparation as dp
from libs.settings import data_catalog as dc
from libs import data_handeling as dh

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\1176153\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# Load the raw bachelor's-programme documents from the pickle registered in the
# data catalog as BACHELORS_DATA_RAW. Per the output below, the result is a dict
# keyed by file name whose values hold the scraped page under a 'text' key.
bachelor_data_raw = dh.load_pickle_to_dict(dc.BACHELORS_DATA_RAW)
# NOTE(review): displaying the whole dictionary prints every document in full;
# consider showing only the keys or a short slice.
bachelor_data_raw

{'bachelor_data-science_main_course_text.txt': {'text': "Text from https://www.novaims.unl.pt/en/education/programs/bachelor-s-degrees/data-science/:\nData Science\nDegree in\nData Science\nen\nEducation\nPrograms\nBachelor's Degrees\nData Science\nIn the Bachelor´s Degree in Data Science, students learn the most modern techniques of artificial intelligence and machine learning to analyze large volumes of data (Big Data).\nThey will become true data scientists - considered the sexiest profession of the 21\nst\ncentury by the Harvard Business Review.\nThe main objective of this course is to train future professionals capable of understanding, developing and using models, algorithms and the most advanced techniques in data science, to analyze and extract knowledge from Big Data.\nThe 3\nrd\nphase of applications under the International Student Statute for the 2025/26 academic year are open from February 26\nth\nto March 27\nth\n, 2025.\nDuration\n3 years (6 semesters)\nTimetable\nDaytime

# Cleaning Teaching Staff files

## Deleting words

In [4]:
# Print the raw text of the teaching_staff documents (document name, course
# name, document type and text content, as shown in the output below) to spot
# boilerplate that should be removed.
dh.print_text_context_from_program_dicts(
    data=bachelor_data_raw,
    doc_types_to_include=["teaching_staff"]
)


--- Document: bachelor_data-science_teaching-staff_text.txt ---
Course Name: Data Science
Document Type: teaching_staff

Text Content:
Text from https://www.novaims.unl.pt/en/education/programs/bachelor-s-degrees/data-science/teaching-staff/#:
Teaching Staff
Teaching Staff
en
Education
Programs
Bachelor's Degrees
Data Science
Teaching Staff
Teaching Staff
Teaching Staff
Apply here
Teaching Staff
Américo Rio
Invited Assistant Professor
americo.rio@novaims.unl.pt
Know more
Ana Cristina Costa
Associate Professor
cristina@novaims.unl.pt
Know more
Artur Varanda
Adjunct Lecturer
avaranda@novaims.unl.pt
Know more
Augusto Santos
Assistant Professor
ajrsantos@novaims.unl.pt
Know more
Bruno Damásio
Assistant Professor
bdamasio@novaims.unl.pt
Know more
Carina Albuquerque
Assistant Professor
calbuquerque@novaims.unl.pt
Know more
Carolina Maria Shaul
Adjunct Lecturer
cshaul@novaims.unl.pt
Know more
Carolina Santos Maximiano
Adjunct Lecturer
cmaximiano@novaims.unl.pt
Know more
Carolina Vasconcelos


In [5]:
# Clean the teaching_staff documents: drop the navigation boilerplate
# ("Know more", "Apply here", "Education") and collapse the repeated
# "Teaching Staff" headings.
# NOTE(review): in the cleaned output the source URL reads ".../en//programs/...",
# i.e. "education" was also stripped from inside the URL. The removal appears to
# be case-insensitive and not restricted to whole words - confirm in
# dp.clean_text_documents and exclude URLs if that is unintended.
# NOTE(review): "know", "more", "apply" and "here" are passed as separate words,
# so they are presumably removed wherever they occur, not only as the phrases
# "Know more" / "Apply here" - verify against the helper.
cleaned_bs_teachingstaff = dp.clean_text_documents(
    data=bachelor_data_raw,
    doc_types_to_include=["teaching_staff"],
    words_to_remove=["know", "more", "apply", "here", "Education"], 
    words_to_deduplicate=["Teaching Staff"]
)

In [16]:
# Inspect the cleaned dictionary (file name -> {'text': ...}, per the output below).
# NOTE(review): this cell has execution count 16 although it sits between cells
# 5 and 6, so it was run out of order; it also prints every document in full.
# Re-run with Restart & Run All and consider displaying only a short slice.
cleaned_bs_teachingstaff

{'bachelor_data-science_teaching-staff_text.txt': {'text': "Text from https://www.novaims.unl.pt/en//programs/bachelor-s-degrees/data-science/teaching-staff/#: Teaching Staff en Programs Bachelor's Degrees Data Science Teaching Staff Américo Rio Invited Assistant Professor americo.rio@novaims.unl.pt Ana Cristina Costa Associate Professor cristina@novaims.unl.pt Artur Varanda Adjunct Lecturer avaranda@novaims.unl.pt Augusto Santos Assistant Professor ajrsantos@novaims.unl.pt Bruno Damásio Assistant Professor bdamasio@novaims.unl.pt Carina Albuquerque Assistant Professor calbuquerque@novaims.unl.pt Carolina Maria Shaul Adjunct Lecturer cshaul@novaims.unl.pt Carolina Santos Maximiano Adjunct Lecturer cmaximiano@novaims.unl.pt Carolina Vasconcelos Invited Teaching Assistant cvasconcelos@novaims.unl.pt Catarina Neves Assistant Professor cneves@novaims.unl.pt Catarina Palha Invited Teaching Assistant cpalha@novaims.unl.pt Dhruv Akshay Pandit Invited Teaching Assistant dpandit@novaims.unl.pt 

In [6]:
# Print the cleaned teaching_staff text for the "Data Science" course only, to
# check the effect of the cleaning step on a single document.
dh.print_text_context_from_program_dicts(
    data=cleaned_bs_teachingstaff,
    course_names_to_include=["Data Science"],
    doc_types_to_include=["teaching_staff"]
)


--- Document: bachelor_data-science_teaching-staff_text.txt ---
Course Name: Data Science
Document Type: teaching_staff

Text Content:
Text from https://www.novaims.unl.pt/en//programs/bachelor-s-degrees/data-science/teaching-staff/#: Teaching Staff en Programs Bachelor's Degrees Data Science Teaching Staff Américo Rio Invited Assistant Professor americo.rio@novaims.unl.pt Ana Cristina Costa Associate Professor cristina@novaims.unl.pt Artur Varanda Adjunct Lecturer avaranda@novaims.unl.pt Augusto Santos Assistant Professor ajrsantos@novaims.unl.pt Bruno Damásio Assistant Professor bdamasio@novaims.unl.pt Carina Albuquerque Assistant Professor calbuquerque@novaims.unl.pt Carolina Maria Shaul Adjunct Lecturer cshaul@novaims.unl.pt Carolina Santos Maximiano Adjunct Lecturer cmaximiano@novaims.unl.pt Carolina Vasconcelos Invited Teaching Assistant cvasconcelos@novaims.unl.pt Catarina Neves Assistant Professor cneves@novaims.unl.pt Catarina Palha Invited Teaching Assistant cpalha@novaims.u

## Teaching staff stats after cleaning

In [7]:
# Token-count histogram over the cleaned teaching_staff documents.
# (Description inferred from the helper's name; its implementation and output
# are not visible here - confirm against libs.data_understanding.)
du.histogram_token_count_multiple_docs(
    cleaned_bs_teachingstaff,
    doc_types_to_include=["teaching_staff"]
)

In [8]:
# Per-document statistics based on word counts for the cleaned teaching_staff
# documents. (Inferred from the helper's name - confirm against
# libs.data_understanding.)
du.generate_document_statistics_by_word_count(
    cleaned_bs_teachingstaff,
    doc_types_to_include=["teaching_staff"]
)

In [9]:
# Per-document statistics based on token counts for the cleaned teaching_staff
# documents. (Inferred from the helper's name; which tokenizer is used is not
# visible here - confirm against libs.data_understanding.)
du.generate_document_statistics_by_tokens(
    cleaned_bs_teachingstaff,
    doc_types_to_include=["teaching_staff"]
)

In [10]:
# Bar plot of word frequencies in the cleaned teaching_staff documents, limited
# via top_n=20. (Presumably the 20 most frequent words - confirm against
# libs.data_understanding.)
du.bar_plot_word_frequency(
    cleaned_bs_teachingstaff,
    doc_types_to_include=["teaching_staff"],
    top_n=20
)

In [12]:
# Bar plot of n-gram frequencies in the cleaned teaching_staff documents with
# n=2 and top_n=20. (Presumably the 20 most frequent bigrams - confirm against
# libs.data_understanding.)
du.bar_plot_ngram_frequency(
    cleaned_bs_teachingstaff,
    doc_types_to_include=["teaching_staff"],
    n=2, 
    top_n=20
)

# Analysing the results

In [13]:
# Review the cleaned teaching_staff text for the "Data Science" course.
# NOTE(review): this call is identical to the earlier Data Science preview in
# the cleaning section; one of the two cells is redundant.
dh.print_text_context_from_program_dicts(
    data=cleaned_bs_teachingstaff,
    course_names_to_include=["Data Science"],
    doc_types_to_include=["teaching_staff"]
)


--- Document: bachelor_data-science_teaching-staff_text.txt ---
Course Name: Data Science
Document Type: teaching_staff

Text Content:
Text from https://www.novaims.unl.pt/en//programs/bachelor-s-degrees/data-science/teaching-staff/#: Teaching Staff en Programs Bachelor's Degrees Data Science Teaching Staff Américo Rio Invited Assistant Professor americo.rio@novaims.unl.pt Ana Cristina Costa Associate Professor cristina@novaims.unl.pt Artur Varanda Adjunct Lecturer avaranda@novaims.unl.pt Augusto Santos Assistant Professor ajrsantos@novaims.unl.pt Bruno Damásio Assistant Professor bdamasio@novaims.unl.pt Carina Albuquerque Assistant Professor calbuquerque@novaims.unl.pt Carolina Maria Shaul Adjunct Lecturer cshaul@novaims.unl.pt Carolina Santos Maximiano Adjunct Lecturer cmaximiano@novaims.unl.pt Carolina Vasconcelos Invited Teaching Assistant cvasconcelos@novaims.unl.pt Catarina Neves Assistant Professor cneves@novaims.unl.pt Catarina Palha Invited Teaching Assistant cpalha@novaims.u

In [14]:
# Review the cleaned teaching_staff text for the "Information Management" course.
dh.print_text_context_from_program_dicts(
    data=cleaned_bs_teachingstaff,
    course_names_to_include=["Information Management"],
    doc_types_to_include=["teaching_staff"]
)


--- Document: bachelor_information-management_teaching-staff_text.txt ---
Course Name: Information Management
Document Type: teaching_staff

Text Content:
Text from https://www.novaims.unl.pt/en//programs/bachelor-s-degrees/information-management/teaching-staff/#: Teaching Staff en Programs Bachelor's Degrees Information Management Teaching Staff Ana Beatriz Domingues Farinha Adjunct Lecturer afarinha@novaims.unl.pt Ana Cristina Costa Associate Professor cristina@novaims.unl.pt André Barriguinha Professor of the Practice abarriguinha@novaims.unl.pt Augusto Santos Assistant Professor ajrsantos@novaims.unl.pt Bruno Damásio Assistant Professor bdamasio@novaims.unl.pt Bruno Rodrigues Adjunct Lecturer brodrigues@novaims.unl.pt Carlos Tam Professor of the Practice carlosvai@novaims.unl.pt Carolina Maria Shaul Adjunct Lecturer cshaul@novaims.unl.pt Carolina Vasconcelos Invited Teaching Assistant cvasconcelos@novaims.unl.pt Catarina Neves Assistant Professor cneves@novaims.unl.pt Catarina Oli

In [15]:
# Review the cleaned teaching_staff text for the "Information Systems" course.
dh.print_text_context_from_program_dicts(
    data=cleaned_bs_teachingstaff,
    course_names_to_include=["Information Systems"],
    doc_types_to_include=["teaching_staff"]
)


--- Document: bachelor_information-systems_teaching-staff_text.txt ---
Course Name: Information Systems
Document Type: teaching_staff

Text Content:
Text from https://www.novaims.unl.pt/en//programs/bachelor-s-degrees/information-systems/teaching-staff/#: Teaching Staff en Programs Bachelor's Degrees Information Systems Teaching Staff Américo Rio Invited Assistant Professor americo.rio@novaims.unl.pt Ana Cristina Costa Associate Professor cristina@novaims.unl.pt André Barriguinha Professor of the Practice abarriguinha@novaims.unl.pt António Monteiro Invited Teaching Assistant amonteiro@novaims.unl.pt Augusto Santos Assistant Professor ajrsantos@novaims.unl.pt Bruno Rodrigues Adjunct Lecturer brodrigues@novaims.unl.pt Carlos Tam Professor of the Practice carlosvai@novaims.unl.pt Carolina Beatriz Lopes Silvestre Adjunct Lecturer csilvestre@novaims.unl.pt Constança Filipa Ferreira Rodrigues Adjunct Lecturer cfrodrigues@novaims.unl.pt Diogo Rasteiro Research Assistant drasteiro@novaims.un

# Saving the cleaned dictionary to a pickle file

In [17]:
import pickle

# Destination folder for the preprocessed bachelor's data, given relative to
# the current working directory.
output_folder = "../../data/Preprocessing_text/bachelors_data"

# Create the folder (and any missing parents) if it does not exist yet.
os.makedirs(output_folder, exist_ok=True)

# Persist the cleaned teaching-staff dictionary as a single pickle file.
# Fix: the previous code dumped `cleaned_teachingstaff`, a name that is never
# defined in this notebook (the cleaned data lives in
# `cleaned_bs_teachingstaff`), so the cell raised NameError on a fresh kernel.
output_path = os.path.join(output_folder, "dict_teachingstaff_cleaned.pkl")
with open(output_path, "wb") as f:
    pickle.dump(cleaned_bs_teachingstaff, f)

print("Pickle files saved successfully!")

Pickle files saved successfully!
