In [4]:
import os
# Climb two levels from the starting directory (expected to be the Thesis root)
os.chdir(os.path.abspath(os.path.join(os.pardir, os.pardir)))

# Work from <root>/model/src whenever that folder is present
model_dir = os.path.join(os.getcwd(), "model", "src")
if os.path.exists(model_dir):
    os.chdir(model_dir)

print("Current Directory:", os.getcwd())

Current Directory: c:\Users\1176153\Downloads\github\Thesis\model\src


In [26]:
import os
import re

def normalize_bachelor_filenames(folder_path: str) -> None:
    """Rename the ``.txt`` files in ``folder_path`` to the bachelor naming scheme.

    For every directory entry whose name ends with ``.txt`` the new name is
    built by:

    1. replacing every occurrence of "program" with "studyplan" (case-insensitive),
    2. removing every occurrence of "_extracted" (case-insensitive),
    3. prepending "bachelor_" unless the name already starts with it
       (checked case-insensitively).

    Names that are already normalized are left untouched, so the function can be
    re-run on the same folder. A file is never renamed onto an existing file:
    such collisions are reported and skipped instead of overwriting data.

    Args:
        folder_path: Directory whose ``.txt`` files are renamed in place.

    Raises:
        OSError: If the folder cannot be listed or a rename fails.
    """
    # Sorted so that the processing (and reporting) order is deterministic.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".txt"):
            continue

        original_path = os.path.join(folder_path, filename)

        # 1. Replace "program" with "studyplan"
        new_filename = re.sub(r"program", "studyplan", filename, flags=re.IGNORECASE)

        # 2. Remove "_extracted"
        new_filename = re.sub(r"_extracted", "", new_filename, flags=re.IGNORECASE)

        # 3. Add "bachelor_" at the beginning if not already present
        if not new_filename.lower().startswith("bachelor_"):
            new_filename = "bachelor_" + new_filename

        new_path = os.path.join(folder_path, new_filename)

        # Nothing to do when the name is already normalized
        if new_path == original_path:
            continue

        # os.rename silently replaces an existing target on POSIX and raises on
        # Windows; skip instead so no file is lost and the loop keeps going.
        if os.path.exists(new_path):
            print(f"Skipping (target already exists):\n  {filename} → {new_filename}")
            continue

        print(f"Renaming:\n  {filename} → {new_filename}")
        os.rename(original_path, new_path)


In [27]:
# Rename the bachelor text files in place; the path is relative to the current working directory.
folder_path = r"../../data/Webscrapping/bachelor_degree/"
normalize_bachelor_filenames(folder_path)


Renaming:
  data-science_main_course_extracted_text.txt → bachelor_data-science_main_course_text.txt
Renaming:
  data-science_program_extracted_text.txt → bachelor_data-science_studyplan_text.txt
Renaming:
  data-science_teaching-staff_extracted_text.txt → bachelor_data-science_teaching-staff_text.txt
Renaming:
  information-management_main_course_extracted_text.txt → bachelor_information-management_main_course_text.txt
Renaming:
  information-management_program_extracted_text.txt → bachelor_information-management_studyplan_text.txt
Renaming:
  information-management_teaching-staff_extracted_text.txt → bachelor_information-management_teaching-staff_text.txt
Renaming:
  information-systems_main_course_extracted_text.txt → bachelor_information-systems_main_course_text.txt
Renaming:
  information-systems_program_extracted_text.txt → bachelor_information-systems_studyplan_text.txt
Renaming:
  information-systems_teaching-staff_extracted_text.txt → bachelor_information-systems_teaching-staf

In [2]:
import os
from typing import Dict

# Leading degree markers removed from the filename before building the course name.
_DEGREE_PREFIXES = ("bachelor_", "postgraduate_", "master_")

# Trailing document-type markers removed from the filename before building the course name.
_DOC_TYPE_SUFFIXES = (
    "_teachingstaff", "_teaching-staff", "_faculty",
    "_study_plan", "_studyplan", "_study plan",
    "_maininfo", "_main_info", "_main_course",
    "_text",
)


def _infer_degree(filename_lower: str) -> str:
    """Return the degree label found in a lower-cased filename, or "unknown"."""
    if "postgraduate" in filename_lower:
        return "postgraduate"
    if "master" in filename_lower:
        return "masters"
    if "bachelor" in filename_lower:
        return "bachelor"
    return "unknown"


def _infer_doc_type(filename_lower: str) -> str:
    """Return the document type found in a lower-cased filename, or "unknown"."""
    if any(key in filename_lower for key in ("teachingstaff", "teaching-staff", "faculty")):
        return "teaching_staff"
    if any(key in filename_lower for key in ("study plan", "study_plan", "studyplan")):
        return "study_plan"
    if any(key in filename_lower for key in ("maininfo", "main_info", "main_course")):
        return "main_info"
    return "unknown"


def _extract_course_name(filename_lower: str) -> str:
    """Build a title-cased course name from a lower-cased ``.txt`` filename."""
    course_part = filename_lower
    # Drop only the extension at the end, not any ".txt" inside the name.
    if course_part.endswith(".txt"):
        course_part = course_part[: -len(".txt")]

    for prefix in _DEGREE_PREFIXES:
        if course_part.startswith(prefix):
            course_part = course_part[len(prefix):]

    # Strip markers only from the END of the name, repeating so stacked markers
    # such as "_main_course_text" are all removed. Removing them anywhere in the
    # name would eat words of the course itself (e.g. "data_text_mining").
    stripped = True
    while stripped:
        stripped = False
        for suffix in _DOC_TYPE_SUFFIXES:
            if course_part.endswith(suffix):
                course_part = course_part[: -len(suffix)]
                stripped = True

    return course_part.replace("-", " ").replace("_", " ").strip().title()


def load_texts_with_metadata(folder_path: str) -> Dict[str, Dict]:
    """Read every ``.txt`` file in ``folder_path`` and attach filename-derived metadata.

    Args:
        folder_path: Directory to scan (not recursive). Entries whose name does
            not end with ``.txt`` are ignored.

    Returns:
        A dict keyed by the original filename. Each value has the form
        ``{"text": <stripped file content>, "metadata": {"degree": ...,
        "doc_type": ..., "course_name": ...}}`` where

        * ``degree`` is "postgraduate", "masters", "bachelor" or "unknown",
        * ``doc_type`` is "teaching_staff", "study_plan", "main_info" or "unknown",
        * ``course_name`` is the filename without the extension, leading degree
          prefix and trailing document-type markers, with "-"/"_" turned into
          spaces and title-cased.

    Raises:
        OSError: If the folder cannot be listed or a file cannot be read.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    docs_input = {}

    # Sorted so the resulting dict has a reproducible insertion order.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".txt"):
            continue

        filepath = os.path.join(folder_path, filename)

        with open(filepath, "r", encoding="utf-8") as f:
            text = f.read().strip()

        filename_lower = filename.lower()

        docs_input[filename] = {
            "text": text,
            "metadata": {
                "degree": _infer_degree(filename_lower),
                "doc_type": _infer_doc_type(filename_lower),
                "course_name": _extract_course_name(filename_lower),
            },
        }

    return docs_input




In [3]:
# Load the postgraduate/master teaching-staff texts with filename-derived metadata.
postgrad_teachingstaff_path = r"../../data/Webscrapping/postgraduate_master_degrees/teachingstaff"
teachingstaff_docs_input = load_texts_with_metadata(postgrad_teachingstaff_path)

In [4]:
# Inspect the loaded documents: {filename: {"text": ..., "metadata": {...}}}
teachingstaff_docs_input

{'european-master-of-science-in-information-systems-management_Faculty.txt': {'text': 'Faculty Faculty Apply here Faculty Afshin Ashofteh Assistant Professor aashofteh@novaims.unl.pt Know more Américo Rio Invited Assistant Professor americo.rio@novaims.unl.pt Know more Ana Cristina Costa Associate Professor cristina@novaims.unl.pt Know more Ana Gonçalves Research Assistant agoncalves@novaims.unl.pt Know more André Barriguinha Professor of the Practice abarriguinha@novaims.unl.pt Know more António Monteiro Invited Teaching Assistant amonteiro@novaims.unl.pt Know more Augusto Santos Assistant Professor ajrsantos@novaims.unl.pt Know more Bernardo Dias Raimundo Adjunct Lecturer braimundo@novaims.unl.pt Know more Bruno Damásio Assistant Professor bdamasio@novaims.unl.pt Know more Bruno Jardim Assistant Professor bjardim@novaims.unl.pt Know more Bruno Rodrigues Adjunct Lecturer brodrigues@novaims.unl.pt Know more Carlos Tam Professor of the Practice carlosvai@novaims.unl.pt Know more Catarin

In [32]:
# Load the postgraduate/master study-plan texts with filename-derived metadata.
postgrad_studyplan_path = r"../../data/Webscrapping/postgraduate_master_degrees/studyplan"
studyplan_docs_input = load_texts_with_metadata(postgrad_studyplan_path)

In [33]:
# Inspect the loaded documents: {filename: {"text": ..., "metadata": {...}}}
studyplan_docs_input

{'european-master-of-science-in-information-systems-management_Study plan.txt': {'text': 'Study plan 1 st Year - 1 st Semester (Fall) - NOVA IMS Curricular Units ECTS Data Management and Storage 4 Data Privacy, Security and Ethics 4 Descriptive Methods of Data Mining 7,5 Information Systems Development 4 Information Systems Governance 3,5 Information Technologies Services Management 4 Management of Information Systems 3,5 Course Unit Loading... modal item card item 1 st Year - 2 nd Semester (Spring) - SEB LU Curricular Units ECTS Business Intelligence and Analytics 7 Business Skills Development 1 4 Digital Business 7 Strategic Management 2 7 Course Unit Loading... modal item card item 2 nd Year - 3 rd Semester (Fall) - SEB LU Curricular Units ECTS Accounting Information for Decision Making 7 Business Process Management 7 IT Project Management or Information Systems Analysis and Design 7 Organization and Management 7 Research Methods and Techniques 7 Course Unit Loading... modal item ca

In [34]:
# Load the postgraduate/master main-info texts with filename-derived metadata.
postgrad_maininfo_path = r"../../data/Webscrapping/postgraduate_master_degrees/maininfo"
maininfo_docs_input = load_texts_with_metadata(postgrad_maininfo_path)

In [35]:
# Inspect the loaded documents: {filename: {"text": ..., "metadata": {...}}}
maininfo_docs_input

{'european-master-of-science-in-information-systems-management_main_course.txt': {'text': "European Master of Science in Information Systems Management\nen\nEducation\nPrograms\nPostgraduate Programs and Master Degree Programs\nEuropean Master of Science in Information Systems Management\nThe European Master of Science in Information Systems Management aims at graduates who are willing to\nacquire additional information systems management skills\nto be applied in their professional field and intend to have a master's double degree diploma from two universities:\nM.Sc. in Information Management, with a specialization in Information Systems Management\n,\nby NOVA Information Management School of Universidade NOVA de Lisboa, Portugal\n- Ranked by Eduniversal as the best\xa0 Master Program in Information Systems Management in\nWestern Europe\n.\nM.Sc. in Business Informatics by School of Economics and Business\xa0(SEB LU), of Ljubljana University, Slovenia\n- Accredited by AACBS, the Assoc

In [43]:
import os
import pickle
from typing import Dict, List

def save_dict_to_pickle(data_dict: Dict[str, Dict], output_file_name: str, output_folder: str):
    """Pickle ``data_dict`` to ``output_folder/output_file_name``.

    The destination folder (including missing parents) is created on demand,
    and the written path is printed once the dump has finished.

    Args:
        data_dict: Dictionary to serialize.
        output_file_name: Name of the pickle file to create or overwrite.
        output_folder: Directory that will contain the pickle file.
    """
    # Create the destination folder first so open() cannot fail on a missing directory
    os.makedirs(output_folder, exist_ok=True)
    destination = os.path.join(output_folder, output_file_name)

    with open(destination, "wb") as handle:
        pickle.Pickler(handle).dump(data_dict)

    print(f"✅ Saved: {destination}")

def process_multiple_folders(
    folder_paths: List[str],
    output_folder: str = os.path.join("..", "..", "data", "Preprocessing_text", "all_programs_textfiles_raw"),
):
    """Load several folders of text files and pickle them grouped by degree.

    Every folder is read with ``load_texts_with_metadata``. Documents whose
    degree is "bachelor" go into one dictionary, those with "postgraduate" or
    "masters" into another; documents with any other degree are reported and
    skipped. Both dictionaries are then written with ``save_dict_to_pickle`` as
    ``dict_bachelors_raw.pkl`` and ``dict_postgrad_and_masters_raw.pkl``.

    Args:
        folder_paths: Folders to scan for ``.txt`` files.
        output_folder: Directory receiving the two pickle files. The default is
            relative to the current working directory, like the input folders
            this notebook passes in, rather than an absolute machine-specific path.
    """
    all_bachelors = {}
    all_postgrad_and_masters = {}

    for folder_path in folder_paths:
        folder_docs = load_texts_with_metadata(folder_path)

        for filename, content in folder_docs.items():
            degree = content["metadata"]["degree"]

            if degree == "bachelor":
                target = all_bachelors
            elif degree in {"postgraduate", "masters"}:
                target = all_postgrad_and_masters
            else:
                print(f"⚠️ Skipping unknown degree in file: {filename}")
                continue

            # Entries are keyed by bare filename, so the same name coming from
            # two folders replaces the earlier document; make that visible.
            if filename in target:
                print(f"⚠️ Duplicate filename, overwriting earlier entry: {filename}")
            target[filename] = content

    save_dict_to_pickle(all_bachelors, "dict_bachelors_raw.pkl", output_folder)
    save_dict_to_pickle(all_postgrad_and_masters, "dict_postgrad_and_masters_raw.pkl", output_folder)

In [44]:
# Example usage: process the bachelor folder plus the three postgraduate/master
# folders (teaching staff, study plan, main info). Paths are relative to the
# current working directory.
folder_paths = [
    r"../../data/Webscrapping/bachelor_degree/",
    r"../../data/Webscrapping/postgraduate_master_degrees/teachingstaff",
    r"../../data/Webscrapping/postgraduate_master_degrees/studyplan",
    r"../../data/Webscrapping/postgraduate_master_degrees/maininfo",
]

process_multiple_folders(folder_paths)

✅ Saved: C:\Users\1176153\Downloads\github\Thesis\data\Preprocessing_text\all_programs_textfiles_raw\dict_bachelors_raw.pkl
✅ Saved: C:\Users\1176153\Downloads\github\Thesis\data\Preprocessing_text\all_programs_textfiles_raw\dict_postgrad_and_masters_raw.pkl


In [47]:
from libs import data_handeling as dh

In [48]:
# Reload the pickled bachelor documents for inspection. The path is relative to
# the working directory (model/src), like the other data paths in this notebook,
# instead of an absolute path that only exists on one machine.
bachelors_data_raw = dh.load_pickle_to_dict(r"../../data/Preprocessing_text/all_programs_textfiles_raw/dict_bachelors_raw.pkl")
bachelors_data_raw

{'bachelor_data-science_main_course_text.txt': {'text': "Text from https://www.novaims.unl.pt/en/education/programs/bachelor-s-degrees/data-science/:\nData Science\nDegree in\nData Science\nen\nEducation\nPrograms\nBachelor's Degrees\nData Science\nIn the Bachelor´s Degree in Data Science, students learn the most modern techniques of artificial intelligence and machine learning to analyze large volumes of data (Big Data).\nThey will become true data scientists - considered the sexiest profession of the 21\nst\ncentury by the Harvard Business Review.\nThe main objective of this course is to train future professionals capable of understanding, developing and using models, algorithms and the most advanced techniques in data science, to analyze and extract knowledge from Big Data.\nThe 3\nrd\nphase of applications under the International Student Statute for the 2025/26 academic year are open from February 26\nth\nto March 27\nth\n, 2025.\nDuration\n3 years (6 semesters)\nTimetable\nDaytime

# Testing data_handeling.py for the creation of the dicts_programs_textfiles_raw

In [5]:
from libs import data_handeling as dh
# Build the raw program dictionaries through the library helper. Per the recorded
# cell output it saves dict_bachelors_raw.pkl and dict_postgrad_and_masters_raw.pkl.
dh.create_dict_programs_raw()

✅ Saved: ..\..\data\Preprocessing_text\all_programs_textfiles_raw\dict_bachelors_raw.pkl
✅ Saved: ..\..\data\Preprocessing_text\all_programs_textfiles_raw\dict_postgrad_and_masters_raw.pkl


In [6]:
# Reload the bachelor pickle for inspection. The path is relative to the working
# directory (model/src), like the other data paths in this notebook, instead of
# an absolute path that only exists on one machine.
bachelors_data_raw = dh.load_pickle_to_dict(r"../../data/Preprocessing_text/all_programs_textfiles_raw/dict_bachelors_raw.pkl")
bachelors_data_raw

{'bachelor_data-science_main_course_text.txt': {'text': "<Introducion>\nIn the Bachelor´s Degree in Data Science, students learn the most modern techniques of artificial intelligence and machine learning to analyze large volumes of data (Big Data).\nThey will become true data scientists - considered the sexiest profession of the 21\nst\ncentury by the Harvard Business Review.\nThe main objective of this course is to train future professionals capable of understanding, developing and using models, algorithms and the most advanced techniques in data science, to analyze and extract knowledge from Big Data.\nThe 3\nrd\nphase of applications under the International Student Statute for the 2025/26 academic year are open from February 26\nth\nto March 27\nth\n, 2025.\nDuration\n3 years (6 semesters)\nTimetable\nDaytime\nStart\nSeptember 2025\nCareer Opportunities\nThe Bachelor´s Degree in Data Science allows a quick integration in the most varied sectors of activity, namely: Information Techn

In [7]:
# Reload the postgraduate/master pickle for inspection. The path is relative to
# the working directory (model/src), like the other data paths in this notebook,
# instead of an absolute path that only exists on one machine.
postgradmasters_data_raw = dh.load_pickle_to_dict(r"../../data/Preprocessing_text/all_programs_textfiles_raw/dict_postgrad_and_masters_raw.pkl")
postgradmasters_data_raw

{'european-master-of-science-in-information-systems-management_Faculty.txt': {'text': 'Faculty Faculty Apply here Faculty Afshin Ashofteh Assistant Professor aashofteh@novaims.unl.pt Know more Américo Rio Invited Assistant Professor americo.rio@novaims.unl.pt Know more Ana Cristina Costa Associate Professor cristina@novaims.unl.pt Know more Ana Gonçalves Research Assistant agoncalves@novaims.unl.pt Know more André Barriguinha Professor of the Practice abarriguinha@novaims.unl.pt Know more António Monteiro Invited Teaching Assistant amonteiro@novaims.unl.pt Know more Augusto Santos Assistant Professor ajrsantos@novaims.unl.pt Know more Bernardo Dias Raimundo Adjunct Lecturer braimundo@novaims.unl.pt Know more Bruno Damásio Assistant Professor bdamasio@novaims.unl.pt Know more Bruno Jardim Assistant Professor bjardim@novaims.unl.pt Know more Bruno Rodrigues Adjunct Lecturer brodrigues@novaims.unl.pt Know more Carlos Tam Professor of the Practice carlosvai@novaims.unl.pt Know more Catarin