In [1]:
import os
# Step two directory levels up from the current working directory
# (the recorded run shows this is the Thesis project root).
two_levels_up = os.path.abspath(os.path.join("..", ".."))
os.chdir(two_levels_up)

# Enter model/src when it is present; otherwise stay where we are.
# Presumably this is what lets the later `libs.*` imports resolve - confirm.
model_dir = os.path.join(os.getcwd(), "model", "src")
if os.path.exists(model_dir):
    os.chdir(model_dir)

print("Current Directory:", os.getcwd())

Current Directory: c:\Users\1176153\Downloads\github\Thesis\model\src


In [2]:
import pandas as pd
import re
import libs.data_understanding as du
import libs.data_preparation as dp

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\1176153\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# Raw teaching-staff texts, stored as a pickled dict keyed by extracted-text filename.
# The path is relative to the working directory set in the first cell.
# NOTE: unpickling can execute arbitrary code - only load pickles produced by this project.
teachingstaff_raw_path = r"../../data/Webscrapping/bachelor_dict_textfiles_raw/dict_teachingstaff_raw.pkl"
dict_teachingstaff_raw = pd.read_pickle(teachingstaff_raw_path)


# Cleaning Teaching Staff files

In [4]:
# List the document keys (one extracted-text filename per bachelor programme page).
dict_teachingstaff_raw.keys()

dict_keys(['data-science_teaching-staff_extracted_text.txt', 'information-management_teaching-staff_extracted_text.txt', 'information-systems_teaching-staff_extracted_text.txt'])

## Deleting words

In [9]:
# Show the full text of the first raw document only (dict insertion order);
# nothing is printed when the dictionary is empty.
first_item = next(iter(dict_teachingstaff_raw.items()), None)
if first_item is not None:
    filename, content = first_item
    print(f"\n--- {filename} ---")
    print(content)


--- data-science_teaching-staff_extracted_text.txt ---
Text from https://www.novaims.unl.pt/en/education/programs/bachelor-s-degrees/data-science/teaching-staff/#:
Teaching Staff
Teaching Staff
en
Education
Programs
Bachelor's Degrees
Data Science
Teaching Staff
Teaching Staff
Teaching Staff
Apply here
Teaching Staff
Américo Rio
Invited Assistant Professor
americo.rio@novaims.unl.pt
Know more
Ana Cristina Costa
Associate Professor
cristina@novaims.unl.pt
Know more
Artur Varanda
Adjunct Lecturer
avaranda@novaims.unl.pt
Know more
Augusto Santos
Assistant Professor
ajrsantos@novaims.unl.pt
Know more
Bruno Damásio
Assistant Professor
bdamasio@novaims.unl.pt
Know more
Carina Albuquerque
Assistant Professor
calbuquerque@novaims.unl.pt
Know more
Carolina Maria Shaul
Adjunct Lecturer
cshaul@novaims.unl.pt
Know more
Carolina Santos Maximiano
Adjunct Lecturer
cmaximiano@novaims.unl.pt
Know more
Carolina Vasconcelos
Invited Teaching Assistant
cvasconcelos@novaims.unl.pt
Know more
Catarina Neves


In [10]:
# Navigation/button leftovers seen in the raw text ("Know more", "Apply here", "Education").
# NOTE(review): the cleaned output shows "education" also stripped from inside the source
# URL (".../en/education/programs/..." becomes ".../en//programs/..."), so removal looks
# case-insensitive and not limited to standalone words - confirm in dp.clean_text_documents.
boilerplate_words = ["know", "more", "apply", "here", "Education"]

# Phrase repeated many times in a row by the page chrome.
# NOTE(review): the cleaned output still contains it more than once, so presumably only
# consecutive repeats are collapsed - confirm against the helper.
repeated_phrases = ["Teaching Staff"]

cleaned_teachingstaff = dp.clean_text_documents(
    dict_teachingstaff_raw,
    words_to_remove=boilerplate_words,
    words_to_deduplicate=repeated_phrases,
)

In [11]:
# Show the full text of the first cleaned document only (dict insertion order);
# nothing is printed when the dictionary is empty.
first_item = next(iter(cleaned_teachingstaff.items()), None)
if first_item is not None:
    filename, content = first_item
    print(f"\n--- {filename} ---")
    print(content)


--- data-science_teaching-staff_extracted_text.txt ---
Text from https://www.novaims.unl.pt/en//programs/bachelor-s-degrees/data-science/teaching-staff/#: Teaching Staff en Programs Bachelor's Degrees Data Science Teaching Staff Américo Rio Invited Assistant Professor americo.rio@novaims.unl.pt Ana Cristina Costa Associate Professor cristina@novaims.unl.pt Artur Varanda Adjunct Lecturer avaranda@novaims.unl.pt Augusto Santos Assistant Professor ajrsantos@novaims.unl.pt Bruno Damásio Assistant Professor bdamasio@novaims.unl.pt Carina Albuquerque Assistant Professor calbuquerque@novaims.unl.pt Carolina Maria Shaul Adjunct Lecturer cshaul@novaims.unl.pt Carolina Santos Maximiano Adjunct Lecturer cmaximiano@novaims.unl.pt Carolina Vasconcelos Invited Teaching Assistant cvasconcelos@novaims.unl.pt Catarina Neves Assistant Professor cneves@novaims.unl.pt Catarina Palha Invited Teaching Assistant cpalha@novaims.unl.pt Dhruv Akshay Pandit Invited Teaching Assistant dpandit@novaims.unl.pt Diog

## Teaching staff stats after cleaning

In [12]:
# Run the four summaries that take only the cleaned corpus, in their original order.
for summarise in (
    du.histogram_word_count_multiple_docs,
    du.histogram_token_count_multiple_docs,
    du.generate_document_statistics_by_word_count,
    du.generate_document_statistics_by_tokens,
):
    summarise(cleaned_teachingstaff)

# Frequency plots limited to the 20 most frequent entries; n=2 is passed to the n-gram plot.
du.bar_plot_word_frequency(cleaned_teachingstaff, top_n=20)
du.bar_plot_ngram_frequency(cleaned_teachingstaff, n=2, top_n=20)

# Analysing the results

In [13]:
# Document to inspect in full.
target_filename = 'information-management_teaching-staff_extracted_text.txt'

if target_filename not in cleaned_teachingstaff:
    print(f"Document with key '{target_filename}' not found.")
else:
    print(f"\n--- {target_filename} ---\n")
    # Break the text after '.', '?' or '!' when followed by a space, for easier reading.
    formatted_content = re.sub(r'([.?!]) ', r'\1\n', cleaned_teachingstaff[target_filename])
    print(formatted_content)


--- information-management_teaching-staff_extracted_text.txt ---

Text from https://www.novaims.unl.pt/en//programs/bachelor-s-degrees/information-management/teaching-staff/#: Teaching Staff en Programs Bachelor's Degrees Information Management Teaching Staff Ana Beatriz Domingues Farinha Adjunct Lecturer afarinha@novaims.unl.pt Ana Cristina Costa Associate Professor cristina@novaims.unl.pt André Barriguinha Professor of the Practice abarriguinha@novaims.unl.pt Augusto Santos Assistant Professor ajrsantos@novaims.unl.pt Bruno Damásio Assistant Professor bdamasio@novaims.unl.pt Bruno Rodrigues Adjunct Lecturer brodrigues@novaims.unl.pt Carlos Tam Professor of the Practice carlosvai@novaims.unl.pt Carolina Maria Shaul Adjunct Lecturer cshaul@novaims.unl.pt Carolina Vasconcelos Invited Teaching Assistant cvasconcelos@novaims.unl.pt Catarina Neves Assistant Professor cneves@novaims.unl.pt Catarina Oliveira Ferraz Invited Teaching Assistant cferraz@novaims.unl.pt Constança Filipa Ferreira 

In [15]:
# Document to inspect in full.
target_filename = 'information-systems_teaching-staff_extracted_text.txt'

if target_filename not in cleaned_teachingstaff:
    print(f"Document with key '{target_filename}' not found.")
else:
    print(f"\n--- {target_filename} ---\n")
    # Break the text after '.', '?' or '!' when followed by a space, for easier reading.
    formatted_content = re.sub(r'([.?!]) ', r'\1\n', cleaned_teachingstaff[target_filename])
    print(formatted_content)


--- information-systems_teaching-staff_extracted_text.txt ---

Text from https://www.novaims.unl.pt/en//programs/bachelor-s-degrees/information-systems/teaching-staff/#: Teaching Staff en Programs Bachelor's Degrees Information Systems Teaching Staff Américo Rio Invited Assistant Professor americo.rio@novaims.unl.pt Ana Cristina Costa Associate Professor cristina@novaims.unl.pt André Barriguinha Professor of the Practice abarriguinha@novaims.unl.pt António Monteiro Invited Teaching Assistant amonteiro@novaims.unl.pt Augusto Santos Assistant Professor ajrsantos@novaims.unl.pt Bruno Rodrigues Adjunct Lecturer brodrigues@novaims.unl.pt Carlos Tam Professor of the Practice carlosvai@novaims.unl.pt Carolina Beatriz Lopes Silvestre Adjunct Lecturer csilvestre@novaims.unl.pt Constança Filipa Ferreira Rodrigues Adjunct Lecturer cfrodrigues@novaims.unl.pt Diogo Rasteiro Research Assistant drasteiro@novaims.unl.pt Filipe Marques Professor of the Practice fjmarques@novaims.unl.pt Filipe Montargi

# Saving the cleaned documents as a pickled dictionary

In [16]:
# Display the whole cleaned dictionary as a final check before saving.
# NOTE(review): this prints every document in full; consider showing only the keys or a slice.
cleaned_teachingstaff

{'data-science_teaching-staff_extracted_text.txt': "Text from https://www.novaims.unl.pt/en//programs/bachelor-s-degrees/data-science/teaching-staff/#: Teaching Staff en Programs Bachelor's Degrees Data Science Teaching Staff Américo Rio Invited Assistant Professor americo.rio@novaims.unl.pt Ana Cristina Costa Associate Professor cristina@novaims.unl.pt Artur Varanda Adjunct Lecturer avaranda@novaims.unl.pt Augusto Santos Assistant Professor ajrsantos@novaims.unl.pt Bruno Damásio Assistant Professor bdamasio@novaims.unl.pt Carina Albuquerque Assistant Professor calbuquerque@novaims.unl.pt Carolina Maria Shaul Adjunct Lecturer cshaul@novaims.unl.pt Carolina Santos Maximiano Adjunct Lecturer cmaximiano@novaims.unl.pt Carolina Vasconcelos Invited Teaching Assistant cvasconcelos@novaims.unl.pt Catarina Neves Assistant Professor cneves@novaims.unl.pt Catarina Palha Invited Teaching Assistant cpalha@novaims.unl.pt Dhruv Akshay Pandit Invited Teaching Assistant dpandit@novaims.unl.pt Diogo Ra

In [17]:
import pickle 
# Destination directory for the preprocessed bachelor's data (created if missing).
output_folder = "../../data/Preprocessing_text/bachelors_data"
os.makedirs(output_folder, exist_ok=True)

# Write the cleaned teaching-staff dictionary to a single pickle file.
cleaned_pickle_path = os.path.join(output_folder, "dict_teachingstaff_cleaned.pkl")
with open(cleaned_pickle_path, "wb") as pickle_file:
    pickle.dump(cleaned_teachingstaff, pickle_file)

print("Pickle files saved successfully!")

Pickle files saved successfully!
