In [1]:
import os
# Climb two levels from the notebook's folder to reach the Thesis directory.
# NOTE: the path is relative to the current working directory, so re-running
# this cell without restarting the kernel climbs two further levels.
thesis_dir = os.path.abspath(os.path.join("..", ".."))
os.chdir(thesis_dir)

# Descend into model/src when that folder exists; otherwise stay where we are.
model_dir = os.path.join(os.getcwd(), "model", "src")
if os.path.exists(model_dir):
    os.chdir(model_dir)

print("Current Directory:", os.getcwd())

Current Directory: c:\Users\1176153\Downloads\github\Thesis\model\src


In [41]:
import pandas as pd
import re
import libs.data_understanding as du
import libs.data_preparation as dp

In [3]:
# Folder holding the raw pickled dictionaries, relative to the working directory.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
raw_dicts_dir = "../../data/Webscrapping/dict_textfiles_raw"

dict_teachingstaff_raw = pd.read_pickle(f"{raw_dicts_dir}/dict_teachingstaff_raw.pkl")
dict_studyplan_raw = pd.read_pickle(f"{raw_dicts_dir}/dict_studyplan_raw.pkl")
dict_maininfo_raw = pd.read_pickle(f"{raw_dicts_dir}/dict_maininfo_raw.pkl")

# Cleaning Teaching Staff files

In [6]:
# List the document identifiers; the displayed keys are file names ending in "_Faculty.txt".
dict_teachingstaff_raw.keys()

dict_keys(['european-master-of-science-in-information-systems-management_Faculty.txt', 'master-degree-in-data-driven-marketing-with-a-specialization-in-data-science-for-marketing-working-hours-format_Faculty.txt', 'master-degree-in-data-driven-marketing-with-a-specialization-in-digital-marketing-and-analytics-working-hours-format_Faculty.txt', 'master-degree-in-data-driven-marketing-with-a-specialization-in-marketing-intelligence-working-hours-format_Faculty.txt', 'master-degree-in-data-driven-marketing-with-a-specialization-in-marketing-research-and-crm_Faculty.txt', 'master-degree-in-geographic-information-systems-and-science-with-a-specialization-in-geographic-information-systems-and-science_Faculty.txt', 'master-degree-in-geographic-information-systems-and-science-with-a-specialization-in-geospatial-data-science_Faculty.txt', 'master-degree-in-geospatial-technologies_Faculty.txt', 'master-degree-in-information-management-with-a-specialization-in-digital-transformation_Faculty.txt',

## Removing boilerplate words and collapsing repeated words

In [35]:
import re

def clean_text_documents(text_data, words_to_remove, words_to_deduplicate):
    """
    Clean a collection of text documents.

    Two passes are applied to every document, in this order:

    1. Every whole-word occurrence of an entry in ``words_to_remove`` is
       deleted (case-insensitive). When ``words_to_remove`` is non-empty,
       every run of whitespace is then collapsed to a single space.
    2. Every run of *adjacent*, whitespace-separated repetitions of an entry
       in ``words_to_deduplicate`` (case-insensitive) is collapsed to one
       occurrence, written exactly as spelled in ``words_to_deduplicate``.
       Repetitions separated by other words are left untouched.

    Leading and trailing whitespace is always stripped from the result.

    Parameters:
    - text_data (dict): Maps filenames to document contents (str).
    - words_to_remove (list or None): Words to delete entirely. Empty strings
      are ignored.
    - words_to_deduplicate (list or None): Words whose adjacent repetitions
      are collapsed. Empty strings are ignored.

    Returns:
    - dict: A new dictionary with the same keys and the cleaned text as
      values; ``text_data`` itself is not modified.
    """
    # Empty strings would build degenerate patterns (an empty de-duplication
    # word matches the whitespace *between* words and would glue them together).
    remove_words = [word for word in (words_to_remove or []) if word]
    dedupe_words = [word for word in (words_to_deduplicate or []) if word]

    # Whitespace normalisation stays tied to "a removal list was supplied".
    collapse_whitespace = bool(words_to_remove)

    # Compile every pattern once instead of once per document.
    remove_regex = None
    if remove_words:
        remove_regex = re.compile(
            r'\b(?:' + '|'.join(re.escape(word) for word in remove_words) + r')\b',
            flags=re.IGNORECASE,
        )
    whitespace_regex = re.compile(r'\s+')
    dedupe_rules = [
        (
            re.compile(
                rf'\b{re.escape(word)}(?:\s+{re.escape(word)})+\b',
                flags=re.IGNORECASE,
            ),
            word,
        )
        for word in dedupe_words
    ]

    cleaned_text_data = {}
    for filename, text in text_data.items():
        cleaned_text = text

        # Pass 1: delete unwanted words, then tidy the gaps they leave behind.
        if remove_regex is not None:
            cleaned_text = remove_regex.sub('', cleaned_text)
        if collapse_whitespace:
            cleaned_text = whitespace_regex.sub(' ', cleaned_text).strip()

        # Pass 2: collapse adjacent repetitions. A callable replacement inserts
        # the word literally; a plain string would be parsed as a replacement
        # template, so backslashes in the word would be misinterpreted.
        for dedupe_regex, word in dedupe_rules:
            cleaned_text = dedupe_regex.sub(
                lambda _match, literal=word: literal, cleaned_text
            )

        cleaned_text_data[filename] = cleaned_text.strip()

    return cleaned_text_data






In [42]:
# Words stripped from every document, and words whose repeats are collapsed.
# (Presumably "know more" / "apply here" are link labels left over from scraping - confirm.)
teachingstaff_words_to_remove = ["know", "more", "apply", "here"]
teachingstaff_words_to_deduplicate = ["faculty"]

cleaned_teachingstaff = dp.clean_text_documents(
    dict_teachingstaff_raw,
    words_to_remove=teachingstaff_words_to_remove,
    words_to_deduplicate=teachingstaff_words_to_deduplicate,
)

In [43]:
# Sanity check: show the first 500 characters of the first cleaned document.
# Slicing to one key keeps this a no-op when the dictionary is empty.
for filename in list(cleaned_teachingstaff)[:1]:
    content = cleaned_teachingstaff[filename]
    print(f"\n--- {filename} ---")
    print(content[:500])



--- european-master-of-science-in-information-systems-management_Faculty.txt ---
faculty Afshin Ashofteh Assistant Professor aashofteh@novaims.unl.pt Américo Rio Invited Assistant Professor americo.rio@novaims.unl.pt Ana Cristina Costa Associate Professor cristina@novaims.unl.pt Ana Gonçalves Research Assistant agoncalves@novaims.unl.pt André Barriguinha Professor of the Practice abarriguinha@novaims.unl.pt António Monteiro Invited Teaching Assistant amonteiro@novaims.unl.pt Augusto Santos Assistant Professor ajrsantos@novaims.unl.pt Bernardo Dias Raimundo Adjunct Lecturer b


## Teaching staff stats after cleaning

In [40]:
# Run the corpus-level helpers that take only the documents, in the original order.
# (What each helper renders is defined in libs.data_understanding, not shown here.)
for describe_corpus in (
    du.histogram_word_count_multiple_docs,
    du.histogram_token_count_multiple_docs,
    du.generate_document_statistics_by_word_count,
    du.generate_document_statistics_by_tokens,
):
    describe_corpus(cleaned_teachingstaff)

# Frequency plots limited to the 20 most frequent entries; n=2 selects bigrams.
du.bar_plot_word_frequency(cleaned_teachingstaff, top_n=20)
du.bar_plot_ngram_frequency(cleaned_teachingstaff, n=2, top_n=20)