# Data Aggregation


In [None]:
import pandas as pd

import sys
import os

## Aggregate survey results - English Cohorts

In [None]:
#
# Aggregate all results from English cohorts in a single dataframe
#

# Directory holding one "results_<cohort>.xlsx" workbook per English cohort.
path_results_en = "../private_data/data/survey_results_en/"

# Cohort identifiers; each one selects the workbook "results_<cohort>.xlsx".
cohorts_en = [
    "2021.09",
    "2022.01",
    "2022.03",
    "2022.06",
    "2022.09",
    "2022.11",
    "2023.02",
    "2023.05",
    "2023.07",
    "2023.10",
    "2024.01",
    "2024.04",
    "2024.07",
    "2024.09",
    "2025.01",
]

# Read every cohort workbook. os.path.join is used because path_results_en
# already ends with "/", so plain string formatting produced a doubled slash.
dataframes_en = [
    pd.read_excel(os.path.join(path_results_en, f"results_{cohort}.xlsx"))
    for cohort in cohorts_en
]

# Stack all cohorts and renumber the rows so that index labels are unique.
merged_df_en = pd.concat(dataframes_en, ignore_index=True)

# read_excel consumes the header row as column names, so every row of the
# merged frame is a survey entry and len() is already the entry count.
display(f"Number of entries: {len(merged_df_en)}")


#
# Rename columns
#

# Make the helper module under ../scripts/aggregation/ importable; the path is
# resolved relative to the notebook's current working directory.
aggregation_scripts_dir = os.path.abspath(
    os.path.join(os.getcwd(), "../scripts/aggregation/")
)
sys.path.append(aggregation_scripts_dir)
from rename_columns_en import rename_columns_en

# The helper returns the dataframe with renamed columns; rebind the result.
merged_df_en = rename_columns_en(merged_df_en)



#
# Map names to ct_student_id
# (the "name" and "final_comments" columns are removed further down in this cell)
#
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "../private_data/dictionaries_student_data/")))
from dictionary_students_en import dictionary_students_en

merged_df_en.insert(2, "ct_student_id", None) # add column "ct_student_id"

# Strip the survey names once: the result does not change between iterations,
# so recomputing it for every dictionary entry was wasted work.
stripped_names = merged_df_en["name"].str.strip()

# dictionary_students_en maps a student name to that student's ct_student_id;
# both sides are stripped so stray surrounding whitespace does not block a match.
for student_name, student_id in dictionary_students_en.items():
    merged_df_en.loc[stripped_names == student_name.strip(), "ct_student_id"] = student_id

# Rows whose name matched no dictionary entry keep an empty ct_student_id and
# can no longer be traced once "name" is dropped, so report how many there are.
# Only the count is shown, never the names themselves.
unmatched_count = int(merged_df_en["ct_student_id"].isna().sum())
if unmatched_count:
    display(f"Entries without a ct_student_id: {unmatched_count}")

#
# Tag every English-cohort row with its cohort language
#
# The scalar "EN" is broadcast to all rows; the new "cohort_language" column
# is placed at zero-based position 4.
merged_df_en.insert(loc=4, column="cohort_language", value="EN")


#
# Remove columns that may contain personal data
#

# "Marca temporal" is Spanish for "timestamp" (presumably the form's
# submission-time column; verify against the source workbooks).
personal_data_columns = ["name", "final_comments", "Marca temporal"]
merged_df_en.drop(columns=personal_data_columns, inplace=True)


#
# Debug: uncomment the lines below to inspect the full merged dataframe
#
# pd.set_option('display.max_rows', None)
# display(merged_df_en)

## Join results from English + Spanish cohorts

In [None]:
# Load the Spanish-cohort survey results (presumably already translated to
# English, going by the file name; confirm against the data).
es_results_file = "../private_data/data/survey_results_es/es_data_translated.xlsx"
es_data_translated = pd.read_excel(es_results_file)

In [None]:
# NOTE: add cohort language if needed here
# Stack the Spanish rows first, then the English rows. ignore_index=True
# renumbers the combined rows; without it both inputs keep their own labels,
# leaving duplicate index values that make label-based lookups ambiguous.
merged_df_en_es = pd.concat([es_data_translated, merged_df_en], ignore_index=True)

In [None]:
# Write the combined English + Spanish results to a single workbook;
# index=False keeps the dataframe's row index out of the file.
aggregated_results_file = "../private_data/data/survey_results_aggregated/data.xlsx"
merged_df_en_es.to_excel(aggregated_results_file, index=False)