In [1]:
import pandas as pd
import os

In [2]:
# Input/output locations for this notebook. Both paths are relative, so they
# resolve against the kernel's current working directory.
TRANSFORMED_DATA_DIR = "../../data/silver"  # Silver layer: source of the transformed_<Table>.csv files read below
GOLD_DATA_DIR = "../../data/gold"  # Gold layer: destination for the dimension and fact CSVs
# Create the gold directory (and any missing parents) up front;
# exist_ok=True makes re-running this cell a no-op instead of an error.
os.makedirs(GOLD_DATA_DIR, exist_ok=True) 

In [3]:
# Load every silver-layer table into memory.
# Each table is stored as <TRANSFORMED_DATA_DIR>/transformed_<Table>.csv.
tables = [
    "User",
    "Designation",
    "Course",
    "EmployeeProgress",
    "Skill",
    "UserSkill",
    "DesignationSkill",
    "CourseSkill",
]

# Table name -> DataFrame, built in the same order as `tables`.
transformed_dfs = {
    table_name: pd.read_csv(
        os.path.join(TRANSFORMED_DATA_DIR, f'transformed_{table_name}.csv')
    )
    for table_name in tables
}

In [4]:
# Build the gold-layer dimension tables.
# Each dimension is the corresponding silver table, exposed under a dim_* name
# (no columns are added, dropped, or renamed here).
dim_users = transformed_dfs["User"]
dim_designations = transformed_dfs["Designation"]
dim_courses = transformed_dfs["Course"]
dim_skills = transformed_dfs["Skill"]

# Pair each dimension with its output file so the write logic lives in one place.
dimension_outputs = [
    (dim_users, 'dim_users.csv'),
    (dim_designations, 'dim_designations.csv'),
    (dim_courses, 'dim_courses.csv'),
    (dim_skills, 'dim_skills.csv'),
]

# Persist each dimension to the gold directory without the pandas index column.
for dimension_df, csv_name in dimension_outputs:
    dimension_df.to_csv(os.path.join(GOLD_DATA_DIR, csv_name), index=False)

print("Dimension tables created and saved.")

Dimension tables created and saved.


In [5]:
# Build the fact table.
# Grain: one row per EmployeeProgress record (a user's progress on a course),
# enriched with the user's `username` and the course's `title` for readability.
#
# Both joins are left joins so progress rows without a matching user/course are
# kept (their username/title will be NaN). `validate="many_to_one"` makes pandas
# raise a MergeError if a dimension key (`id` / `course_id`) is duplicated;
# without it, a duplicate key would silently multiply fact rows and inflate
# every downstream aggregate.
fact_employee_progress = (
    transformed_dfs["EmployeeProgress"]
    .merge(
        dim_users[['id', 'username']],
        left_on='userId',
        right_on='id',
        how='left',
        suffixes=('', '_user'),  # fact columns keep their names on a clash
        validate='many_to_one',
    )
    .merge(
        dim_courses[['course_id', 'title']],
        left_on='courseId',
        right_on='course_id',
        how='left',
        suffixes=('', '_course'),
        validate='many_to_one',
    )
)

# Keep only the fact-table columns; this also drops the helper join keys
# that the merges brought in from the dimension side.
fact_employee_progress = fact_employee_progress[
    [
        'userId',
        'courseId',
        'progressStatus',
        'last_updated',
        'modulesCompleted',
        'username',
        'title',
    ]
]

# Save the fact table to the gold directory without the pandas index column.
fact_employee_progress.to_csv(os.path.join(GOLD_DATA_DIR, 'fact_employee_progress.csv'), index=False)

print("Fact table created and saved.")


Fact table created and saved.
