In [1]:
import pandas as pd
import os


In [19]:
# Directory the prepared `prep_<table>.csv` inputs are read from.
TRANSFORMED_DATA_DIR = "../../data/silver"
# Directory the report CSV is written to.
REPORT_DATA_DIR = "../../data/platinum"
# Create the output directory (and any missing parents) up front;
# exist_ok=True makes this a no-op when the directory is already there.
os.makedirs(REPORT_DATA_DIR, exist_ok=True)

In [20]:

# Names of the prepared tables; each one is stored on disk as `prep_<name>.csv`.
tables = [
    "User",
    "Designation",
    "Course",
    "EmployeeProgress",
    "Skill",
    "UserSkill",
    "DesignationSkill",
    "CourseSkill",
]

# Read every prepared table into a DataFrame, keyed by its table name.
transformed_dfs = {
    name: pd.read_csv(os.path.join(TRANSFORMED_DATA_DIR, f"prep_{name}.csv"))
    for name in tables
}


In [21]:
def _skill_names_by(link_table, key_column, list_column):
    """Collect skill names into one list per key.

    Joins ``transformed_dfs[link_table]`` to the Skill table
    (``skillId`` -> ``id``), groups the result by ``key_column`` and returns
    a two-column frame: ``key_column`` plus ``list_column``, where each cell
    is a Python list of the matching ``name`` values.
    """
    with_names = transformed_dfs[link_table].merge(
        transformed_dfs["Skill"], left_on="skillId", right_on="id"
    )
    names_per_key = with_names.groupby(key_column)["name"].apply(list)
    return names_per_key.reset_index(name=list_column)


# One "list of skill names" frame per entity type.
user_skills = _skill_names_by("UserSkill", "userId", "userSkills")
course_skills = _skill_names_by("CourseSkill", "courseId", "courseSkills")
designation_skills = _skill_names_by(
    "DesignationSkill", "designationId", "designationSkills"
)

# User -> Designation. Clashing Designation columns get a "_designation" suffix.
user_designation = transformed_dfs["User"].merge(
    transformed_dfs["Designation"],
    left_on="designationId",
    right_on="id",
    suffixes=("", "_designation"),
)

# ... -> EmployeeProgress (one row per progress record of each user).
user_progress = user_designation.merge(
    transformed_dfs["EmployeeProgress"],
    left_on="id",
    right_on="userId",
    suffixes=("", "_progress"),
)

# ... -> Course details for the course referenced by each progress record.
course_progress = user_progress.merge(
    transformed_dfs["Course"],
    left_on="courseId",
    right_on="course_id",
    suffixes=("", "_course"),
)

# Attach the three skill lists. These are left joins, so rows whose user,
# course or designation has no skills are kept (with a missing value).
final_data = (
    course_progress
    .merge(user_skills, left_on="id", right_on="userId", how="left")
    .merge(course_skills, on="courseId", how="left")
    .merge(designation_skills, on="designationId", how="left")
)

In [22]:
# Inspect the merged column names (including the suffixed duplicates produced
# by the merges) before choosing which columns go into the report.
final_data.columns

Index(['id', 'username', 'designationId', 'createdAt', 'id_designation',
       'name', 'userId_x', 'courseId', 'progressStatus', 'last_updated',
       'modulesCompleted', 'has_certificate', 'course_id', 'title',
       'difficulty', 'language', 'totalTime', 'totalModules', 'created_at',
       'userId_y', 'userSkills', 'courseSkills', 'designationSkills'],
      dtype='object')

In [23]:
# Columns kept for the report, in output order (names as they are after the merges).
final_columns = [
    "id", "courseId", "username", "designationId", "name",
    "title", "difficulty", "language",
    "totalTime", "totalModules", "progressStatus",
    "last_updated", "modulesCompleted",
    "userSkills", "courseSkills", "designationSkills"
]

# Select and rename in one non-mutating chain instead of `rename(inplace=True)`
# on the freshly selected frame: the result is the same, but no derived frame
# is modified in place. Columns not listed in the mapping keep their name.
# Note: this rebinds `final_data`, so re-running this cell without re-running
# the merge cell fails (the pre-rename columns such as "id" no longer exist).
final_data = final_data[final_columns].rename(columns={
    "name": "designationName",
    "title": "courseName",
    "username": "employeeName",
    "last_updated": "lastUpdated",
    "id": "employeeId",
})

final_data.head()

Unnamed: 0,employeeId,courseId,employeeName,designationId,designationName,courseName,difficulty,language,totalTime,totalModules,progressStatus,lastUpdated,modulesCompleted,userSkills,courseSkills,designationSkills
0,2,331,Allison Chan,5,IT Specialist,Programmable even-keeled process improvement,Intermediate,English,480,10,in_progress,2024-01-18 01:59:25.000,6,"[DH+, Life Sciences, HP Server Hardware, MBAL,...","[MBAL, Press Releases, FCoE, Product Development]","[Network Architecture, SQF, SRDS, System Testi..."
1,2,331,Allison Chan,5,IT Specialist,Programmable even-keeled process improvement,Intermediate,English,480,10,completed,2024-07-30 06:51:29.000,10,"[DH+, Life Sciences, HP Server Hardware, MBAL,...","[MBAL, Press Releases, FCoE, Product Development]","[Network Architecture, SQF, SRDS, System Testi..."
2,2,119,Allison Chan,5,IT Specialist,Secured interactive product,Beginner,Tamil,273,8,in_progress,2024-07-18 11:08:39.000,6,"[DH+, Life Sciences, HP Server Hardware, MBAL,...","[HP Procurve, PCB Design, E-on Vue]","[Network Architecture, SQF, SRDS, System Testi..."
3,2,119,Allison Chan,5,IT Specialist,Secured interactive product,Beginner,Tamil,273,8,completed,2024-08-17 06:29:20.000,8,"[DH+, Life Sciences, HP Server Hardware, MBAL,...","[HP Procurve, PCB Design, E-on Vue]","[Network Architecture, SQF, SRDS, System Testi..."
4,3,367,Scott Flores,7,Project Manager,Up-sized multimedia challenge,Advanced,German,79,10,in_progress,2023-10-19 02:23:32.000,9,"[Rhino 3D, Winning Others Over, Military Logis...","[Winning Others Over, Military Logistics, SS7,...","[Karaoke, SMO, Educational Leadership, Tax Acc..."


In [24]:
# Write the report frame to the report directory as CSV, without the row index.
# NOTE(review): the *Skills columns hold Python lists, which presumably end up
# as their string form in the CSV; confirm downstream readers expect that.
report_path = os.path.join(REPORT_DATA_DIR, 'report_data.csv')
final_data.to_csv(report_path, index=False)