In [1]:
import pandas as pd
import os


In [2]:
TRANSFORMED_DATA_DIR = "../../data/silver"

tables = [
    "User", "Designation", "Course", "EmployeeProgress", 
    "Skill", "UserSkill", "DesignationSkill", "CourseSkill"
]

transformed_dfs = {}

for table in tables:
    file_path = os.path.join(TRANSFORMED_DATA_DIR, f'prep_{table}.csv')
    transformed_dfs[table] = pd.read_csv(file_path)


In [5]:
# Merge User with EmployeeProgress
report_df = transformed_dfs["User"].merge(
    transformed_dfs["EmployeeProgress"],
    left_on="id",
    right_on="userId",
    how="left"
)

# Merge with Designation
report_df = report_df.merge(
    transformed_dfs["Designation"],
    left_on="designationId",
    right_on="id",
    how="left",
    suffixes=('', '_designation')
)

# Merge with Course
report_df = report_df.merge(
    transformed_dfs["Course"],
    left_on="courseId",
    right_on="course_id",
    how="left",
    suffixes=('', '_course')
)

# Merge with UserSkill
report_df = report_df.merge(
    transformed_dfs["UserSkill"],
    left_on="id",
    right_on="userId",
    how="left",
    suffixes=('', '_userSkill')
)

# Merge with Skill
report_df = report_df.merge(
    transformed_dfs["Skill"],
    left_on="skillId",
    right_on="id",
    how="left",
    suffixes=('', '_skill')
)

# Merge with DesignationSkill
report_df = report_df.merge(
    transformed_dfs["DesignationSkill"],
    left_on="designationId",
    right_on="designationId",
    how="left",
    suffixes=('', '_designationSkill')
)

# Merge with CourseSkill
report_df = report_df.merge(
    transformed_dfs["CourseSkill"],
    left_on="courseId",
    right_on="courseId",
    how="left",
    suffixes=('', '_courseSkill')
)
report_df.columns

Index(['id', 'username', 'designationId', 'createdAt', 'userId', 'courseId',
       'progressStatus', 'last_updated', 'modulesCompleted', 'has_certificate',
       'id_designation', 'name', 'course_id', 'title', 'difficulty',
       'language', 'totalTime', 'totalModules', 'created_at',
       'userId_userSkill', 'skillId', 'level', 'id_skill', 'name_skill',
       'skillId_designationSkill', 'skillId_courseSkill', 'level_courseSkill'],
      dtype='object')

In [78]:
rename_mapping = {
    'id': 'user_id',
    'username': 'username',
    'designationId': 'designation_id',
    'createdAt': 'created_at',
    'userId': 'user_id_progress',
    'courseId': 'course_id_progress',
    'progressStatus': 'progress_status',
    'last_updated': 'last_updated',
    'modulesCompleted': 'modules_completed',
    'has_certificate': 'has_certificate',
    'id_designation': 'id_designation',
    'name': 'designation_name',
    'course_id': 'course_id',
    'title': 'course_title',
    'difficulty': 'course_difficulty',
    'language': 'course_language',
    'totalTime': 'course_total_time',
    'totalModules': 'course_total_modules',
    'created_at': 'course_created_at',
    'userId_userSkill': 'user_id_skill',
    'skillId': 'skill_id',
    'level': 'skill_level',
    'id_skill': 'skill_id',
    'name_skill': 'skill_name',
    'skillId_designationSkill': 'skill_id_designation',
    'skillId_courseSkill': 'skill_id_course',
    'level_courseSkill': 'course_skill_level'
}


In [80]:
report_df.rename(columns=rename_mapping, inplace=True)
report_df.columns

Index(['user_id', 'username', 'designation_id', 'course_created_at',
       'user_id_progress', 'course_id_progress', 'progress_status',
       'last_updated', 'modules_completed', 'has_certificate',
       'designation_id', 'designation_name', 'course_id', 'course_title',
       'course_difficulty', 'course_language', 'course_total_time',
       'course_total_modules', 'course_created_at', 'user_id_skill',
       'skill_id', 'skill_level', 'skill_id', 'skill_name',
       'skill_id_designation', 'skill_id_course', 'course_skill_level'],
      dtype='object')

In [84]:
report_df.rename(columns=rename_mapping, inplace=True)
report_df.head()
report_df_columns = [
     'username',  'progress_status',
    'last_updated', 'modules_completed', 
    'designation_name', 'course_title',
    'course_difficulty', 'course_language', 
    'course_total_time', 'course_total_modules', 
    'skill_level', 'skill_name',
    'course_skill_level'
]


# Filter the final report to keep only the defined columns
report_df_report = report_df[report_df_columns]

In [85]:
report_df_report.columns

Index(['username', 'progress_status', 'last_updated', 'modules_completed',
       'designation_name', 'course_title', 'course_difficulty',
       'course_language', 'course_total_time', 'course_total_modules',
       'skill_level', 'skill_name', 'course_skill_level'],
      dtype='object')

In [89]:
report_df_report.head(100)

Unnamed: 0,username,progress_status,last_updated,modules_completed,designation_name,course_title,course_difficulty,course_language,course_total_time,course_total_modules,skill_level,skill_name,course_skill_level
0,Allison Chan,in_progress,2024-01-18 01:59:25.000,6.0,IT Specialist,Programmable even-keeled process improvement,Intermediate,English,480.0,10.0,Advanced,DH+,Intermediate
1,Allison Chan,in_progress,2024-01-18 01:59:25.000,6.0,IT Specialist,Programmable even-keeled process improvement,Intermediate,English,480.0,10.0,Advanced,DH+,Advanced
2,Allison Chan,in_progress,2024-01-18 01:59:25.000,6.0,IT Specialist,Programmable even-keeled process improvement,Intermediate,English,480.0,10.0,Advanced,DH+,Beginner
3,Allison Chan,in_progress,2024-01-18 01:59:25.000,6.0,IT Specialist,Programmable even-keeled process improvement,Intermediate,English,480.0,10.0,Advanced,DH+,Intermediate
4,Allison Chan,in_progress,2024-01-18 01:59:25.000,6.0,IT Specialist,Programmable even-keeled process improvement,Intermediate,English,480.0,10.0,Advanced,DH+,Intermediate
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,Allison Chan,in_progress,2024-01-18 01:59:25.000,6.0,IT Specialist,Programmable even-keeled process improvement,Intermediate,English,480.0,10.0,Beginner,MBAL,Intermediate
96,Allison Chan,in_progress,2024-01-18 01:59:25.000,6.0,IT Specialist,Programmable even-keeled process improvement,Intermediate,English,480.0,10.0,Beginner,MBAL,Intermediate
97,Allison Chan,in_progress,2024-01-18 01:59:25.000,6.0,IT Specialist,Programmable even-keeled process improvement,Intermediate,English,480.0,10.0,Beginner,MBAL,Advanced
98,Allison Chan,in_progress,2024-01-18 01:59:25.000,6.0,IT Specialist,Programmable even-keeled process improvement,Intermediate,English,480.0,10.0,Beginner,MBAL,Beginner


SyntaxError: invalid syntax (3730208583.py, line 1)