In [37]:
import pandas as pd
import os

Setting Up Data Directories

In [38]:
# Both paths are relative, so they resolve against the kernel's current working directory.
RAW_DATA_DIR = "../../data/bronze"  # Input: raw CSV exports (read as raw_<Table>.csv)
TRANSFORMED_DATA_DIR = "../../data/silver"  # Output: cleaned CSVs (written as prep_<Table>.csv)
# Create the output directory up front; exist_ok=True makes re-running this cell safe.
os.makedirs(TRANSFORMED_DATA_DIR, exist_ok=True)

Check the number of nulls and duplicates

In [39]:
def count_nulls(file_path):
    """Return the number of missing values in each column of a CSV file.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file, loaded with ``pd.read_csv``.

    Returns
    -------
    pandas.Series
        Missing-value count per column, indexed by column name.
    """
    table = pd.read_csv(file_path)
    missing_mask = table.isna()  # isna performs the same check as isnull
    return missing_mask.sum()

def count_duplicates(file_path):
    """Return how many rows of a CSV file repeat an earlier row.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file, loaded with ``pd.read_csv``.

    Returns
    -------
    integer
        Number of rows flagged by ``DataFrame.duplicated()`` with its default
        settings, i.e. the first occurrence of each row is not counted.
    """
    rows = pd.read_csv(file_path)
    repeated = rows.duplicated()  # True for every repeat after the first occurrence
    return repeated.sum()

In [41]:
# Tables whose raw CSV exports (raw_<Table>.csv in RAW_DATA_DIR) are profiled below.
tables = [
    "User", "Designation", "Course", "EmployeeProgress",
    "Skill", "UserSkill", "DesignationSkill", "CourseSkill"
]

null_counts_dict = {}       # table name -> per-column null counts (Series)
duplicate_counts_dict = {}  # table name -> number of fully duplicated rows

for table in tables:
    file_path = os.path.join(RAW_DATA_DIR, f'raw_{table}.csv')

    # Read each file once and derive both metrics from the same frame,
    # instead of parsing the CSV separately for each metric.
    raw_df = pd.read_csv(file_path)
    null_counts_dict[table] = raw_df.isnull().sum()
    duplicate_counts_dict[table] = raw_df.duplicated().sum()

# Report all null counts first, then all duplicate counts.
for table, counts in null_counts_dict.items():
    print(f"Null counts for {table}:\n{counts}\n")

for table, count in duplicate_counts_dict.items():
    print(f"Duplicate counts for {table}: {count}\n")

Null counts for User:
id               0
username         0
passwordHash     0
email            0
gender           0
role             0
designationId    1
phoneNumber      0
createdAt        0
updatedAt        0
dtype: int64

Null counts for Designation:
id      0
name    0
dtype: int64

Null counts for Course:
course_id       0
title           0
url             0
shortIntro      0
difficulty      0
language        0
totalTime       0
totalModules    0
created_at      0
updated_at      0
dtype: int64

Null counts for EmployeeProgress:
progress_id            0
userId                 0
courseId               0
progressStatus         0
last_updated           0
modulesCompleted       0
certificateProof    1739
dtype: int64

Null counts for Skill:
id      0
name    0
dtype: int64

Null counts for UserSkill:
id         0
userId     0
skillId    0
level      0
dtype: int64

Null counts for DesignationSkill:
id               0
designationId    0
skillId          0
dtype: int64

Null counts for

Define required columns for each table

In [42]:
# Columns to keep from each raw table, keyed by table name.
_required_columns_by_table = {
    "User": ["id", "username", "designationId", "createdAt"],
    "Designation": ["id", "name"],
    "Course": [
        "course_id", "title", "difficulty", "language",
        "totalTime", "totalModules", "created_at",
    ],
    "EmployeeProgress": [
        "userId", "courseId", "progressStatus",
        "last_updated", "modulesCompleted", "certificateProof",
    ],
    "Skill": ["id", "name"],
    "UserSkill": ["userId", "skillId", "level"],
    "DesignationSkill": ["designationId", "skillId"],
    "CourseSkill": ["courseId", "skillId", "level"],
}

# Expand into the {"file": ..., "columns": ...} layout used by the preprocessing
# cell; every table's file name is simply "<table>.csv".
tables_reqd_columns = {
    table_name: {"file": f"{table_name}.csv", "columns": list(columns)}
    for table_name, columns in _required_columns_by_table.items()
}


Data Preprocessing: Removing Nulls, Selecting Required Columns, and Type Conversion

In [49]:
# Build one cleaned DataFrame per table: keep only the required columns,
# normalise dtypes, drop incomplete User rows, and keep only the
# EmployeeProgress / UserSkill rows that reference a surviving user.
cleaned_dfs = {}

# Reset on every run so that a value left over from a previous execution of
# this cell can never be used silently. It is filled in when "User" is processed.
valid_user_ids = None

for model, info in tables_reqd_columns.items():
    file_path = os.path.join(RAW_DATA_DIR, f'raw_{info["file"]}')
    df = pd.read_csv(file_path)

    # Work on a copy so the column assignments below do not touch `df`.
    df_cleaned = df[info["columns"]].copy()

    # object -> category, int64 -> nullable Int64; other dtypes are left as read.
    for col in df_cleaned.columns:
        if df_cleaned[col].dtype == 'object':
            df_cleaned[col] = df_cleaned[col].astype('category')
        elif df_cleaned[col].dtype == 'int64':
            df_cleaned[col] = df_cleaned[col].astype('Int64')

    # Only the User table has rows with nulls dropped.
    if model == "User":
        df_cleaned = df_cleaned.dropna()

    # designationId is forced to Int64 even when the loop above skipped it
    # because the column was not read as int64.
    if 'designationId' in df_cleaned.columns:
        df_cleaned['designationId'] = df_cleaned['designationId'].astype('Int64')

    # Ids of the users that survived the null filter.
    if model == "User":
        valid_user_ids = df_cleaned['id'].unique()

    if model == "EmployeeProgress":
        # Replace the certificateProof column with a 0/1 flag for "value present".
        df_cleaned['has_certificate'] = df_cleaned['certificateProof'].notnull().astype(int)
        df_cleaned = df_cleaned.drop(columns=['certificateProof'])

    # Tables keyed by userId keep only rows whose user is still present.
    if model in ("EmployeeProgress", "UserSkill") and 'userId' in df_cleaned.columns:
        if valid_user_ids is None:
            raise RuntimeError(
                f'"User" must be processed before "{model}": '
                'list "User" first in tables_reqd_columns.'
            )
        df_cleaned = df_cleaned[df_cleaned['userId'].isin(valid_user_ids)]

    cleaned_dfs[model] = df_cleaned

Exporting Cleaned Data to CSV

In [50]:
# Write every cleaned table to TRANSFORMED_DATA_DIR as prep_<Table>.csv,
# leaving the DataFrame index out of the file.
for table_name, cleaned_table in cleaned_dfs.items():
    transformed_file_path = os.path.join(
        TRANSFORMED_DATA_DIR, f"prep_{table_name}.csv"
    )
    cleaned_table.to_csv(transformed_file_path, index=False)

print("All data cleaned and saved to the transformed folder!")


All data cleaned and saved to the transformed folder!


In [51]:
# Sanity check: list the columns each cleaned table ended up with.
for model in cleaned_dfs:
    df = cleaned_dfs[model]
    print(model, df.columns)

User Index(['id', 'username', 'designationId', 'createdAt'], dtype='object')
Designation Index(['id', 'name'], dtype='object')
Course Index(['course_id', 'title', 'difficulty', 'language', 'totalTime',
       'totalModules', 'created_at'],
      dtype='object')
EmployeeProgress Index(['userId', 'courseId', 'progressStatus', 'last_updated',
       'modulesCompleted', 'has_certificate'],
      dtype='object')
Skill Index(['id', 'name'], dtype='object')
UserSkill Index(['userId', 'skillId', 'level'], dtype='object')
DesignationSkill Index(['designationId', 'skillId'], dtype='object')
CourseSkill Index(['courseId', 'skillId', 'level'], dtype='object')
