In [4]:
import pandas as pd
import os

In [5]:
# Notebook configuration. Both paths are relative, so they resolve against the
# working directory the kernel was started in.
RAW_DATA_DIR = "../../data/bronze"  # Directory the raw CSV files are read from
TRANSFORMED_DATA_DIR = "../../data/silver"  # Directory the cleaned CSV files are written to
# Create the output directory up front; exist_ok=True keeps this cell safe to re-run.
os.makedirs(TRANSFORMED_DATA_DIR, exist_ok=True)

In [6]:
def count_nulls(file_path):
    """Count missing values per column of a CSV file.

    Parameters
    ----------
    file_path : path to the CSV file; passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.Series
        Indexed by column name, holding the number of missing values for
        every column that has at least one. Empty when nothing is missing.
    """
    missing_per_column = pd.read_csv(file_path).isna().sum()
    has_missing = missing_per_column > 0
    return missing_per_column.loc[has_missing]

In [7]:
# Raw tables to scan for missing values; each is read from <RAW_DATA_DIR>/<name>.csv.
tables = [
    "User",
    "Designation",
    "Course",
    "EmployeeProgress",
    "Skill",
    "UserSkill",
    "DesignationSkill",
    "CourseSkill",
]

# Table name -> Series of null counts for the columns that contain nulls.
null_counts_dict = {
    table: count_nulls(os.path.join(RAW_DATA_DIR, f'{table}.csv'))
    for table in tables
}

# Report only after every table has been read.
for table, counts in null_counts_dict.items():
    print(f"Null counts for {table}:\n{counts}\n")

Null counts for User:
Series([], dtype: int64)

Null counts for Designation:
Series([], dtype: int64)

Null counts for Course:
Series([], dtype: int64)

Null counts for EmployeeProgress:
certificateProof    14
dtype: int64

Null counts for Skill:
Series([], dtype: int64)

Null counts for UserSkill:
Series([], dtype: int64)

Null counts for DesignationSkill:
Series([], dtype: int64)

Null counts for CourseSkill:
Series([], dtype: int64)



In [8]:
# Required columns for each table: (table name, raw CSV file, columns to keep).
_table_specs = [
    ("User", "User.csv",
     ["id", "username", "email", "role", "designationId", "createdAt", "updatedAt"]),
    ("Designation", "Designation.csv",
     ["id", "name"]),
    ("Course", "Course.csv",
     ["course_id", "title", "difficulty", "language", "totalTime", "totalModules", "created_at", "updated_at"]),
    ("EmployeeProgress", "EmployeeProgress.csv",
     ["userId", "courseId", "progressStatus", "last_updated", "modulesCompleted"]),
    ("Skill", "Skill.csv",
     ["id", "name"]),
    ("UserSkill", "UserSkill.csv",
     ["userId", "skillId", "level"]),
    ("DesignationSkill", "DesignationSkill.csv",
     ["designationId", "skillId"]),
    ("CourseSkill", "CourseSkill.csv",
     ["courseId", "skillId", "level"]),
]

# Table name -> {"file": raw CSV file name, "columns": columns to retain}.
tables_reqd_columns = {
    table_name: {"file": file_name, "columns": column_names}
    for table_name, file_name, column_names in _table_specs
}


In [9]:
# Build one cleaned DataFrame per table: keep only the required columns,
# normalise dtypes, and (for User only) drop rows that contain nulls.
cleaned_dfs = {}

for model, info in tables_reqd_columns.items():
    file_path = os.path.join(RAW_DATA_DIR, info["file"])
    df = pd.read_csv(file_path)

    # Select required columns; .copy() detaches the result from the raw frame.
    df_cleaned = df[info["columns"]].copy()

    # Convert data types.
    # Assigning through `df_cleaned.loc[:, col] = ...` writes the new values into
    # the existing column in place (pandas 2.x), so the column keeps its old
    # dtype and the conversion silently has no effect. Collect the target dtypes
    # and apply them with `astype`, which returns a frame whose columns really
    # carry the new dtypes.
    dtype_map = {}
    for col in df_cleaned.columns:
        if df_cleaned[col].dtype == 'object':
            dtype_map[col] = 'category'
        elif df_cleaned[col].dtype == 'int64':
            dtype_map[col] = 'Int64'  # pandas nullable integer dtype
    df_cleaned = df_cleaned.astype(dtype_map)

    # Drop rows with nulls for User table
    if model == "User":
        df_cleaned = df_cleaned.dropna()

    cleaned_dfs[model] = df_cleaned


In [10]:
# Write every cleaned table to the output directory as prep_<table>.csv,
# without the DataFrame index.
for model, df in cleaned_dfs.items():
    df.to_csv(
        os.path.join(TRANSFORMED_DATA_DIR, f"prep_{model}.csv"),
        index=False,
    )

print("All data cleaned and saved to the transformed folder!")


All data cleaned and saved to the transformed folder!
