In [None]:
# Prerequisites: pip install pandas sqlalchemy python-dotenv psycopg2-binary

RAW

In [94]:
# Standard library
import os

# Third-party
import pandas as pd
from dotenv import load_dotenv
from sqlalchemy import create_engine

# Load environment variables through python-dotenv before any os.getenv lookups.
load_dotenv()


True

In [95]:
# Connection string is read from the environment rather than hard-coded.
DATABASE_URL = os.getenv("PG_DATABASE_URL")
if not DATABASE_URL:
    # Fail early with an actionable message instead of an opaque create_engine error.
    raise RuntimeError(
        "PG_DATABASE_URL is not set; define it in the environment or in the .env file"
    )

engine = create_engine(DATABASE_URL)
# Show which database is targeted, with the password masked so it never
# ends up in the saved notebook output.
print(engine.url.render_as_string(hide_password=True))

# Source tables to export; each one becomes "<table>.csv" in the raw directory.
tables = ["User", "Designation", "Course", "EmployeeProgress", "Skill", "UserSkill", "DesignationSkill", "CourseSkill"]

# Landing directory for the raw CSV exports.
raw_data_dir = "../staging/raw"
os.makedirs(raw_data_dir, exist_ok=True)

postgresql://postgres:***@localhost:5432/course_management


In [96]:
# Dump every listed table to its own CSV file inside raw_data_dir.
for table in tables:
    raw_csv_path = os.path.join(raw_data_dir, f'{table}.csv')
    df = pd.read_sql_table(table, con=engine)
    df.to_csv(raw_csv_path, index=False)

print("Raw data extraction complete!")

Raw data extraction complete!


Standardised

In [97]:
def count_nulls(file_path):
    """Count missing values per column of a CSV file.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file to inspect.

    Returns
    -------
    pandas.Series
        Number of null values indexed by column name, restricted to the
        columns that contain at least one null (empty when there are none).
    """
    nulls_per_column = pd.read_csv(file_path).isna().sum()
    has_nulls = nulls_per_column.gt(0)
    return nulls_per_column.loc[has_nulls]


In [98]:
# Map each table name to the null counts found in its raw CSV export.
null_counts_dict = {
    table: count_nulls(os.path.join(raw_data_dir, f'{table}.csv'))
    for table in tables
}

# Print one report per table, in the order the tables were listed.
for table, counts in null_counts_dict.items():
    print(f"Null counts for {table}:\n{counts}\n")


Null counts for User:
designationId    1
dtype: int64

Null counts for Designation:
Series([], dtype: int64)

Null counts for Course:
Series([], dtype: int64)

Null counts for EmployeeProgress:
certificateProof    1739
dtype: int64

Null counts for Skill:
Series([], dtype: int64)

Null counts for UserSkill:
Series([], dtype: int64)

Null counts for DesignationSkill:
Series([], dtype: int64)

Null counts for CourseSkill:
Series([], dtype: int64)



In [102]:
# Columns to keep for each table when building the standardised layer.
_REQUIRED_COLUMNS = {
    "User": ["id","username", "email", "role", "designationId", "createdAt", "updatedAt"],
    "Designation": ["id", "name"],
    "Course": ["course_id","title", "difficulty", "language","totalTime", "totalModules", "created_at", "updated_at"],
    "EmployeeProgress": ["userId", "courseId", "progressStatus", "last_updated", "modulesCompleted"],
    "Skill": ["id","name"],
    "UserSkill": ["userId", "skillId", "level"],
    "DesignationSkill": ["designationId", "skillId"],
    "CourseSkill": ["courseId", "skillId", "level"],
}

# Each entry pairs a table's raw CSV file name ("<table>.csv") with the
# columns to retain from it.
tables_reqd_columns = {
    model: {"file": f"{model}.csv", "columns": columns}
    for model, columns in _REQUIRED_COLUMNS.items()
}

In [101]:
# Output directory for the standardised CSV files; created up front
# (exist_ok=True makes re-running this cell harmless).
standardised_data_dir = '../staging/standardised/'
os.makedirs(standardised_data_dir, exist_ok=True)

In [103]:
# Standardised frames keyed by model name.
cleaned_dfs = {}

for model, info in tables_reqd_columns.items():
    file_path = os.path.join(raw_data_dir, info["file"])
    df = pd.read_csv(file_path)

    # Keep only the required columns; copy so the raw frame is never mutated.
    df_cleaned = df[info["columns"]].copy()

    for col in df_cleaned.columns:
        # Assign the converted column directly. `df.loc[:, col] = ...` writes
        # the values into the existing array on pandas 2.x, which keeps the
        # old dtype and silently discards the conversion.
        if df_cleaned[col].dtype == 'object':
            df_cleaned[col] = df_cleaned[col].astype('category')
        elif df_cleaned[col].dtype == 'int64':
            df_cleaned[col] = df_cleaned[col].astype('Int64')
        # NOTE(review): numeric columns that contain nulls are parsed by
        # read_csv as float64, so they never reach the int64 branch (the null
        # report shows User.designationId has one null) — confirm whether such
        # id columns should be cast to Int64 as well.

    if model == "User":
        # Drop User rows with a missing value in any retained column.
        df_cleaned = df_cleaned.dropna()

    cleaned_dfs[model] = df_cleaned

# Persist every cleaned frame as "prep_<model>.csv" in the standardised directory.
for model, df in cleaned_dfs.items():
    standardized_file_path = os.path.join(standardised_data_dir, f"prep_{model}.csv")
    df.to_csv(standardized_file_path, index=False)

print("All data cleaned and saved to the standardized folder!")

All data cleaned and saved to the standardized folder!
