In [62]:
import os
import pandas as pd
import pyodbc
from datetime import datetime
from decimal import Decimal

In [63]:
# Define MSSQL connection details
# HOME MACHINE = DESKTOP-CMTGLLQ
# CORP MACHINE = JM-DKT-033
# The server defaults to the corp machine. Set the JLEARN_SQL_SERVER
# environment variable to target another host (e.g. the home machine)
# without editing this cell.
sql_server = os.environ.get('JLEARN_SQL_SERVER', 'JM-DKT-033')
connection_string = (
    'DRIVER={SQL Server};'
    f'SERVER={sql_server};'
    'DATABASE=JLEARN;trusted_connection=YES'
)

# Connect to the database
conn = pyodbc.connect(connection_string)
# Module-level cursor; it is closed together with the connection in the last cell.
cursor = conn.cursor()

In [64]:
# Create directories for the Medallion architecture
base_dir = "warehouse"

# exist_ok=True makes creation idempotent, so the cell can be re-run safely
# and there is no gap between "check" and "create".
os.makedirs(base_dir, exist_ok=True)

def create_folders(layers=('raw', 'prep', 'mart')):
    """Create one sub-folder of ``base_dir`` for every layer name.

    Parameters
    ----------
    layers : iterable of str, optional
        Folder names to create under ``base_dir``. Defaults to the three
        layers used by this notebook: 'raw', 'prep' and 'mart'.

    Folders that already exist are left untouched. ``FileExistsError`` is
    raised if a path exists but is not a directory.
    """
    for layer in layers:
        os.makedirs(os.path.join(base_dir, layer), exist_ok=True)

create_folders()


In [65]:
# # PREPARING DATABASE
# def clean_db():
#     schemas = ['raw','prep','mart']
#     tables = ['user_courses','users','courses']
#     def create_schema(schema):
#         query = f"select schema_id('{schema}');"
#         cursor.execute(query)
#         res = tuple(cursor.fetchall()[0])[0]
#         if(res == None):
#             query = f"create schema {schema};"
#             cursor.execute(query)
#             print(f"{schema} schema created")
#             conn.commit()

#     def drop_tables(schema, table):
#         query = f"DROP TABLE IF EXISTS {schema}.{table};"
#         cursor.execute(query)
#         conn.commit()

#     def create_users_table(schema):
#         query = f"""CREATE TABLE {schema}.users (
#             UserID       INT            PRIMARY KEY,
#             UserName     NVARCHAR(100),
#             FullName     NVARCHAR(100),
#             Email        NVARCHAR(255) UNIQUE,
#             PasswordHash NVARCHAR(255),
#             Role         NVARCHAR(50)   DEFAULT 'employee',
#             RegisteredAt DATETIME       DEFAULT GETDATE()
#         )"""
#         cursor.execute(query)
#         conn.commit()

#     def create_courses_table(schema):
#         query = f"""CREATE TABLE {schema}.courses (
#             course_id           INT            PRIMARY KEY,
#             course_title        NVARCHAR(100),
#             num_subscribers     INT,
#             num_reviews         SMALLINT,
#             num_lectures        SMALLINT,
#             level               NVARCHAR(50),
#             content_duration    FLOAT,
#             published_timestamp NVARCHAR(50),
#             subject             NVARCHAR(50)
#         )"""
#         cursor.execute(query)
#         conn.commit()

#     def create_user_courses_table(schema):
#         query = f"""CREATE TABLE {schema}.user_courses (
#             user_course_id INT       PRIMARY KEY,
#             user_id        INT,
#             course_id      INT,
#             status         VARCHAR(50),
#             progress       DECIMAL(5, 2) DEFAULT 0.00,
#             enrolled_at    DATETIME      DEFAULT GETDATE(),
#             completed_at   DATETIME,
#             score          DECIMAL(5, 2),
#             CONSTRAINT FK_user_courses_users FOREIGN KEY (user_id) REFERENCES raw.users(UserID),
#             CONSTRAINT FK_user_courses_courses FOREIGN KEY (course_id) REFERENCES raw.courses(course_id)
#         )"""
#         cursor.execute(query)
#         conn.commit()

#     for schema in schemas:
#         create_schema(schema)
#         for table in tables:
#             drop_tables(schema, table)

#     create_users_table('raw')
#     create_courses_table('raw')
#     create_user_courses_table('raw')

In [66]:
# Fetch data using pyodbc and convert to pandas DataFrame
def fetch_data(query, conn):
    """Run ``query`` on ``conn`` and return the whole result set as a DataFrame.

    Parameters
    ----------
    query : str
        SQL statement that produces a result set.
    conn : DB-API connection
        Open connection; only ``conn.cursor()`` is used. The connection
        itself is left open for the caller.

    Returns
    -------
    pandas.DataFrame
        One row per fetched record; column names are taken from the first
        field of each entry in ``cursor.description``.
    """
    cursor = conn.cursor()
    try:
        cursor.execute(query)
        columns = [column[0] for column in cursor.description]
        # Driver row objects are converted to plain tuples so pandas sees
        # ordinary sequences.
        rows = [tuple(row) for row in cursor.fetchall()]
    finally:
        # Always release the cursor, even when execute/fetch raises.
        cursor.close()
    return pd.DataFrame(rows, columns=columns)


In [67]:
# Utility function to save data to CSV (the SQL Server write-back is currently commented out)

def save_data(df, table_name, stage):
    """Write ``df`` to ``<base_dir>/<stage>/<table_name>.csv`` without the index.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to export. If it has a ``PasswordHash`` column, that column is
        dropped from the exported copy; the caller's frame is not modified.
    table_name : str
        File name (without extension) of the CSV.
    stage : str
        Sub-folder of ``base_dir`` to write into (the notebook uses 'raw',
        'prep' and 'mart').

    Only the CSV export is active; nothing is written to SQL Server.
    """
    # Save to CSV
    if "PasswordHash" in df.columns:
        df = df.drop("PasswordHash", axis=1)

    # Join each component separately so the platform's own separator is used.
    # A hard-coded "\\" only works on Windows and disagrees with the
    # "/"-based paths used elsewhere in this notebook.
    stage_dir = os.path.join(base_dir, stage)
    # Make sure the target folder exists even if create_folders() was not run.
    os.makedirs(stage_dir, exist_ok=True)
    file_path = os.path.join(stage_dir, f'{table_name}.csv')
    df.to_csv(file_path, index=False)

    # Save to SQL Server

    # df = df.applymap(lambda x: x.strftime('%Y-%m-%d %H:%M:%S') if isinstance(x, pd.Timestamp) else x)
    # df = df.applymap(lambda x: float(x) if isinstance(x, Decimal) else x)
    # df = df.applymap(lambda x: None if isinstance(x, pd.notna) else x)
    # df = df.applymap(lambda x: None if (isinstance(x, Decimal) and (x.is_nan() or x == Decimal('NaN'))) else float(x) if isinstance(x, Decimal) else x)
    # df = df.replace("'","", regex=True)

    
    # conn_str = f"INSERT INTO {schema}.{table_name} ({', '.join(df.columns)}) VALUES "
    # values = ', '.join([str(tuple(row)) for row in df.values])
    # query = conn_str + values
    # print(values)
    # with conn.cursor() as cursor:
    #     cursor.execute(query)
    #     conn.commit()

In [68]:
# DON'T FORGET TO CREATE RESPECTIVE TABLES IN EACH SCHEMA


# ----------------------------------------------
# 1. RAW Layer: Raw data ingestion from tables
# ----------------------------------------------
# Ingest raw data from SQL Server
def raw_ingestion():
    """Copy the users, courses and user_courses tables into the raw layer.

    Each table is read in full with ``SELECT *`` through ``fetch_data`` on the
    module-level ``conn`` and then handed to ``save_data`` for the 'raw' stage.
    """
    table_names = ('users', 'courses', 'user_courses')

    # Fetch every table first; nothing is saved until all reads succeeded.
    frames = {
        name: fetch_data(f"SELECT * FROM {name}", conn)
        for name in table_names
    }

    # Persist in the same order the tables were fetched.
    for name, frame in frames.items():
        save_data(frame, name, 'raw')

# clean_db()
raw_ingestion()

In [69]:
# ----------------------------------------------
# 2. PREP Layer: Cleansing and Enrichment
# ----------------------------------------------
def prep_transformation():
    """Build the prep layer from the raw CSV files.

    For each of users, courses and user_courses: read
    ``<base_dir>/raw/<table>.csv``, drop every row that contains at least one
    missing value, and save the result under the 'prep' stage.
    """
    table_names = ('users', 'courses', 'user_courses')

    prep_frames = {}
    for name in table_names:
        # Read from base_dir rather than a hard-coded "warehouse/..." path so
        # the reader stays in sync with where save_data() writes.
        raw_df = pd.read_csv(os.path.join(base_dir, 'raw', f'{name}.csv'))
        # NOTE(review): dropna() removes a row when ANY column is null. If
        # user_courses rows can legitimately have empty completed_at / score
        # values (e.g. unfinished enrollments), they are discarded here --
        # confirm this is intended.
        prep_frames[name] = raw_df.dropna()

    for name, frame in prep_frames.items():
        save_data(frame, name, 'prep')


prep_transformation()

In [70]:
# ----------------------------------------------
# 3. MART Layer: Aggregation and Analysis
# ----------------------------------------------
def mart_transformation():
    """Build the mart 'report' table by joining the three prep tables.

    Reads users, user_courses and courses from ``<base_dir>/prep``, inner-joins
    users to user_courses (UserID = user_id) and the result to courses
    (course_id), drops the duplicate ``user_id`` key, and saves the joined
    frame as 'report' under the 'mart' stage.

    No aggregation is performed yet; the query below records the intended one.
    """
    # Intended aggregation (not implemented yet): course completion statistics
    # gold_query = """
    # SELECT c.course_title, COUNT(uc.user_course_id) as num_users, AVG(uc.progress) as avg_progress
    # FROM user_courses uc
    # JOIN courses c ON uc.course_id = c.course_id
    # WHERE uc.progress = 100
    # GROUP BY c.course_title
    # """

    # Read from base_dir rather than a hard-coded "warehouse/..." path so the
    # reader stays in sync with where save_data() writes.
    prep_dir = os.path.join(base_dir, 'prep')
    prep_courses_df = pd.read_csv(os.path.join(prep_dir, 'courses.csv'))
    prep_user_courses_df = pd.read_csv(os.path.join(prep_dir, 'user_courses.csv'))
    prep_users_df = pd.read_csv(os.path.join(prep_dir, 'users.csv'))

    # Step 1: attach each enrollment to its user
    user_enrollments_df = pd.merge(
        prep_users_df,
        prep_user_courses_df,
        left_on='UserID',
        right_on='user_id',
        how='inner',
    )

    # Step 2: attach course details; the key has the same name on both sides
    report_df = pd.merge(user_enrollments_df, prep_courses_df, on='course_id', how='inner')

    # user_id carries the same values as UserID after the step-1 join
    report_df = report_df.drop(columns='user_id')

    # Save the joined report
    save_data(report_df, 'report', 'mart')

mart_transformation()



In [71]:
# Release database resources: the module-level cursor first, then the
# connection it was created from.
for db_resource in (cursor, conn):
    db_resource.close()