In [20]:
import os
import pandas as pd
import pyodbc
from datetime import datetime
from decimal import Decimal

import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

In [21]:
# Define MSSQL connection details.
# The target server differs per machine (HOME = DESKTOP-CMTGLLQ, CORP = JM-DKT-033),
# so it can be overridden through the JLEARN_SQL_SERVER environment variable instead
# of editing this cell. The defaults reproduce the original corporate settings.
sql_server = os.environ.get("JLEARN_SQL_SERVER", "JM-DKT-033")
sql_database = os.environ.get("JLEARN_SQL_DATABASE", "JLEARN")
connection_string = (
    f"DRIVER={{SQL Server}};SERVER={sql_server};"
    f"DATABASE={sql_database};trusted_connection=YES"
)

# Connect to the database (Windows trusted connection, no credentials in the notebook)
conn = pyodbc.connect(connection_string)
cursor = conn.cursor()

In [22]:
# Create directories for the Medallion architecture
base_dir = "warehouse"


def create_folders(root=None, layers=("raw", "prep", "mart")):
    """Create the warehouse root and one sub-directory per layer.

    Parameters
    ----------
    root : str, optional
        Directory that holds the layers. Defaults to the module-level ``base_dir``.
    layers : iterable of str, optional
        Names of the layer directories to create under ``root``.

    Directories that already exist are left untouched, so the function can be
    called repeatedly.
    """
    root = base_dir if root is None else root
    # exist_ok avoids the check-then-create race of os.path.exists + os.makedirs.
    os.makedirs(root, exist_ok=True)
    for layer in layers:
        os.makedirs(os.path.join(root, layer), exist_ok=True)


create_folders()


In [23]:
# # PREPARING DATABASE
# def clean_db():
#     schemas = ['raw','prep','mart']
#     tables = ['user_courses','users','courses']
#     def create_schema(schema):
#         query = f"select schema_id('{schema}');"
#         cursor.execute(query)
#         res = tuple(cursor.fetchall()[0])[0]
#         if(res == None):
#             query = f"create schema {schema};"
#             cursor.execute(query)
#             print(f"{schema} schema created")
#             conn.commit()

#     def drop_tables(schema, table):
#         query = f"DROP TABLE IF EXISTS {schema}.{table};"
#         cursor.execute(query)
#         conn.commit()

#     def create_users_table(schema):
#         query = f"""CREATE TABLE {schema}.users (
#             UserID       INT            PRIMARY KEY,
#             UserName     NVARCHAR(100),
#             FullName     NVARCHAR(100),
#             Email        NVARCHAR(255) UNIQUE,
#             PasswordHash NVARCHAR(255),
#             Role         NVARCHAR(50)   DEFAULT 'employee',
#             RegisteredAt DATETIME       DEFAULT GETDATE()
#         )"""
#         cursor.execute(query)
#         conn.commit()

#     def create_courses_table(schema):
#         query = f"""CREATE TABLE {schema}.courses (
#             course_id           INT            PRIMARY KEY,
#             course_title        NVARCHAR(100),
#             num_subscribers     INT,
#             num_reviews         SMALLINT,
#             num_lectures        SMALLINT,
#             level               NVARCHAR(50),
#             content_duration    FLOAT,
#             published_timestamp NVARCHAR(50),
#             subject             NVARCHAR(50)
#         )"""
#         cursor.execute(query)
#         conn.commit()

#     def create_user_courses_table(schema):
#         query = f"""CREATE TABLE {schema}.user_courses (
#             user_course_id INT       PRIMARY KEY,
#             user_id        INT,
#             course_id      INT,
#             status         VARCHAR(50),
#             progress       DECIMAL(5, 2) DEFAULT 0.00,
#             enrolled_at    DATETIME      DEFAULT GETDATE(),
#             completed_at   DATETIME,
#             score          DECIMAL(5, 2),
#             CONSTRAINT FK_user_courses_users FOREIGN KEY (user_id) REFERENCES raw.users(UserID),
#             CONSTRAINT FK_user_courses_courses FOREIGN KEY (course_id) REFERENCES raw.courses(course_id)
#         )"""
#         cursor.execute(query)
#         conn.commit()

#     for schema in schemas:
#         create_schema(schema)
#         for table in tables:
#             drop_tables(schema, table)

#     create_users_table('raw')
#     create_courses_table('raw')
#     create_user_courses_table('raw')

In [24]:
# Fetch data using pyodbc and convert to pandas DataFrame
def fetch_data(query, conn):
    """Run ``query`` on ``conn`` and return the result set as a pandas DataFrame.

    Parameters
    ----------
    query : str
        SQL statement to execute.
    conn : DB-API connection
        Open connection exposing ``cursor()`` (e.g. a ``pyodbc`` connection).

    Returns
    -------
    pandas.DataFrame
        One row per fetched record, with columns named after the cursor
        description. A statement that produces no result set yields an empty
        DataFrame instead of raising.
    """
    cursor = conn.cursor()
    try:
        cursor.execute(query)
        # DB-API cursors report description=None when the statement returns no rows
        # (DDL, INSERT, ...); iterating over it would raise TypeError.
        if cursor.description is None:
            return pd.DataFrame()
        columns = [column[0] for column in cursor.description]
        # Row objects are converted to plain tuples so pandas treats them as records.
        rows = [tuple(row) for row in cursor.fetchall()]
    finally:
        # Always release the cursor, even when the query fails.
        cursor.close()
    return pd.DataFrame(rows, columns=columns)


In [25]:
# Utility function to persist a DataFrame as a CSV file inside a warehouse layer.
# TODO: writing the same data back to SQL Server was prototyped here but is not implemented.

def save_data(df, table_name, stage):
    """Write ``df`` to ``<base_dir>/<stage>/<table_name>.csv``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to persist. The caller's frame is not modified.
    table_name : str
        File name (without extension) of the CSV to write.
    stage : str
        Layer directory under ``base_dir`` (e.g. 'raw', 'prep', 'mart').
    """
    # Drop the PasswordHash column, if present, so it is never written to disk.
    if "PasswordHash" in df.columns:
        df = df.drop(columns="PasswordHash")

    # Build the path with os.path.join so it works on every OS and matches the
    # "warehouse/<stage>/<table>.csv" paths used when the files are read back.
    stage_dir = os.path.join(base_dir, stage)
    os.makedirs(stage_dir, exist_ok=True)
    file_path = os.path.join(stage_dir, f'{table_name}.csv')
    df.to_csv(file_path, index=False)

In [26]:
# DON'T FORGET TO CREATE RESPECTIVE TABLES IN EACH SCHEMA


# ----------------------------------------------
# 1. RAW Layer: Raw data ingestion from tables
# ----------------------------------------------
# Ingest raw data from SQL Server
def raw_ingestion():
    """Copy the users, courses and user_courses tables into the raw layer.

    Every table is read through ``fetch_data`` using the notebook-level ``conn``
    and then handed to ``save_data`` with the 'raw' stage.
    """
    queries = {
        'users': "SELECT * FROM users",
        'courses': "SELECT * FROM courses",
        'user_courses': "SELECT * FROM user_courses",
    }

    # Read all tables before writing anything, in the declared order.
    fetched = [(table, fetch_data(sql, conn)) for table, sql in queries.items()]

    # Persist each frame under its table name.
    for table, frame in fetched:
        save_data(frame, table, 'raw')

# clean_db()
raw_ingestion()

In [27]:
# ----------------------------------------------
# 2. PREP Layer: Cleansing and Enrichment
# ----------------------------------------------
def prep_transformation():
    """Cleanse the raw-layer CSVs and write them to the prep layer.

    Reads users, courses and user_courses from ``<base_dir>/raw``, removes every
    row that contains at least one missing value, and saves the result with
    ``save_data`` under the 'prep' stage.
    """
    tables = ('users', 'courses', 'user_courses')

    # Derive paths from base_dir so reads stay in sync with where save_data writes.
    raw_dir = os.path.join(base_dir, 'raw')
    raw_frames = {
        name: pd.read_csv(os.path.join(raw_dir, f'{name}.csv')) for name in tables
    }

    # NOTE(review): dropna() discards a row if ANY column is empty. For user_courses
    # this presumably removes enrollments without completed_at/score (courses still
    # in progress) -- confirm that this is the intended cleansing rule.
    for name in tables:
        save_data(raw_frames[name].dropna(), name, 'prep')


prep_transformation()

In [28]:
# ----------------------------------------------
# 3. MART Layer: Aggregation and Analysis
# ----------------------------------------------
# Per-user course/score sequences; populated by mart_transformation().
raw_ml_input = None

def mart_transformation():
    """Join the prep-layer tables into one report and derive the ML input.

    Side effects
    ------------
    * Writes the joined frame as 'report' into the 'mart' stage via ``save_data``.
    * Sets the module-level ``raw_ml_input`` to a frame with one row per UserID
      holding the lists of course_id and score ordered by enrolled_at.
    """
    global raw_ml_input

    # Derive paths from base_dir so reads stay in sync with where save_data writes.
    prep_dir = os.path.join(base_dir, 'prep')
    prep_courses_df = pd.read_csv(os.path.join(prep_dir, 'courses.csv'))
    prep_user_courses_df = pd.read_csv(os.path.join(prep_dir, 'user_courses.csv'))
    prep_users_df = pd.read_csv(os.path.join(prep_dir, 'users.csv'))

    # Step 1: Attach every enrollment to its user (inner join drops unmatched rows)
    merged_df = pd.merge(prep_users_df, prep_user_courses_df, left_on='UserID', right_on='user_id', how='inner')

    # Step 2: Merge the result with courses on course_id
    final_merged_df = pd.merge(merged_df, prep_courses_df, left_on='course_id', right_on='course_id', how='inner')
    # user_id duplicates UserID after the first join.
    final_merged_df = final_merged_df.drop(columns="user_id")

    # Parse timestamps so the sort below is chronological rather than lexical.
    final_merged_df['enrolled_at'] = pd.to_datetime(final_merged_df['enrolled_at'])

    raw_ml_input = (
        final_merged_df.sort_values(by='enrolled_at')
        .groupby('UserID')
        .agg({'course_id': list, 'score': list})
        .reset_index()
    )

    save_data(final_merged_df, 'report', 'mart')

mart_transformation()



In [29]:
# Release database resources: the cursor first, then the connection it came from.
for _handle in (cursor, conn):
    _handle.close()

In [30]:
# Build the course vocabulary: map every course_id in csv/data.csv to an index
import collections

def build_dataset(words):
    """Build index lookups for the distinct items of ``words``.

    Items are ranked by descending frequency (ties keep first-seen order) and
    numbered from 1 as floats.

    Returns
    -------
    tuple of (dict, dict)
        ``dictionary`` maps item -> index and ``reverse_dictionary`` maps
        index -> item.
    """
    ranked = [token for token, _freq in collections.Counter(words).most_common()]
    dictionary = {token: float(rank) for rank, token in enumerate(ranked, start=1)}
    reverse_dictionary = {index: token for token, index in dictionary.items()}
    return dictionary, reverse_dictionary


# Course catalogue used to derive the vocabulary.
courses_data = pd.read_csv("csv/data.csv")

# course_id -> index and index -> course_id lookups.
courses_dict, reverse_course_dict = build_dataset(courses_data["course_id"])

# One slot per catalogue row plus one extra, since indices start at 1.
vocab_size = 1 + len(courses_data)

In [31]:
# Load the three prep-layer tables.
prep_courses_df = pd.read_csv("warehouse/prep/courses.csv")
prep_user_courses_df = pd.read_csv("warehouse/prep/user_courses.csv")
prep_users_df = pd.read_csv("warehouse/prep/users.csv")

# Step 1: Attach every enrollment to its user
merged_df = prep_users_df.merge(
    prep_user_courses_df,
    how='inner',
    left_on='UserID',
    right_on='user_id',
)

# Step 2: Merge the result with courses on course_id, then drop the duplicate key
final_merged_df = merged_df.merge(
    prep_courses_df,
    how='inner',
    left_on='course_id',
    right_on='course_id',
)
final_merged_df = final_merged_df.drop(columns="user_id")

# Parse enrollment timestamps so the ordering below is chronological.
final_merged_df['enrolled_at'] = pd.to_datetime(final_merged_df['enrolled_at'])

# One row per user: course ids and scores as lists ordered by enrollment time.
data = (
    final_merged_df
    .sort_values(by='enrolled_at')
    .groupby('UserID')
    .agg({'course_id': list, 'score': list})
    .reset_index()
)

In [32]:
# Build next-course training samples from every user's course history.
X = []
y = []

# Replace raw course ids with vocabulary indices.
# NOTE: this overwrites data['course_id'] in place, so this cell cannot be re-run
# without first re-running the cell that builds `data`.
data['course_id'] = data['course_id'].apply(lambda x: [courses_dict[course] for course in x])
for courses in data['course_id']:
    for i in range(1, len(courses)):
        X.append(courses[:i])       # All courses up to i
        y.append(courses[i])        # Next course (vocabulary index)

if not X:
    raise ValueError("No training samples: every user has fewer than two courses")

# Pad sequences to ensure uniform input shape (pad_sequences returns a NumPy array)
max_seq_length = max(len(seq) for seq in X)
X_padded = pad_sequences(X, maxlen=max_seq_length)

# Fix the one-hot width to vocab_size. Left to infer it, to_categorical uses
# max(y) + 1, which only matches the model's Dense(vocab_size) output when the
# highest index happens to occur as a target.
y_encoded = to_categorical(y, num_classes=vocab_size)

In [33]:
# Split data into training and testing sets.
# A fixed random_state makes the split identical on every Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X_padded, y_encoded, test_size=0.2, random_state=42
)

In [34]:
# Define the model
embedding_dim = 8  # Length of the vector learned for each course index

model = Sequential()
# Index 0 is the padding value and is masked out of the sequence.
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, mask_zero=True))
model.add(LSTM(64))
# One probability per vocabulary index.
model.add(Dense(vocab_size, activation='softmax'))

# Compile the model
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [35]:

# Train the model; the held-out split doubles as the validation set.
model.fit(
    x=X_train,
    y=y_train,
    batch_size=2,
    epochs=20,
    validation_data=(X_test, y_test),
)


Epoch 1/20
[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 18ms/step - accuracy: 0.0000e+00 - loss: 4.3951 - val_accuracy: 0.0000e+00 - val_loss: 4.3944
Epoch 2/20
[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.0764 - loss: 4.3736 - val_accuracy: 0.0000e+00 - val_loss: 4.3917
Epoch 3/20
[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.1111 - loss: 4.3419 - val_accuracy: 0.0000e+00 - val_loss: 4.3930
Epoch 4/20
[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.0324 - loss: 4.2575 - val_accuracy: 0.0270 - val_loss: 4.4531
Epoch 5/20
[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.0058 - loss: 4.1454 - val_accuracy: 0.0270 - val_loss: 4.5546
Epoch 6/20
[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.0214 - loss: 3.9741 - val_accuracy: 0.0270 - val_loss: 4.5517
Epoch 7/20
[1m73/73[0

<keras.src.callbacks.history.History at 0x260e5487c50>

In [36]:
# The artifact directory is not created anywhere else in the notebook, so make
# sure it exists before writing the model file.
os.makedirs("ml_artifacts", exist_ok=True)
model.save("ml_artifacts/lstm_v1.keras")