# Business Objective: Online Course Recommendation System
## Problem Statement
The goal of this project is to build an online course recommendation system that suggests relevant courses to learners based on their interests, past enrollments, and engagement levels. The dataset includes course ratings, instructor information, previous learning history, study material availability, and certification offerings, making it suitable for recommendation models using collaborative filtering, content-based filtering, or hybrid approaches.

## Variable Descriptions
    Variable Name -- Data Type -- Description
    user_id -- Integer -- Unique identifier for each learner.
    course_id -- Integer -- Unique identifier for each online course.
    course_name -- String -- The name of the online course.
    instructor -- String -- The name of the instructor teaching the course.
    course_duration_hours -- Float (5.0 - 100.0) -- The duration of the course in hours.
    certification_offered -- String (Yes/No) -- Indicates whether the course provides a certification upon completion.
    difficulty_level -- String -- The difficulty level of the course (Beginner, Intermediate, Advanced).
    rating -- Float (1.0 - 5.0) -- User-provided rating for the course.
    enrollment_numbers -- Integer -- The total number of students enrolled in the course.
    course_price -- Float (20.0 - 500.0) -- The price of the online course.
    feedback_score -- Float (0.0 - 1.0) -- A normalized score representing the feedback sentiment from students.
    study_material_available -- String (Yes/No) -- Indicates whether additional study materials are available.
    time_spent_hours -- Float (1.0 - 100.0) -- The average time spent by students in the course (in hours).
    previous_courses_taken -- Integer -- The number of previous courses the learner has taken before enrolling in this one.


In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

In [None]:
!pip install openpyxl

In [None]:
# Colab-only step: ask the user to upload the dataset file through the
# notebook's file picker.  `uploaded` holds whatever `files.upload()` returns.
from google.colab import files
uploaded= files.upload()

In [None]:
# Load the workbook into `df` and preview the first rows.
# NOTE(review): the file name is hard-coded (including the " (1)" suffix) and
# must match the uploaded file exactly -- confirm, or derive it from `uploaded`.
df= pd.read_excel('online_course_recommendation_v2 (1).xlsx')
df.head()

In [None]:
# (rows, columns) of the raw dataset.
df.shape

In [None]:
# Per-column dtype and non-null count summary.
df.info()

In [None]:
# Number of fully duplicated rows.  Summing the boolean mask gives a single
# count (like the missing-value check) instead of one True/False per row.
df.duplicated().sum()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# For every row, store how many rows in the dataset belong to the same user_id.
df['course_taken'] = df['user_id'].map(df['user_id'].value_counts())

In [None]:
# Show a copy of the data ordered by user_id with a fresh 0..n-1 index;
# `df` itself keeps its original row order.
df1 = df.sort_values('user_id').reset_index(drop=True)
display(df1)

In [None]:
# Remove `previous_courses_taken` from `df` in place.
# errors='ignore' keeps the cell re-runnable: without it a second execution
# raises KeyError because the column is already gone.
df.drop(columns='previous_courses_taken', inplace=True, errors='ignore')

In [None]:
# Grid of 30-bin histograms, one per numeric column of `df`.
df.hist(figsize=(15, 10), bins=30)
plt.gcf().suptitle("Distribution of Numeric Columns")
plt.show()

In [None]:
# # Selecting numeric columns
# numeric_cols= df.select_dtypes(include= ['float64', 'int64']).columns

# df_clean= df.copy()

# # Loop through each num cols
# for col in numeric_cols:
#   Q1= df[col].quantile(0.25)
#   Q3= df[col].quantile(0.75)
#   IQR= Q3- Q1
#   lower= Q1 - 1.5 * IQR
#   upper= Q3 + 1.5 * IQR
#   outliers= df[(df[col] < lower) | (df[col] > upper)]
#   print(f"{col}: {len(outliers)} outliers")

#   # Keep only rows within bounds
#   df_clean= df_clean[(df_clean[col] >= lower) & (df_clean[col] <= upper)]

# print('original shape: ', df.shape)
# print('After removing outliers:', df_clean.shape)

In [None]:
# df_clean.hist(bins=30, figsize=(15,10))
# plt.suptitle("Distribution of Numeric Columns")
# plt.show()

# EDA

In [None]:
# 1. Rating distribution: 20-bin histogram with a KDE curve on top.
sns.histplot(df['rating'], bins=20, kde=True).set(
    title='Distribution of Course Ratings',
    xlabel='Rating',
    ylabel='Frequency',
)
plt.show()

In [None]:
# 2. Number of dataset rows per difficulty level.
# NOTE(review): this counts rows, not distinct courses -- confirm the title matches the intent.
sns.countplot(data=df, x='difficulty_level').set(
    title='Count of Courses by Difficulty Level',
)
plt.show()

In [None]:
# 3. Share of rows with / without a certification, as a pie chart.
(
    df['certification_offered']
    .value_counts()
    .plot.pie(autopct='%1.1f%%', colors=['cornflowerblue', 'salmon'])
    .set(title='Certification Offered: Yes/No', ylabel='')
)
plt.show()

In [None]:
# 4. Number of rows per study-material availability flag.
sns.countplot(data=df, x='study_material_available').set(
    title='Study Material Availability Count',
)
plt.show()

In [None]:
# 5. The 20 instructors that appear most often in the dataset.
# NOTE(review): value_counts() counts rows per instructor; whether that equals
# "number of courses" depends on the data -- confirm.
(
    df['instructor']
    .value_counts()
    .head(20)
    .plot(kind='barh', figsize=(8, 6), color='teal')
    .set(
        title='Top 20 Instructors by Number of Courses',
        xlabel='Number of Courses',
        ylabel='Instructor',
    )
)
plt.show()

In [None]:
# 6. Ten course names with the largest summed `enrollment_numbers`.
# NOTE(review): the sum runs over every row of a course; if enrollment_numbers
# is already a per-course total this multiplies it by the row count -- confirm.
top_courses = (
    df.groupby('course_name')['enrollment_numbers']
    .sum()
    .sort_values(ascending=False)
    .head(10)
    .reset_index()
)
sns.barplot(data=top_courses, x='enrollment_numbers', y='course_name').set(
    title='Top 10 Most Enrolled Courses',
    xlabel='Total Enrollments',
    ylabel='Course Name',
)
plt.show()

In [None]:
!pip install wordcloud

In [None]:
# 7. Word cloud built from all course names joined into one string.
from wordcloud import WordCloud

text = df['course_name'].astype(str).str.cat(sep=' ')
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)

fig, ax = plt.subplots(figsize=(10, 5))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
ax.set_title('WordCloud - Course Names')
plt.show()

In [None]:
# 8. Word cloud built from all instructor names joined into one string.
from wordcloud import WordCloud

text = df['instructor'].astype(str).str.cat(sep=' ')
wordcloud = WordCloud(
    width=800,
    height=400,
    background_color='white',
    colormap='cool',
).generate(text)

fig, ax = plt.subplots(figsize=(10, 5))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
ax.set_title('Word Cloud - Instructors')
plt.show()

In [None]:
# 9. Row counts per difficulty level, then difficulty level x certification flag.
freq_table = df['difficulty_level'].value_counts()
cross_tab = pd.crosstab(index=df['difficulty_level'], columns=df['certification_offered'])

print(freq_table)
print(cross_tab)

In [None]:
# 10. Mean rating per difficulty level, one bar per instructor (no error bars).
fig, ax = plt.subplots(figsize=(14, 7))
sns.barplot(data=df, x='difficulty_level', y='rating', hue='instructor', ci=None, ax=ax)
ax.set(
    title='Average Rating by Difficulty level and Instructor',
    xlabel='Difficulty Level',
    ylabel='Average Rating',
)
# Place the legend outside the axes, to the right of the plot.
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()

In [None]:
# 11. Heatmap of mean enrollment per (course_price, difficulty_level) pair;
# combinations that never occur are shown as 0.
df_pivot = pd.pivot_table(
    df,
    values='enrollment_numbers',
    index='course_price',
    columns='difficulty_level',
    aggfunc='mean',
).fillna(0)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(df_pivot, cmap='Blues', ax=ax)
ax.set(
    title='Enrollment Numbers by Course Price & Difficulty Level',
    xlabel='Difficulty Level',
    ylabel='Course Price',
)
plt.show()

## Data Preprocessing

In [None]:
# # ============================================================
# # 📦 FILE: preprocessing_pipeline.py
# # ============================================================
# # PURPOSE:
# #   1️⃣ Converts raw input data into a clean, model-ready DataFrame.
# #   2️⃣ Encodes categorical columns.
# #   3️⃣ Scales numerical columns using MinMaxScaler.
# #   4️⃣ Saves LabelEncoders and Scaler for consistent transformation.
# # ============================================================

# from sklearn.preprocessing import LabelEncoder, MinMaxScaler
# import joblib
# import os


# def prepare_course_data(df: pd.DataFrame, save_dir: str = "artifacts") -> pd.DataFrame:
#     """
#     Cleans and transforms raw course dataset into model-ready form.

#     Steps:
#     1. Encode Yes/No binary columns.
#     2. Drop unnecessary columns.
#     3. Label encode categorical features.
#     4. Apply MinMaxScaler to numerical columns.
#     5. Save encoders and scaler (for later inference).
#     6. Return a clean, processed DataFrame.

#     Parameters
#     ----------
#     df : pd.DataFrame
#         Raw input dataset.
#     save_dir : str, optional
#         Directory where encoders/scalers are saved (default is 'artifacts').

#     Returns
#     -------
#     pd.DataFrame
#         Cleaned and preprocessed dataset ready for model input.
#     """

#     # ------------------------------------------------------------
#     # 🧹 Step 1: Encode Yes/No Columns to Binary
#     # ------------------------------------------------------------
#     if 'certification_offered' in df.columns:
#         df['certification_offered'] = df['certification_offered'].replace({'Yes': 1, 'No': 0})
#     if 'study_material_available' in df.columns:
#         df['study_material_available'] = df['study_material_available'].replace({'Yes': 1, 'No': 0})

#     # ------------------------------------------------------------
#     # 🗑️ Step 2: Drop Unnecessary Columns
#     # ------------------------------------------------------------
#     if 'course_id' in df.columns:
#         df = df.drop(['course_id'], axis=1)

#     # ------------------------------------------------------------
#     # ✨ Step 3: Label Encode Categorical Columns
#     # ------------------------------------------------------------
#     le_course = LabelEncoder()
#     le_instructor = LabelEncoder()
#     le_difficulty = LabelEncoder()

#     df['course_name_enc'] = le_course.fit_transform(df['course_name'])
#     df['instructor_enc'] = le_instructor.fit_transform(df['instructor'])
#     df['difficulty_level_enc'] = le_difficulty.fit_transform(df['difficulty_level'])

#     # ------------------------------------------------------------
#     # 🔢 Step 4: Apply MinMax Scaling to Numeric Columns
#     # ------------------------------------------------------------
#     numeric_cols = [
#         'enrollment_numbers',
#         'course_price',
#         'course_duration_hours',
#         'feedback_score',
#         'time_spent_hours',
#         'rating',
#         'course_taken'
#     ]

#     scaler = MinMaxScaler()
#     df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

#     # ------------------------------------------------------------
#     # 💾 Step 5: Save All Encoders & Scaler for Deployment
#     # ------------------------------------------------------------
#     os.makedirs(save_dir, exist_ok=True)
#     joblib.dump(le_course, f"{save_dir}/le_course.pkl")
#     joblib.dump(le_instructor, f"{save_dir}/le_instructor.pkl")
#     joblib.dump(le_difficulty, f"{save_dir}/le_difficulty.pkl")
#     joblib.dump(scaler, f"{save_dir}/minmax_scaler.pkl")

#     # ------------------------------------------------------------
#     # 🎯 Step 6: Keep Only Model-Relevant Columns
#     # ------------------------------------------------------------
#     scaled_df = df[[
#         'user_id',
#         'course_name_enc',
#         'instructor_enc',
#         'difficulty_level_enc'
#     ] + numeric_cols]

#     print(f"‚úÖ Data preprocessing complete. Artifacts saved to: '{save_dir}'")
#     return scaled_df

In [None]:
# from preprocessing_pipeline import prepare_course_data

# # Apply preprocessing
# scaled_df = prepare_course_data(df)
# print(scaled_df.head())

In [None]:
# =========================================================
# 📦 FUNCTION: prepare_course_data()
# =========================================================
# Cleans, encodes, and scales course data for recommendation models.
# - Converts Yes/No columns to binary
# - Label-encodes categorical features
# - Applies MinMaxScaler to numeric features
# - Returns a clean, model-ready dataframe
# =========================================================

from sklearn.preprocessing import LabelEncoder, MinMaxScaler

def prepare_course_data(df: pd.DataFrame):
    """
    Encode and scale the course dataset for CF, CBF, or hybrid recommenders.

    Processing order:
    1. Yes/No flag columns (when present) are rewritten to 1/0.  This write
       happens on the frame that was passed in.
    2. ``course_id`` (when present) is dropped; from here on the work continues
       on the new frame returned by ``drop``.  When ``course_id`` is absent,
       the remaining steps also write to the passed-in frame.
    3. ``course_name``, ``instructor`` and ``difficulty_level`` are label
       encoded into ``*_enc`` columns.
    4. The numeric columns are min-max scaled in place of their raw values.

    Returns
    -------
    tuple
        ``(scaled_df, le_course, le_instructor, le_difficulty, scaler)``.
        ``scaled_df`` contains ``user_id``, every encoded column next to its
        original text column, and the scaled numeric columns.  The fitted
        encoders and scaler are returned so the same mapping can be reused.
    """
    # 1. Binary flags -> 1/0
    yes_no = {'Yes': 1, 'No': 0}
    for flag_col in ('certification_offered', 'study_material_available'):
        if flag_col in df.columns:
            df[flag_col] = df[flag_col].replace(yes_no)

    # 2. Identifier that is not used downstream
    if 'course_id' in df.columns:
        df = df.drop(columns=['course_id'])

    # 3. One independent encoder per text column
    le_course = LabelEncoder()
    le_instructor = LabelEncoder()
    le_difficulty = LabelEncoder()
    encoders = (
        ('course_name', 'course_name_enc', le_course),
        ('instructor', 'instructor_enc', le_instructor),
        ('difficulty_level', 'difficulty_level_enc', le_difficulty),
    )
    for source_col, target_col, encoder in encoders:
        df[target_col] = encoder.fit_transform(df[source_col])

    # 4. Rescale numeric features to [0, 1]
    numeric_cols = [
        'enrollment_numbers',
        'course_price',
        'course_duration_hours',
        'feedback_score',
        'time_spent_hours',
        'rating',
        'course_taken',
    ]
    scaler = MinMaxScaler()
    df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

    # 5. Keep the id, each code beside its readable label, then the numerics
    output_cols = ['user_id']
    for source_col, target_col, _ in encoders:
        output_cols += [target_col, source_col]
    scaled_df = df[output_cols + numeric_cols]

    return scaled_df, le_course, le_instructor, le_difficulty, scaler

In [None]:
# Run the preprocessing and keep the fitted encoders/scaler alongside the data.
scaled_df, le_course, le_instructor, le_difficulty, scaler = prepare_course_data(df)

In [None]:
# Quick preview of the first rows of the preprocessed frame.
scaled_df.head()

In [None]:
# Lookup table: each course_name_enc code next to its course name, ordered by code.
scaled_df[['course_name_enc', 'course_name']].drop_duplicates().sort_values('course_name_enc').reset_index(drop=True)

In [None]:
# Lookup table: each instructor_enc code next to its instructor name, ordered by code.
scaled_df[['instructor_enc', 'instructor']].drop_duplicates().sort_values('instructor_enc').reset_index(drop=True)

In [None]:
# Lookup table: each difficulty_level_enc code next to its label, ordered by code.
scaled_df[['difficulty_level_enc', 'difficulty_level']].drop_duplicates().sort_values('difficulty_level_enc').reset_index(drop=True)

### PCA & Feature Importance

In [None]:
from sklearn.decomposition import PCA

# --- Step 1: Define features for PCA ---
# Label-encoded categoricals plus the min-max scaled numeric columns (no raw strings).
# NOTE(review): PCA is scale-sensitive.  The *_enc columns are raw integer codes
# while the numeric columns were min-max scaled, so the leading components are
# presumably dominated by the encoded ids -- consider standardising X first.
feature_cols = [
    'course_name_enc',
    'instructor_enc',
    'difficulty_level_enc',
    'enrollment_numbers',
    'course_price',
    'course_duration_hours',
    'feedback_score',
    'time_spent_hours',
    'rating',
    'course_taken'
]

X = scaled_df[feature_cols]

# --- Step 2: Apply PCA ---
pca = PCA(n_components=None, random_state=42)  # keep all components to analyze variance
X_pca = pca.fit_transform(X)

# Component labels come from the fitted model, so every table built below has
# exactly one column per component that PCA actually produced.
pc_names = [f'PC{i+1}' for i in range(pca.n_components_)]

# --- Step 3: Create a DataFrame of principal components ---
pca_df = pd.DataFrame(X_pca, columns=pc_names)

# Attach the encoded course id for labelling.  The values are assigned by
# position: pca_df has a fresh 0..n-1 index, so an index-aligned assignment
# would silently produce NaN if scaled_df ever carries a different index.
if 'course_name_enc' in scaled_df.columns:
    pca_df['course_name'] = scaled_df['course_name_enc'].to_numpy()

# --- Step 4: Explained Variance Analysis ---
explained_var_ratio = pca.explained_variance_ratio_
cumulative_var = np.cumsum(explained_var_ratio)

print("Explained variance ratio per component:")
for i, var in enumerate(explained_var_ratio):
    print(f"PC{i+1}: {var:.4f} ({cumulative_var[i]*100:.2f}% cumulative)")

# --- Step 5: Plot variance explained ---
plt.figure(figsize=(8, 5))
plt.plot(range(1, len(explained_var_ratio) + 1), cumulative_var, marker='o')
plt.title('Cumulative Explained Variance by PCA Components')
plt.xlabel('Number of Principal Components')
plt.ylabel('Cumulative Explained Variance Ratio')
plt.grid(True)
plt.show()

# --- Step 6: 2D Visualization using first 2 principal components ---
plt.figure(figsize=(8, 6))
sns.scatterplot(
    x=pca_df['PC1'],
    y=pca_df['PC2'],
    s=100,
    color='teal',
    edgecolor='black'
)

# Label the first ten points with their encoded course id (positional access).
for pc1, pc2, label in zip(
    pca_df['PC1'].iloc[:10], pca_df['PC2'].iloc[:10], pca_df['course_name'].iloc[:10]
):
    plt.annotate(label, (pc1, pc2), fontsize=8)

plt.title('PCA Visualization (First Two Components)')
plt.xlabel(f"PC1 ({explained_var_ratio[0]*100:.1f}% var)")
plt.ylabel(f"PC2 ({explained_var_ratio[1]*100:.1f}% var)")
plt.grid(True)
plt.show()

# --- Step 7: Feature importance (PCA loadings) ---
# Rows are the input features, columns the principal components.
loadings = pd.DataFrame(
    pca.components_.T,
    columns=pc_names,
    index=feature_cols
)

print("\nTop feature contributions to the first few PCs:")
print(loadings.iloc[:, :3])  # first 3 components

# Heatmap of the loadings on the first five components
plt.figure(figsize=(10, 6))
sns.heatmap(loadings.iloc[:, :5], annot=True, cmap='viridis')
plt.title('Feature Loadings for First 5 Principal Components')
plt.show()

In [None]:
!pip install implicit -q

## Model Building
### 1. COLLABORATIVE FILTERING (ALS)

In [None]:
from tqdm import tqdm
from scipy.sparse import csr_matrix
import implicit
from sklearn.model_selection import train_test_split

df_cf = scaled_df.copy()


# ---- Step 1: Interaction strength (non-negative) ----
def _weighted_minmax(column, weight):
    """Min-max rescale `column` and multiply by `weight` (1e-6 guards a zero range)."""
    return weight * (column - column.min()) / (column.max() - column.min() + 1e-6)


# 70% feedback score + 30% time spent
df_cf['interaction'] = (
    _weighted_minmax(df_cf['feedback_score'], 0.7)
    + _weighted_minmax(df_cf['time_spent_hours'], 0.3)
)

# ---- Step 2: User-item index mapping ----
# Ids are numbered in order of first appearance; *_inv maps the index back to the id.
_unique_users = df_cf['user_id'].unique()
_unique_items = df_cf['course_name_enc'].unique()
user_map = dict(zip(_unique_users, range(len(_unique_users))))
item_map = dict(zip(_unique_items, range(len(_unique_items))))
user_inv = dict(enumerate(_unique_users))
item_inv = dict(enumerate(_unique_items))

_matrix_shape = (len(user_map), len(item_map))


def _coo_parts(frame):
    """Return (row indices, column indices, values) for one interaction frame."""
    return (
        frame['user_id'].map(user_map),
        frame['course_name_enc'].map(item_map),
        frame['interaction'],
    )


# Full user x course matrix built from every row
rows, cols, vals = _coo_parts(df_cf)
interaction_matrix = csr_matrix((vals, (rows, cols)), shape=_matrix_shape)

# ---- Step 3: Train/test split (row-wise 80/20) ----
train_df, test_df = train_test_split(df_cf, test_size=0.2, random_state=42)
train_rows, train_cols, train_vals = _coo_parts(train_df)
test_rows, test_cols, test_vals = _coo_parts(test_df)

# Both matrices share the full shape so indices stay comparable.
train_matrix = csr_matrix((train_vals, (train_rows, train_cols)), shape=_matrix_shape)
test_matrix = csr_matrix((test_vals, (test_rows, test_cols)), shape=_matrix_shape)

# ---- Step 4: Helper functions ----
def recommend_cf(model, user_id, n=10):
    """
    Return the top-``n`` ALS recommendations for ``user_id``.

    The user's row of ``train_matrix`` is passed to ``model.recommend``.
    The result is a DataFrame with columns ``course_name_enc``,
    ``course_name`` and ``score``; for a user that is not in ``user_map``
    the string ``"User not found."`` is returned instead.
    """
    uid = user_map.get(user_id)
    if uid is None:
        return "User not found."

    raw = model.recommend(uid, train_matrix[uid], N=n)
    # `recommend` may hand back an (ids, scores) tuple or a list of (id, score) pairs.
    item_ids, scores = raw if isinstance(raw, tuple) else zip(*raw)

    ranked = pd.DataFrame({
        'course_name_enc': [item_inv[idx] for idx in item_ids],
        'score': scores,
    })

    # Attach readable course names to the encoded ids.
    name_lookup = df_cf[['course_name_enc', 'course_name']].drop_duplicates()
    ranked = ranked.merge(name_lookup, on='course_name_enc', how='left')
    return ranked[['course_name_enc', 'course_name', 'score']]

def precision_at_k(model, user_id, k=10):
    """
    Precision@k: share of the ``k`` recommended courses that occur in the
    user's rows of ``test_df``.  Returns NaN when the user has no test rows.
    """
    held_out = set(test_df.loc[test_df['user_id'] == user_id, 'course_name_enc'])
    if len(held_out) == 0:
        return np.nan
    suggested = set(recommend_cf(model, user_id, n=k)['course_name_enc'])
    return len(held_out & suggested) / k

def recall_at_k(model, user_id, k=10):
    """
    Recall@k: share of the user's ``test_df`` courses that appear among the
    ``k`` recommendations.  Returns NaN when the user has no test rows.
    """
    held_out = set(test_df.loc[test_df['user_id'] == user_id, 'course_name_enc'])
    if len(held_out) == 0:
        return np.nan
    suggested = set(recommend_cf(model, user_id, n=k)['course_name_enc'])
    return len(held_out & suggested) / len(held_out)


# ---- Step 5: Light hyperparameter tuning ----
param_grid = [
    {'factors': 64, 'regularization': 0.05, 'iterations': 25},
    {'factors': 96, 'regularization': 0.05, 'iterations': 25},
    {'factors': 128, 'regularization': 0.08, 'iterations': 30},
]

# Evaluation users are drawn from a seeded generator so the "best parameters"
# are reproducible between runs (the split and the ALS models already use a
# fixed random_state=42).
_eval_rng = np.random.default_rng(42)
_all_users = df_cf['user_id'].unique()


def _sample_users(size):
    """Draw up to `size` distinct user ids (capped at the number of users available)."""
    return _eval_rng.choice(_all_users, size=min(size, len(_all_users)), replace=False)


def _score_users(model, users, k=10, desc="Evaluating"):
    """Collect precision@k and recall@k for `users`, skipping users whose metric is NaN."""
    precisions, recalls = [], []
    for uid in tqdm(users, desc=desc):
        p = precision_at_k(model, uid, k=k)
        r = recall_at_k(model, uid, k=k)
        if not np.isnan(p):
            precisions.append(p)
        if not np.isnan(r):
            recalls.append(r)
    return precisions, recalls


def _safe_mean(values):
    """Mean of `values`, or NaN (without a warning) when the list is empty."""
    return float(np.mean(values)) if values else np.nan


results = []
sample_users = _sample_users(30)

for params in param_grid:
    print(f"\nüöÄ Training ALS: factors={params['factors']}, reg={params['regularization']}, iters={params['iterations']}")
    model = implicit.als.AlternatingLeastSquares(
        factors=params['factors'],
        regularization=params['regularization'],
        iterations=params['iterations'],
        random_state=42
    )
    model.fit(train_matrix)

    precisions, recalls = _score_users(model, sample_users, k=10, desc="Evaluating")

    mean_p, mean_r = _safe_mean(precisions), _safe_mean(recalls)
    # The 1e-6 avoids a division by zero when both means are 0.
    f1 = 2 * (mean_p * mean_r) / (mean_p + mean_r + 1e-6)
    results.append({**params, 'precision': mean_p, 'recall': mean_r, 'f1': f1})

# ---- Step 6: Show results ----
results_df = pd.DataFrame(results).sort_values('f1', ascending=False)
print("\nüèÜ Best parameters found:")
print(results_df.head(3))

# Row with the highest F1.
best_params = results_df.iloc[0]

# ---- Step 7: Final model ----
# The row is cast back explicitly because a mixed-type row stores the integers as floats.
model_cf_final = implicit.als.AlternatingLeastSquares(
    factors=int(best_params['factors']),
    regularization=float(best_params['regularization']),
    iterations=int(best_params['iterations']),
    random_state=42
)
model_cf_final.fit(train_matrix)

# ---- Step 8: Evaluate final model ----
sample_users_final = _sample_users(40)
precisions, recalls = _score_users(model_cf_final, sample_users_final, k=10, desc="Final Evaluation")

print(f"\n‚úÖ Final ALS Model:")
print(f"Precision@10: {_safe_mean(precisions):.4f}")
print(f"Recall@10:    {_safe_mean(recalls):.4f}")

# ---- Step 9: Example Recommendations ----
example_user = df_cf['user_id'].iloc[0]
recs = recommend_cf(model_cf_final, example_user, n=10)
print(f"\nüéØ Top 10 Recommendations for User {example_user}:\n")
print(recs)

In [None]:
# VISUALIZING ALS EMBEDDINGS (FIXED FOR GPU)

# Convert implicit GPU matrices to CPU NumPy arrays if necessary
def to_numpy(matrix):
    """
    Return ``matrix`` as a ``numpy.ndarray``.

    Objects that expose a ``to_numpy()`` method are converted through it;
    anything else (where the attribute lookup raises AttributeError) is
    handed to ``np.array`` directly.
    """
    try:
        dense = matrix.to_numpy()
    except AttributeError:
        dense = matrix
    return np.array(dense)

# Extract latent factors from the trained ALS model.
# The model fitted in the collaborative-filtering cell is `model_cf_final`;
# the name `model_cf` is never defined and raised NameError on a clean run.
user_factors = to_numpy(model_cf_final.user_factors)
item_factors = to_numpy(model_cf_final.item_factors)

print(f"‚úÖ User factors shape: {user_factors.shape}")
print(f"‚úÖ Item factors shape: {item_factors.shape}")

# Dimensionality reduction for visualization: the 2-D projection is fitted on
# the item factors and the user factors are projected into the same space.
pca = PCA(n_components=2, random_state=42)
item_pca = pca.fit_transform(item_factors)
user_pca = pca.transform(user_factors)

# Prepare data for plotting (ids listed in matrix-index order via the inverse maps)
item_df = pd.DataFrame(item_pca, columns=['x', 'y'])
item_df['type'] = 'course'
item_df['course_name_enc'] = list(item_inv.values())

user_df = pd.DataFrame(user_pca, columns=['x', 'y'])
user_df['type'] = 'user'
user_df['user_id'] = list(user_inv.values())

# Plot user-course latent embeddings
plt.figure(figsize=(10, 7))
plt.scatter(item_df['x'], item_df['y'], s=60, c='royalblue', alpha=0.6, label='Courses')
plt.scatter(user_df['x'], user_df['y'], s=30, c='orangered', alpha=0.4, label='Users')
plt.title('User‚ÄìCourse Embedding Space (ALS Latent Factors)')
plt.xlabel('Latent Dimension 1')
plt.ylabel('Latent Dimension 2')
plt.legend()
plt.grid(True, linestyle='--', alpha=0.5)
plt.show()

# Helper: find closest courses to a given user
def show_user_neighbors(user_id, top_k=5):
    """
    List the ``top_k`` courses whose item factors have the largest dot
    product with the user's factor vector.

    Returns a DataFrame with ``course_name_enc`` and ``similarity`` (the dot
    product), or the string ``"User not found."`` for an unknown user.
    """
    uid = user_map.get(user_id)
    if uid is None:
        return "User not found."

    # (n_items, f) . (f, 1) -> one score per course
    scores = np.dot(item_factors, user_factors[uid].reshape(1, -1).T).flatten()
    best = np.argsort(-scores)[:top_k]  # indices of the highest scores first

    return pd.DataFrame({
        'course_name_enc': [item_inv[idx] for idx in best],
        'similarity': scores[best],
    })

# Example usage: nearest courses in latent space for one hard-coded user id.
# NOTE(review): if 15796 is not a key of `user_map`, `neighbors` is the string
# "User not found." rather than a DataFrame -- confirm the id exists in the data.
neighbors = show_user_neighbors(15796, top_k=5)
print("\nüéØ Courses closest to user 15796 in latent space:\n")
print(neighbors)

### 2. CONTENT-BASED FILTERING (CBF)

In [None]:
# CONTENT-BASED FILTERING (CBF)
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import ndcg_score
from tqdm import tqdm

# --- Step 1️⃣: Prepare feature matrix ---
# One row per course: the first occurrence of each course_name_enc supplies
# that course's feature values.
course_features = (
    scaled_df
    .drop_duplicates(subset=['course_name_enc'])
    .loc[:, [
        'course_name_enc', 'instructor_enc', 'difficulty_level_enc',
        'enrollment_numbers', 'course_price', 'course_duration_hours',
        'feedback_score', 'time_spent_hours',
    ]]
    .reset_index(drop=True)
)

# Feature matrix = every column after the leading course_name_enc id column.
feature_matrix = course_features.iloc[:, 1:].to_numpy()

# 2️⃣ Function: Generate recommendations
def recommend_cbf(input_features, n=10, feature_weights=None):
    """
    Rank courses by cosine similarity to a feature profile.

    Parameters
    ----------
    input_features : dict
        Profile values keyed by ``instructor_enc``, ``difficulty_level_enc``,
        ``enrollment_numbers``, ``course_price``, ``course_duration_hours``,
        ``feedback_score`` and ``time_spent_hours``.
    n : int
        Number of courses to return.
    feature_weights : list or np.ndarray, optional
        One multiplier per feature (same order as above), applied to both the
        profile and the course feature matrix before the similarity is taken.

    Returns
    -------
    pd.DataFrame
        Columns ``course_name_enc``, ``course_name`` and ``similarity_score``,
        most similar course first.
    """
    profile_keys = (
        'instructor_enc',
        'difficulty_level_enc',
        'enrollment_numbers',
        'course_price',
        'course_duration_hours',
        'feedback_score',
        'time_spent_hours',
    )
    # Single-row query matrix in the column order of `feature_matrix`
    user_vector = np.array([[input_features[key] for key in profile_keys]])

    candidates = feature_matrix
    if feature_weights is not None:
        user_vector = user_vector * feature_weights
        candidates = feature_matrix * feature_weights

    sims = cosine_similarity(user_vector, candidates)[0]

    # Ascending argsort: take the last n and flip them so the best comes first.
    top_indices = sims.argsort()[-n:][::-1]
    results = course_features.iloc[top_indices][['course_name_enc']].copy()
    results['similarity_score'] = sims[top_indices]

    # Bring back the readable course names.
    name_lookup = scaled_df[['course_name_enc', 'course_name']].drop_duplicates()
    results = results.merge(name_lookup, on='course_name_enc', how='left')

    ordered = results[['course_name_enc', 'course_name', 'similarity_score']]
    return ordered.reset_index(drop=True)

# 3️⃣ Evaluation Metrics for CBF
def evaluate_cbf(user_id, k=10, feature_weights=None):
    """
    Score the content-based recommender for one user.

    The user's profile is the mean of the feature values over all of the
    user's rows in ``scaled_df``; the relevant set is the courses in those
    same rows.

    NOTE(review): profile and ground truth come from the same rows, so the
    metrics are presumably optimistic -- consider a held-out split.

    Parameters
    ----------
    user_id
        Id to look up in ``scaled_df['user_id']``.
    k : int
        Cut-off used for the recommendation list and for all three metrics.
    feature_weights : list or np.ndarray, optional
        Passed through to ``recommend_cbf``.

    Returns
    -------
    tuple
        ``(precision@k, recall@k, ndcg@k)``, or three NaNs when the user has
        no rows.
    """
    user_courses = scaled_df[scaled_df['user_id'] == user_id]
    if user_courses.empty:
        return np.nan, np.nan, np.nan

    # Average profile over everything the user took
    profile_cols = [
        'instructor_enc',
        'difficulty_level_enc',
        'enrollment_numbers',
        'course_price',
        'course_duration_hours',
        'feedback_score',
        'time_spent_hours',
    ]
    input_features = {col: user_courses[col].mean() for col in profile_cols}

    recs = recommend_cbf(input_features, n=k, feature_weights=feature_weights)
    recommended_courses = set(recs['course_name_enc'])

    # Non-empty here because `user_courses` has at least one row.
    true_courses = set(user_courses['course_name_enc'])

    hits = len(true_courses.intersection(recommended_courses))
    precision = hits / k
    recall = hits / len(true_courses)

    # NDCG over the whole catalogue: relevance is 1 for courses the user took,
    # the predicted score is the similarity for recommended courses and 0 for
    # every other course.
    catalogue = course_features['course_name_enc']
    y_true = catalogue.isin(list(true_courses)).to_numpy().astype(int)
    score_by_course = (
        recs.drop_duplicates(subset='course_name_enc')
        .set_index('course_name_enc')['similarity_score']
    )
    y_score = catalogue.map(score_by_course).fillna(0.0).to_numpy(dtype=float)

    # k=k truncates the ranking so the value really is NDCG@k; without it the
    # whole catalogue (including all the tied zero scores) was ranked.
    ndcg = ndcg_score([y_true], [y_score], k=k)

    return precision, recall, ndcg

# 4️⃣ Lightweight Hyperparameter Tuning (Feature Weighting)
# Candidate weight vectors, one multiplier per CBF feature in the order
# instructor, difficulty, enrollment, price, duration, feedback, time spent.
param_grid = [
    [1, 1, 1, 1, 1, 1, 1],          # baseline (equal weights)
    [1, 1, 1, 0.8, 1.2, 1.2, 1.5],  # emphasize time & feedback
    [1, 1, 1, 1.2, 0.8, 1.5, 1.5],  # emphasize feedback & time_spent
    [1, 1, 1, 1.5, 1, 1.5, 1.5],    # more on engagement features
]

# Seeded generator so the chosen weights are reproducible between runs.
_cbf_rng = np.random.default_rng(42)
_cbf_users = scaled_df['user_id'].unique()


def _sample_cbf_users(size):
    """Draw up to `size` distinct user ids (capped at the number of users available)."""
    return _cbf_rng.choice(_cbf_users, size=min(size, len(_cbf_users)), replace=False)


def _collect_cbf_metrics(users, weights, desc):
    """Run evaluate_cbf for each user and gather the non-NaN precision/recall/NDCG values."""
    precisions, recalls, ndcgs = [], [], []
    for uid in tqdm(users, desc=desc):
        p, r, n = evaluate_cbf(uid, k=10, feature_weights=weights)
        if not np.isnan(p):
            precisions.append(p)
        if not np.isnan(r):
            recalls.append(r)
        if not np.isnan(n):
            ndcgs.append(n)
    return precisions, recalls, ndcgs


sample_users = _sample_cbf_users(30)
results = []

for weights in param_grid:
    precisions, recalls, ndcgs = _collect_cbf_metrics(
        sample_users, weights, f"Testing weights={weights}"
    )
    results.append({
        'weights': weights,
        'precision': np.mean(precisions),
        'recall': np.mean(recalls),
        'ndcg': np.mean(ndcgs)
    })

# Rank the candidates by mean precision, best first.
results_df = pd.DataFrame(results).sort_values('precision', ascending=False)
print("\nüèÜ Best feature weight combination found:")
print(results_df.head(3))

best_weights = results_df.iloc[0]['weights']

# 5. Final evaluation with the selected weights on a fresh user sample
sample_users_final = _sample_cbf_users(50)
precisions, recalls, ndcgs = _collect_cbf_metrics(
    sample_users_final, best_weights, "Final CBF Evaluation"
)

print("\n‚úÖ Final CBF Model Evaluation Results:")
print(f"Precision@10: {np.mean(precisions):.4f}")
print(f"Recall@10:    {np.mean(recalls):.4f}")
print(f"NDCG@10:      {np.mean(ndcgs):.4f}")

# 6. Example: recommendations seeded from the first row of the first user
example_user = scaled_df['user_id'].iloc[0]
user_profile = scaled_df[scaled_df['user_id'] == example_user].iloc[0]

input_features = {
    key: user_profile[key]
    for key in (
        'instructor_enc',
        'difficulty_level_enc',
        'enrollment_numbers',
        'course_price',
        'course_duration_hours',
        'feedback_score',
        'time_spent_hours',
    )
}

recommendations = recommend_cbf(input_features, n=10, feature_weights=best_weights)
print(f"\nüéØ Top 10 CBF Recommendations for User {example_user}:\n")
print(recommendations)


### 3. HYBRID RECOMMENDER

In [None]:
# HYBRID RECOMMENDER WITH AUTOMATIC WEIGHT TUNING + EVALUATION

import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import ndcg_score
from tqdm import tqdm

# 1️⃣ Hybrid Recommendation Function
def recommend_hybrid(user_id, input_features, n=10, w_cf=0.6, w_cbf=0.4):
    """
    Hybrid recommender that fuses:
      - Collaborative Filtering (the fitted `model_cf_final`, top 50 items)
      - Content-Based Filtering (cosine similarity against `feature_matrix`)

    Each component's raw scores are turned into percentile ranks in (0, 1],
    where the best-scoring course gets 1.0. The ranks are combined as
    ``w_cf * cf_rank + w_cbf * cbf_rank``. A course that is missing from one
    component contributes 0 for that component.

    Args:
        user_id: target user for recommendations (must be a key of `user_map`)
        input_features: user's course interest profile; a mapping with the keys
            'instructor_enc', 'difficulty_level_enc', 'enrollment_numbers',
            'course_price', 'course_duration_hours', 'feedback_score',
            'time_spent_hours'
        n: number of recommendations to return
        w_cf, w_cbf: weights for CF and CBF components

    Returns:
        The string "User not found." when `user_id` is not in `user_map`;
        otherwise a DataFrame with at most `n` rows, sorted by descending
        'hybrid_score', with columns 'course_name_enc', 'course_name',
        'cf_score', 'cbf_score', 'hybrid_score'.

    Raises:
        KeyError: if `input_features` lacks one of the keys listed above.
    """

    # --- Validate user ---
    if user_id not in user_map:
        return "User not found."

    uid = user_map[user_id]

    # --- CF Recommendations ---
    cf_recs = model_cf_final.recommend(uid, interaction_matrix[uid], N=50)

    # Handle implicit version differences
    if isinstance(cf_recs, tuple):  # newer implicit: (item_ids, scores)
        item_ids, scores = cf_recs
        cf_courses = [item_inv[i] for i in item_ids]
        cf_scores = np.array(scores)
    else:  # older implicit: list of (item_id, score) pairs
        cf_courses = [item_inv[i] for i, _ in cf_recs]
        cf_scores = np.array([s for _, s in cf_recs])

    df_cf_scores = pd.DataFrame({'course_name_enc': cf_courses, 'cf_score': cf_scores})
    # Rank ascending so the HIGHEST raw score maps to percentile 1.0. Ranking
    # descending would hand the best course the smallest value, which the
    # descending sort on hybrid_score below would then push to the bottom.
    df_cf_scores['cf_score_norm'] = df_cf_scores['cf_score'].rank(ascending=True, pct=True)

    # --- CBF Recommendations ---
    # NOTE(review): assumes feature_matrix columns follow this exact feature
    # order - confirm against the cell that builds feature_matrix.
    user_vector = np.array([[
        input_features['instructor_enc'],
        input_features['difficulty_level_enc'],
        input_features['enrollment_numbers'],
        input_features['course_price'],
        input_features['course_duration_hours'],
        input_features['feedback_score'],
        input_features['time_spent_hours']
    ]])

    sims = cosine_similarity(user_vector, feature_matrix)[0]
    df_cbf_scores = pd.DataFrame({
        'course_name_enc': course_features['course_name_enc'].values,
        'cbf_score': sims
    })
    # Same orientation as the CF ranks: most similar course -> 1.0.
    df_cbf_scores['cbf_score_norm'] = df_cbf_scores['cbf_score'].rank(ascending=True, pct=True)

    # --- Combine CF + CBF ---
    # Outer merge keeps courses seen by only one component; fillna(0) gives the
    # missing component the lowest possible contribution.
    df_hybrid = pd.merge(df_cf_scores, df_cbf_scores, on='course_name_enc', how='outer').fillna(0)
    df_hybrid['hybrid_score'] = (
        w_cf * df_hybrid['cf_score_norm'] + w_cbf * df_hybrid['cbf_score_norm']
    )

    # Sort first so that de-duplication keeps each course's best-scoring row.
    df_hybrid = df_hybrid.sort_values('hybrid_score', ascending=False)
    df_hybrid = df_hybrid.drop_duplicates(subset=['course_name_enc']).reset_index(drop=True)

    # Add actual course names; one name per encoded id so the left merge
    # cannot multiply rows.
    course_lookup = (
        scaled_df[['course_name_enc', 'course_name']]
        .drop_duplicates(subset=['course_name_enc'])
    )
    df_hybrid = df_hybrid.merge(course_lookup, on='course_name_enc', how='left')

    # Keep a clean display
    df_hybrid = df_hybrid[['course_name_enc', 'course_name', 'cf_score', 'cbf_score', 'hybrid_score']]

    return df_hybrid.head(n)


# 2. Evaluation Function for One User
def evaluate_user(user_id, input_features, k=10, w_cf=0.6, w_cbf=0.4):
    """
    Score the hybrid top-k list for one user against the courses that user
    has in `scaled_df`.

    Args:
        user_id: user whose recommendations are evaluated
        input_features: interest profile forwarded to `recommend_hybrid`
        k: length of the recommendation list
        w_cf, w_cbf: fusion weights forwarded to `recommend_hybrid`

    Returns:
        (precision, recall, ndcg). All three are NaN when the user is unknown
        to the recommender or has no rows in `scaled_df`; ndcg alone is NaN
        when none of the user's courses appear in `course_features`.
    """
    top_k = recommend_hybrid(user_id, input_features, n=k, w_cf=w_cf, w_cbf=w_cbf)
    if isinstance(top_k, str):
        # recommend_hybrid signals an unknown user with a message string
        return np.nan, np.nan, np.nan

    recommended_courses = set(top_k['course_name_enc'])
    is_this_user = scaled_df['user_id'] == user_id
    relevant_courses = set(scaled_df.loc[is_this_user, 'course_name_enc'])
    if len(relevant_courses) == 0:
        return np.nan, np.nan, np.nan

    # --- Precision / Recall ---
    n_hits = len(relevant_courses & recommended_courses)
    precision = n_hits / k
    recall = n_hits / len(relevant_courses)

    # --- NDCG over the whole catalogue ---
    catalogue = course_features['course_name_enc']
    y_true = np.isin(catalogue, list(relevant_courses)).astype(int)

    # Recommended courses carry their hybrid score (first occurrence wins);
    # every other catalogue entry scores 0.
    score_of = {}
    for enc, score in zip(top_k['course_name_enc'], top_k['hybrid_score']):
        score_of.setdefault(enc, score)
    y_score = np.array([score_of.get(enc, 0.0) for enc in catalogue], dtype=float)

    if np.sum(y_true) > 0:
        ndcg = ndcg_score([y_true], [y_score])
    else:
        ndcg = np.nan

    return precision, recall, ndcg


# 3. Hyperparameter Tuning (Find Optimal CF–CBF Weights)
def tune_hybrid_weights(user_sample, input_features, w_values=np.linspace(0.1, 0.9, 9)):
    """
    Grid-search the CF weight and pick the one with the best mean Precision@10.

    For every candidate `w_cf` (with `w_cbf = 1 - w_cf`) each user in
    `user_sample` is scored with `evaluate_user` at k=10; NaN metrics are
    ignored when averaging.

    Args:
        user_sample: iterable of user ids to evaluate on
        input_features: interest profile forwarded to `evaluate_user`
        w_values: candidate CF weights

    Returns:
        (best_w_cf, best_w_cbf, results_df), where results_df has one row per
        candidate with columns 'w_cf', 'w_cbf', 'precision', 'recall', 'ndcg',
        sorted by descending precision.
    """

    def mean_without_nan(per_user_metrics, position):
        # Average one metric (by tuple position) over users, skipping NaNs.
        kept = [m[position] for m in per_user_metrics if not np.isnan(m[position])]
        return np.mean(kept)

    rows = []
    for cf_weight in tqdm(w_values, desc="Tuning Hybrid Weights"):
        cbf_weight = 1 - cf_weight
        per_user = [
            evaluate_user(uid, input_features, k=10, w_cf=cf_weight, w_cbf=cbf_weight)
            for uid in user_sample
        ]
        rows.append({
            'w_cf': cf_weight,
            'w_cbf': cbf_weight,
            'precision': mean_without_nan(per_user, 0),
            'recall': mean_without_nan(per_user, 1),
            'ndcg': mean_without_nan(per_user, 2),
        })

    results_df = pd.DataFrame(rows)
    results_df = results_df.sort_values('precision', ascending=False)
    best_row = results_df.iloc[0]
    print("\nüèÜ Optimal Hybrid Weights Found:")
    print(best_row)

    return best_row['w_cf'], best_row['w_cbf'], results_df


# 4. Evaluate & Tune Model
# Example user input (profile or interest vector)
# NOTE(review): this single profile is passed for every sampled user below, so
# the content-based half of the hybrid score is the same for all of them.
user_input = dict(
    instructor_enc=0,
    difficulty_level_enc=1,
    enrollment_numbers=0.306893,
    course_price=0.043729,
    course_duration_hours=0.329474,
    feedback_score=0.348930,
    time_spent_hours=0.336380,
)

# Draw 30 distinct users for tuning
sample_users = np.random.choice(scaled_df['user_id'].unique(), size=30, replace=False)

# Grid-search the CF/CBF weights on the tuning sample
best_w_cf, best_w_cbf, tuning_results = tune_hybrid_weights(sample_users, user_input)


# 5. Final model evaluation with the tuned weights, on a separate draw of 40 users
sample_users_final = np.random.choice(scaled_df['user_id'].unique(), size=40, replace=False)
precisions, recalls, ndcgs = [], [], []

for uid in tqdm(sample_users_final, desc="Final Hybrid Evaluation"):
    p, r, n = evaluate_user(uid, user_input, k=10, w_cf=best_w_cf, w_cbf=best_w_cbf)
    # NaN metrics are left out so they cannot turn the averages below into NaN.
    for metric, bucket in ((p, precisions), (r, recalls), (n, ndcgs)):
        if not np.isnan(metric):
            bucket.append(metric)

print("\n‚úÖ Final Hybrid Model Performance:")
print(f"Precision@10: {np.mean(precisions):.4f}")
print(f"Recall@10:    {np.mean(recalls):.4f}")
print(f"NDCG@10:      {np.mean(ndcgs):.4f}")

# 6. Example recommendation for one user
# The first user of the final evaluation sample is used as the example.
example_user = sample_users_final[0]
recommendations = recommend_hybrid(
    example_user,
    user_input,
    n=10,
    w_cf=best_w_cf,
    w_cbf=best_w_cbf,
)

print(f"\nüéØ Top 10 Optimized Hybrid Recommendations for User {example_user}:\n")
print(recommendations)


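Your hybrid model retrieves all relevant courses (Recall = 1.0) and ranks them very well (NDCG = 0.82), but only about 2–3 of the top 10 are exact matches (Precision = 0.23). Because recall is already 1.0, this precision mostly reflects that each user has only about 2–3 courses on average, which caps Precision@10 at roughly that level.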

In [None]:
# Save all necessary artifacts for deployment (v2)

import os
import pickle
import shutil

# Directory that collects every artifact before it is zipped.
BUNDLE_DIR = "deploy_bundle_1"


def _save_pickle(obj, filename):
    """Pickle `obj` to BUNDLE_DIR/filename and close the file right away.

    A `with` block guarantees the handle is flushed and closed before the
    directory is archived, instead of relying on garbage collection.
    """
    with open(os.path.join(BUNDLE_DIR, filename), "wb") as fh:
        pickle.dump(obj, fh)


# 1. Create deployment directory (new version)
os.makedirs(BUNDLE_DIR, exist_ok=True)

# 2. Save the trained CF model and content-based assets
_save_pickle(model_cf_final, "model_cf.pkl")
_save_pickle(feature_matrix, "feature_matrix.pkl")
_save_pickle(course_features, "course_features.pkl")

# 3. Save collaborative filtering mappings
_save_pickle(user_map, "user_map.pkl")
_save_pickle(item_map, "item_map.pkl")
_save_pickle(user_inv, "user_inv.pkl")
_save_pickle(item_inv, "item_inv.pkl")
_save_pickle(interaction_matrix, "interaction_matrix.pkl")

# 4. Save hybrid tuning weights
_save_pickle({"w_cf": best_w_cf, "w_cbf": best_w_cbf}, "best_weights.pkl")

# 5. Include label encoders and scaler if they exist
# A missing name raises NameError while the argument is evaluated, i.e. before
# any file is opened, so no empty file is left behind for the missing object.
try:
    _save_pickle(le_course, "le_course.pkl")
    _save_pickle(le_instructor, "le_instructor.pkl")
    _save_pickle(le_difficulty, "le_difficulty.pkl")
    _save_pickle(scaler, "minmax_scaler.pkl")
    print("‚úÖ Encoders & scaler saved.")
except NameError:
    print("‚ÑπÔ∏è Encoders/scaler not found ‚Äî skipping (safe to ignore if not used).")

# 6. Confirm completion
print("‚úÖ All core artifacts saved in 'deploy_bundle_1'.")

# 7. Create a zip archive for deployment (new version)
shutil.make_archive("hybrid_recommender_bundle_1", "zip", BUNDLE_DIR)
print("üì¶ Created 'hybrid_recommender_bundle_1.zip' for deployment.")