In [1]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

# Read Courses.csv from the working directory into a DataFrame
df = pd.read_csv(filepath_or_buffer="Courses.csv")

In [2]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

# Parse the Courses.csv file; the resulting frame is the input to the cleaning steps below
df = pd.read_csv(filepath_or_buffer="Courses.csv")

# 1. Remove specified columns
columns_to_remove = ['nplay_video', 'incomplete_flag', 'roles']
df_cleaned = df.drop(columns=columns_to_remove)

# 2. Handle missing values
# LoE_DI (categorical): fill with the most frequent level.
# mode() is empty when the column holds no observed value; in that case there
# is nothing to impute from, so the column is left as-is instead of crashing.
loe_mode = df_cleaned['LoE_DI'].mode()
if not loe_mode.empty:
    df_cleaned['LoE_DI'] = df_cleaned['LoE_DI'].fillna(loe_mode[0])

# gender: fill by sampling from the observed values, which keeps the observed
# proportions. The draw is seeded so that re-running the notebook produces the
# same cleaned dataset.
np.random.seed(42)
gender_missing = df_cleaned['gender'].isnull()
observed_genders = df_cleaned['gender'].dropna().to_numpy()
# Sampling from an empty pool raises, so only impute when there is something
# to fill and something to draw from.
if gender_missing.any() and len(observed_genders) > 0:
    df_cleaned.loc[gender_missing, 'gender'] = np.random.choice(
        observed_genders, int(gender_missing.sum())
    )

# YoB (numeric): fill with the median year
df_cleaned['YoB'] = df_cleaned['YoB'].fillna(df_cleaned['YoB'].median())

# 3. Drop rows with missing grades
df_cleaned = df_cleaned.dropna(subset=['grade'])

# 5. Define function for nchapters imputation
def impute_nchapters_simple(df):
    """Return ``nchapters`` with missing values filled by engagement rules.

    Observed ``nchapters`` values are kept unchanged. A missing value is
    replaced by the first rule that matches the row:

    * certified == 1                                   -> 16
    * certified == 0 and ndays_act > 3                 -> 3
    * certified == 0 and ndays_act <= 3                -> 1
    * ndays_act is missing and viewed == 1             -> 1
    * ndays_act is missing and viewed == 0             -> 0

    Rows with a missing ``nchapters`` that match none of the rules stay NaN.

    The function is deterministic and has no side effects: it neither adds a
    column to ``df`` nor touches the global NumPy random state.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``nchapters``, ``certified``, ``ndays_act``
        and ``viewed``.

    Returns
    -------
    pandas.Series
        Float series named ``nchapters_imputed`` that shares ``df``'s index.
    """
    missing = df['nchapters'].isna()
    not_certified = df['certified'] == 0
    no_activity_days = df['ndays_act'].isna()

    # np.select takes the first matching condition per row, so order matters.
    conditions = [
        # Certified students
        missing & (df['certified'] == 1),
        # Not certified but active (ndays_act > 3)
        missing & not_certified & (df['ndays_act'] > 3),
        # Not certified and some activity (ndays_act <= 3)
        missing & not_certified & (df['ndays_act'] <= 3),
        # No activity days recorded but the course was viewed
        missing & no_activity_days & (df['viewed'] == 1),
        # No activity days recorded and the course was never viewed
        missing & no_activity_days & (df['viewed'] == 0),
    ]
    values = [16, 3, 1, 1, 0]

    # Rows where no condition holds (including every row with an observed
    # value) fall back to the original nchapters entry.
    imputed = np.select(conditions, values, default=df['nchapters'])

    return pd.Series(imputed, index=df.index, name='nchapters_imputed')

# Compute the filled-in chapter counts and keep them next to the raw column
nchapters_filled = impute_nchapters_simple(df_cleaned)
df_cleaned['nchapters_imputed'] = nchapters_filled

# 6. nevents missing data
def impute_nevents(row):
    """Return ``nevents`` for one row, sampling a value when it is missing.

    An observed ``nevents`` is returned unchanged. A missing one is drawn from
    a small distribution chosen by the row's chapter count:

    * 0 chapters      -> 0
    * 1 chapter       -> 0 or 1        (p = 0.7, 0.3)
    * 2-5 chapters    -> 0, 1 or 2     (p = 0.5, 0.3, 0.2)
    * 6-12 chapters   -> 1, 2 or 3     (p = 0.4, 0.4, 0.2)
    * anything else   -> 2, 3 or 4     (p = 0.3, 0.4, 0.3)

    The chapter count is taken from ``nchapters_imputed`` when the row has a
    value there, otherwise from ``nchapters``. If both are missing the result
    is NaN: a NaN count fails every comparison above, so without this check it
    would fall into the last bucket and be treated as the most engaged group.

    Draws use the global NumPy random state; seed it before applying this
    function if reproducible output is required.

    Parameters
    ----------
    row : pandas.Series or mapping
        Must provide ``nevents`` and ``nchapters``; ``nchapters_imputed`` is
        optional.
    """
    # Only impute if the value is missing
    if not pd.isna(row['nevents']):
        return row['nevents']

    # Prefer the imputed chapter count; fall back to the raw one.
    chapters = row.get('nchapters_imputed', np.nan)
    if pd.isna(chapters):
        chapters = row['nchapters']
    if pd.isna(chapters):
        # Nothing to condition on: leave the value missing instead of guessing.
        return np.nan

    if chapters == 0:
        return 0
    elif chapters == 1:
        return np.random.choice([0, 1], p=[0.7, 0.3])
    elif 2 <= chapters <= 5:
        return np.random.choice([0, 1, 2], p=[0.5, 0.3, 0.2])
    elif 6 <= chapters <= 12:
        return np.random.choice([1, 2, 3], p=[0.4, 0.4, 0.2])
    else:
        return np.random.choice([2, 3, 4], p=[0.3, 0.4, 0.3])

# Fix the global NumPy seed so the sampled nevents values repeat across runs
np.random.seed(42)

# Run the row-wise imputation and store the result in its own column
nevents_filled = df_cleaned.apply(impute_nevents, axis=1)
df_cleaned['nevents_imputed'] = nevents_filled

# 7. Write the cleaned frame, without the index, to cleaned_dataset.csv
df_cleaned.to_csv('cleaned_dataset.csv', index=False)


In [3]:
import pandas as pd
import numpy as np

# Read back the dataset written by the cleaning cell
df_cleaned = pd.read_csv('cleaned_dataset.csv')

# Binary flags: coerce unparseable entries to NaN, treat NaN as 0, store as int
for flag_col in ('viewed', 'explored', 'certified'):
    flag_numeric = pd.to_numeric(df_cleaned[flag_col], errors='coerce')
    df_cleaned[flag_col] = flag_numeric.fillna(0).astype(int)

# Continuous measures: same coercion, but the values stay floating point
for measure_col in ('grade', 'nevents_imputed', 'nchapters_imputed'):
    measure_numeric = pd.to_numeric(df_cleaned[measure_col], errors='coerce')
    df_cleaned[measure_col] = measure_numeric.fillna(0)

# Identifier columns are handled as strings
for id_col in ('userid_DI', 'course_id'):
    df_cleaned[id_col] = df_cleaned[id_col].astype(str)

# Overwrite the CSV with the type-normalized frame
df_cleaned.to_csv('cleaned_dataset.csv', index=False)

# Print a plain-text overview of the cleaned dataset
print("\n=== Dataset Overview ===")
print(f"Number of rows: {len(df_cleaned)}")
print(f"Number of columns: {len(df_cleaned.columns)}")
print("\n=== Data Types ===")
print(df_cleaned.dtypes)

print("\n=== Basic Statistics ===")
print(df_cleaned.describe())

print("\n=== Missing Values ===")
print(df_cleaned.isnull().sum())

print("\n=== Sample of First Few Rows ===")
print(df_cleaned.head())

# Share of each value in the binary columns, as percentages
binary_cols = ['viewed', 'explored', 'certified']
print("\n=== Binary Columns Distribution ===")
for col in binary_cols:
    print(f"\n{col} value counts:")
    # Scale before rounding so no float artifacts (e.g. 63.099999) appear, and
    # attach the percent sign to every value instead of printing a single "%"
    # after the whole Series.
    share_pct = df_cleaned[col].value_counts(normalize=True).mul(100).round(1)
    print(share_pct.map("{:.1f}%".format).to_string())



=== Dataset Overview ===
Number of rows: 592766
Number of columns: 19

=== Data Types ===
course_id             object
userid_DI             object
registered             int64
viewed                 int32
explored               int32
certified              int32
final_cc_cname_DI     object
LoE_DI                object
YoB                  float64
gender                object
grade                float64
start_time_DI         object
last_event_DI         object
nevents              float64
ndays_act            float64
nchapters            float64
nforum_posts           int64
nchapters_imputed    float64
nevents_imputed      float64
dtype: object

=== Basic Statistics ===
       registered         viewed       explored      certified            YoB  \
count    592766.0  592766.000000  592766.000000  592766.000000  592766.000000   
mean          1.0       0.631195       0.065592       0.029838    1985.637746   
std           0.0       0.482481       0.247569       0.170141       8.2543

In [4]:
pip install pyspark




In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.sql import functions as F

# Build the ALS input from the cleaned dataset.
# ALS needs integer user and item ids, while userid_DI / course_id are strings,
# so each distinct label is mapped to a consecutive integer code. The label
# arrays are kept so that codes in the recommendations can be translated back
# (label = user_labels[userId], course_labels[itemId]).
user_codes, user_labels = pd.factorize(df_cleaned['userid_DI'])
course_codes, course_labels = pd.factorize(df_cleaned['course_id'])
data = pd.DataFrame({
    'user_index': user_codes,
    'course_index': course_codes,
    'nevents_imputed': df_cleaned['nevents_imputed'].to_numpy(),
})

# Initialize Spark Session
spark = SparkSession.builder \
    .appName("CourseRecommendationALS") \
    .getOrCreate()

# Convert pandas DataFrame to Spark DataFrame
spark_df = spark.createDataFrame(data)

# Rename columns to the names passed to ALS below
spark_df = spark_df.withColumnRenamed('user_index', 'userId')
spark_df = spark_df.withColumnRenamed('course_index', 'itemId')
spark_df = spark_df.withColumnRenamed('nevents_imputed', 'rating')

# Ensure id columns are integers and the rating is a float
spark_df = spark_df.withColumn('userId', spark_df['userId'].cast('integer'))
spark_df = spark_df.withColumn('itemId', spark_df['itemId'].cast('integer'))
spark_df = spark_df.withColumn('rating', spark_df['rating'].cast('float'))

# Split data into training and test sets (the test split is not used below)
(train, test) = spark_df.randomSplit([0.8, 0.2], seed=42)

# Initialize ALS model.
# implicitPrefs=True treats the event count as implicit feedback rather than
# as an explicit rating. The seed makes the factor initialisation repeatable,
# matching the seeded split above.
als = ALS(
    maxIter=20,
    regParam=0.1,
    rank=20,
    userCol="userId",
    itemCol="itemId",
    ratingCol="rating",
    implicitPrefs=True,
    coldStartStrategy="drop",
    seed=42
)

# Train the model
model = als.fit(train)

# Generate top 5 course recommendations for each user
user_recs = model.recommendForAllUsers(5)

# Display recommendations
user_recs.show(5, truncate=False)

# Stop the Spark session
spark.stop()
