<a href="https://colab.research.google.com/github/AmmarJamshed/saved-work/blob/main/ML_EDu_enrollment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

# POC Framework of EDX dataset for online education platforms in Pakistan with similar features in their datasets.

## Load the edX data and select the features that we believe are also available on online platforms in Pakistan:

['course_length', 'price', 'Level', 'subject', 'course_type']

In [None]:
# Load the edX course catalogue into a DataFrame.
# NOTE(review): '/content/...' looks like a Google Colab upload path — adjust
# it when running outside Colab.
df = pd.read_csv('/content/edx_courses.csv')

# Print the first few rows to eyeball the raw columns before cleaning.
print(df.head())

# Names of the predictor columns used by every model below.
# NOTE(review): these columns are assumed to exist in the CSV; the printed
# preview only confirms some of them — verify against df.columns.
features = ['course_length', 'price', 'Level', 'subject', 'course_type']
target = 'n_enrolled'  # Column the models try to predict (enrolment count)


                                               title  \
0                                How to Learn Online   
1  Programming for Everybody (Getting Started wit...   
2            CS50's Introduction to Computer Science   
3                                 The Analytics Edge   
4  Marketing Analytics: Marketing Measurement Str...   

                                             summary n_enrolled  \
0  Learn essential strategies for successful onli...    124,980   
1  This course is a "no prerequisite" introductio...    293,864   
2  An introduction to the intellectual enterprise...  2,442,271   
3  Through inspiring examples and stories, discov...    129,555   
4     This course is part of a MicroMasters® Program     81,140   

                           course_type                            institution  \
0              Self-paced on your time                                    edX   
1              Self-paced on your time             The University of Michigan   
2              Se

- We checked the data

In [None]:
# Check missing values in the selected feature columns and in the target.
dfnew = df[features]
# Print explicitly: a notebook only displays the last expression of a cell, so
# a bare intermediate `.isnull().sum()` is computed and then silently dropped.
print(dfnew.isnull().sum())

# Missing values in the target column; left as the final expression so the
# notebook still displays this count as the cell result.
dfn = df[target]
dfn.isnull().sum()

120

- There were 120 Missing values in target variable -> 'n_enrolled'

# Transform columns and fill Missing values in n_enrolled



In [None]:
# Turn the raw text columns into numeric model inputs.

# Strip thousands separators from 'n_enrolled' and fill missing values with 0
# (assuming courses with no enrollment data have 0 enrollments).
# NOTE(review): imputing a missing *target* with 0 teaches the models that
# "unknown enrolment" means "no enrolment"; dropping those rows may be more
# defensible — confirm before relying on the reported metrics.
df['n_enrolled'] = df['n_enrolled'].str.replace(',', '').fillna(0).astype(int)

# Continue with the rest of the data cleaning steps.
# The patterns below are raw strings so that '\d' reaches the regex engine
# instead of being parsed as an (invalid) string escape sequence.

# Convert 'course_length' to numeric by keeping its first run of digits
# (presumably the number of weeks — confirm against the raw column).
df['course_length'] = df['course_length'].str.extract(r'(\d+)').astype(float)

# Keep the first number found in 'course_effort' (the minimum effort per week).
df['course_effort'] = df['course_effort'].str.extract(r'(\d+)').astype(float)

# Keep the first number found in 'price' (the certificate price if applicable).
df['price'] = df['price'].str.extract(r'(\d+)').astype(float)

# Handle missing values in price (assume free for missing prices).
# Assign the result back rather than calling fillna(..., inplace=True) on the
# column selection: pandas warns that the selection behaves as a copy, and the
# in-place form stops updating `df` in pandas 3.0.
df['price'] = df['price'].fillna(0)

# Encode the categorical columns as integer codes.
from sklearn.preprocessing import LabelEncoder

# A single encoder is refit for every column, so only the mapping of the last
# column remains available on `label_encoder` after the loop.
label_encoder = LabelEncoder()
for column in ['Level', 'subject', 'course_type']:
    df[column] = label_encoder.fit_transform(df[column])

# Prepare feature set and target
features = ['course_length', 'price', 'Level', 'subject', 'course_type']
X = df[features]
y = df['n_enrolled']

# Check the cleaned dataset
X.head(), y.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['price'].fillna(0, inplace=True)


(   course_length  price  Level  subject  course_type
 0            2.0   49.0      2       10            1
 1            7.0   49.0      2        6            1
 2           12.0   90.0      2        6            1
 3           13.0  199.0      1        7            0
 4            4.0  249.0      2        6            1,
 0     124980
 1     293864
 2    2442271
 3     129555
 4      81140
 Name: n_enrolled, dtype: int64)

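- We transformed all feature variables: the numeric part of the text columns (`course_length`, `price`) was extracted and converted to float, and the categorical columns (`Level`, `subject`, `course_type`) were label-encoded to integer codes.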

## Scale the data and apply Linear Regression

In [None]:
# Split the data, standardize it, and fit/evaluate a Linear Regression baseline.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the feature variables (statistics come from the training split only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Standardize the target with its OWN scaler. Refitting `scaler` on the target
# would overwrite the feature statistics, leaving no fitted object that can
# transform new feature rows, and no dedicated object for mapping predictions
# back to enrolment counts via `target_scaler.inverse_transform`.
target_scaler = StandardScaler()
y_train_scaled = target_scaler.fit_transform(y_train.values.reshape(-1, 1))
y_test_scaled = target_scaler.transform(y_test.values.reshape(-1, 1))

# Initialize the Linear Regression model
model = LinearRegression()

# Train the model on the training set
model.fit(X_train_scaled, y_train_scaled)

# Make predictions on the test set
y_pred = model.predict(X_test_scaled)

# Evaluate the Linear Regression model. All metrics are in standardized target
# units, so an RMSE near 1 means "about as good as predicting the mean".
mse = mean_squared_error(y_test_scaled, y_pred)
lr_rmse = np.sqrt(mean_squared_error(y_test_scaled, y_pred))
lr_r2 = r2_score(y_test_scaled,  y_pred)
print(f" RMSE: {lr_rmse}")
print(f"R² Score: {lr_r2}")

 RMSE: 0.838433832665797
R² Score: 0.07707709778831318


- We standardized the features and the target before fitting Linear Regression because the value ranges, especially of the target variable, were very wide.
- Linear Regression gave a high RMSE and a low R² score, showing that it is not a good fit for this data.

### Scale the Data and Apply Random Forest Regression  and Decision Tree Regression

In [None]:
# Fit untuned Random Forest and Decision Tree regressors and predict on the test set.
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

# Initialize the Random Forest and Decision Tree models
random_forest_model = RandomForestRegressor(random_state=42)
decision_tree_model = DecisionTreeRegressor(random_state=42)

# The scaled target is an (n, 1) column; these estimators expect a 1-D target
# and emit a DataConversionWarning otherwise, so flatten it once here.
y_train_flat = np.ravel(y_train_scaled)

# Train the Random Forest model on the training set
random_forest_model.fit(X_train_scaled, y_train_flat)

# Train the Decision Tree model on the training set
decision_tree_model.fit(X_train_scaled, y_train_flat)

# Make predictions using Random Forest
rf_y_pred = random_forest_model.predict(X_test_scaled)

# Make predictions using Decision Tree
dt_y_pred = decision_tree_model.predict(X_test_scaled)

  return fit_method(estimator, *args, **kwargs)


In [None]:

# Evaluate the Random Forest model using Root Mean Squared Error (RMSE) and R².
# Both metrics are computed on the standardized target, not on raw enrolments.
rf_rmse = np.sqrt(mean_squared_error(y_test_scaled, rf_y_pred))
rf_r2 = r2_score(y_test_scaled,  rf_y_pred)
# NOTE(review): the printed labels do not name the model; this output belongs
# to the Random Forest.
print(f" RMSE: {rf_rmse}")
print(f"R² Score: {rf_r2}")

 RMSE: 0.7654081763416066
R² Score: 0.23084471592011546


In [None]:
# Evaluate the Decision Tree model on the standardized target.
# Every other model in this notebook is compared on RMSE, so report RMSE here
# as well; printing only the MSE makes the tree look worse than it is when the
# value is placed next to the other models' RMSE figures.
dt_mse = mean_squared_error(y_test_scaled, dt_y_pred)
dt_rmse = np.sqrt(dt_mse)
dt_r2 = r2_score(y_test_scaled, dt_y_pred)
print(f"Decision Tree MSE: {dt_mse}")
print(f"Decision Tree RMSE: {dt_rmse}")
print(f"Decision Tree r2: {dt_r2}")

Decision Tree MSE: 1.8340234736653933
Decision Tree r2: -1.4078682684253123


- Random Forest and Decision Tree both showed high error and low R² scores. The Decision Tree performed poorly (negative R²), while Random Forest yielded some level of accuracy and outperformed both the Decision Tree and Linear Regression.

### Scale the Data and Apply XGboost

In [None]:
# Gradient-boosted trees (XGBoost) fitted on the standardized features/target.
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor

# Single-step pipeline wrapping the regressor; there is no preprocessing step
# because the inputs were already scaled in an earlier cell.
xgb_model = Pipeline(
    [
        (
            "regressor",
            XGBRegressor(random_state=42, learning_rate=0.1, n_estimators=100),
        ),
    ]
)

# Fit on the training split, then score the held-out split.
xgb_model.fit(X_train_scaled, y_train_scaled)
xgb_preds = xgb_model.predict(X_test_scaled)

# Metrics are expressed in standardized target units.
xgb_r2 = r2_score(y_test_scaled, xgb_preds)
xgb_rmse = np.sqrt(mean_squared_error(y_test_scaled, xgb_preds))
print(f"XGBoost RMSE: {xgb_rmse}")
print(f"XGBoost R² Score: {xgb_r2}")

XGBoost RMSE: 1.0151295592319989
XGBoost R² Score: -0.3529158404233128


- Gradient Boosting did not perform as well as we hoped as it yielded negative r2 score and high RMSE.

### Scale the data and apply Neural Network

In [None]:
# Feed-forward neural network regressor on the standardized features/target.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import set_random_seed

# Seed Python, NumPy and the Keras backend so that weight initialisation,
# dropout masks and batch shuffling (and therefore the reported metrics) are
# repeatable between runs. Note this also reseeds NumPy's global generator.
set_random_seed(42)

# Neural Network model.
# The input shape is declared with an explicit Input layer; passing
# `input_shape=` to the first Dense layer triggers a Keras UserWarning.
nn_model = Sequential([
    Input(shape=(X_train_scaled.shape[1],)),
    Dense(128, activation="relu"),
    Dropout(0.3),
    Dense(64, activation="relu"),
    Dropout(0.2),
    Dense(1)  # Single output for regression
])

# Compile the model
nn_model.compile(optimizer=Adam(learning_rate=0.001), loss="mse", metrics=["mse"])

# Train the model (20% of the training split is held out for validation)
history = nn_model.fit(X_train_scaled, y_train_scaled, validation_split=0.2, epochs=50, batch_size=32, verbose=0)

# Make predictions
nn_preds = nn_model.predict(X_test_scaled)

# Evaluate the model (metrics are in standardized target units)
nn_rmse = np.sqrt(mean_squared_error(y_test_scaled, nn_preds))
nn_r2 = r2_score(y_test_scaled, nn_preds)
print(f"Neural Network RMSE: {nn_rmse}")
print(f"Neural Network R² Score: {nn_r2}")

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
Neural Network RMSE: 0.8175172579024549
Neural Network R² Score: 0.12255138562003343


- The Neural Network performed better than every other model except the Random Forest Regressor, which remained the best model.

| Model             | RMSE               | R² Score            |
|-------------------|--------------------|---------------------|
| Neural Network    | 0.8175172579024549 | 0.12255138562003343 |
| XGBoost           | 1.0151295592319989 | -0.3529158404233128 |
| Linear Regression | 0.838433832665797  | 0.07707709778831318 |
| Random Forest     | 0.7654081763416066  | 0.23084471592011546 |
| Decision Tree     | ≈1.3543 (the evaluation cell reported MSE = 1.8340234736653933; RMSE = √MSE) | -1.4078682684253123 |


# Hyper Param Tuning for Decision Tree and Random Forest

In [None]:
# Exhaustive grid search (5-fold CV) over Random Forest and Decision Tree
# hyperparameters, followed by evaluation of each best model on the test split.
from sklearn.model_selection import GridSearchCV

# The scaled target is an (n, 1) column; the tree estimators expect a 1-D
# target and otherwise emit a DataConversionWarning on every one of the
# cross-validation fits, so flatten it once up front.
y_train_flat = np.ravel(y_train_scaled)

# --- Random Forest Hyperparameter Tuning ---
rf_params = {
    'n_estimators': [50, 100, 200],            # Number of trees in the forest
    'max_depth': [None, 10, 20, 30],           # Maximum depth of each tree
    'min_samples_split': [2, 5, 10],           # Minimum number of samples required to split
    'min_samples_leaf': [1, 2, 4],             # Minimum number of samples per leaf
    'bootstrap': [True, False]                 # Whether bootstrap samples are used
}

random_forest_model = RandomForestRegressor(random_state=42)

# Perform Grid Search for Random Forest.
# scoring is the NEGATED MSE because GridSearchCV always maximises its score.
rf_grid = GridSearchCV(estimator=random_forest_model, param_grid=rf_params,
                       scoring='neg_mean_squared_error', cv=5, verbose=2, n_jobs=-1)

rf_grid.fit(X_train_scaled, y_train_flat)

# Best parameters for Random Forest
print("Best Random Forest Parameters:", rf_grid.best_params_)

# Evaluate the tuned Random Forest model (refit on the full training split)
rf_best_model = rf_grid.best_estimator_
rf_y_pred = rf_best_model.predict(X_test_scaled)
rf_mse = mean_squared_error(y_test_scaled, rf_y_pred)
rf_r2 = r2_score(y_test_scaled, rf_y_pred)
print(f"Tuned Random Forest MSE: {rf_mse}")
print(f"Tuned Random Forest r2: {rf_r2}")

# --- Decision Tree Hyperparameter Tuning ---
dt_params = {
    'max_depth': [None, 10, 20, 30],           # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],           # Minimum number of samples required to split
    'min_samples_leaf': [1, 2, 4],             # Minimum number of samples per leaf
    'criterion': ['squared_error', 'absolute_error']  # Splitting criteria
}

decision_tree_model = DecisionTreeRegressor(random_state=42)

# Perform Grid Search for Decision Tree
dt_grid = GridSearchCV(estimator=decision_tree_model, param_grid=dt_params,
                       scoring='neg_mean_squared_error', cv=5, verbose=2, n_jobs=-1)

dt_grid.fit(X_train_scaled, y_train_flat)

# Best parameters for Decision Tree
print("Best Decision Tree Parameters:", dt_grid.best_params_)

# Evaluate the tuned Decision Tree model (refit on the full training split)
dt_best_model = dt_grid.best_estimator_
dt_y_pred = dt_best_model.predict(X_test_scaled)
dt_mse = mean_squared_error(y_test_scaled, dt_y_pred)
dt_r2 = r2_score(y_test_scaled, dt_y_pred)
print(f"Tuned Decision Tree MSE: {dt_mse}")
print(f"Tuned Decision Tree r2: {dt_r2}")

Fitting 5 folds for each of 216 candidates, totalling 1080 fits


  return fit_method(estimator, *args, **kwargs)


Best Random Forest Parameters: {'bootstrap': True, 'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 200}
Tuned Random Forest MSE: 0.5231898084238092
Tuned Random Forest r2: 0.31311013399981913
Fitting 5 folds for each of 72 candidates, totalling 360 fits
Best Decision Tree Parameters: {'criterion': 'absolute_error', 'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10}
Tuned Decision Tree MSE: 0.7137660890582013
Tuned Decision Tree r2: 0.0629047329004706


- We were able to slightly improve MSE and R² with hyperparameter tuning; max depth was a vital factor for both models. The results could likely be improved further with access to more computational resources.

| Model              | Best Parameters                                                                                  | MSE               | r2 Score |
|--------------------|------------------------------------------------------------------------------------------------|-------------------|-------------------|
| Random Forest      | {'bootstrap': True, 'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 200} | 0.52 | 0.31 |
| Decision Tree      | {'criterion': 'absolute_error', 'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10} | 0.71 | 0.06 |


### Apply Diffusion model approach to existing models

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.preprocessing import StandardScaler

class DiffusionModeling:
    """Benchmark several regressors after padding the training set with Gaussian noise rows.

    The constructor standardizes features and target (statistics fitted on the
    training split only) and registers five models. ``apply_diffusion`` appends
    synthetic rows to the training data, ``evaluate_models`` fits every model
    and stores its test-set MSE and R² in ``self.results``, and
    ``display_results`` prints them as a markdown table.

    NOTE(review): despite the name, no diffusion process is implemented here.
    The synthetic rows are independent Gaussian draws for the features and for
    the target, so they carry no feature-target relationship.
    """

    def __init__(self, X_train, X_test, y_train, y_test):
        """Scale the given splits and register the models to benchmark.

        Args:
            X_train: Training features (array-like, one row per sample).
            X_test: Test features with the same columns as ``X_train``.
            y_train: Training target values; reshaped to a column internally.
            y_test: Test target values; reshaped to a column internally.
        """
        # Ensure that data is in the correct format (numpy arrays)
        X_train = np.array(X_train)
        X_test = np.array(X_test)
        y_train = np.array(y_train).reshape(-1, 1)  # StandardScaler needs 2D input
        y_test = np.array(y_test).reshape(-1, 1)

        # Separate scalers so features and target keep their own statistics
        self.feature_scaler = StandardScaler()
        self.target_scaler = StandardScaler()

        # Scale independent features
        self.X_train = self.feature_scaler.fit_transform(X_train)
        self.X_test = self.feature_scaler.transform(X_test)

        # Scale target variable
        self.y_train = self.target_scaler.fit_transform(y_train)
        self.y_test = self.target_scaler.transform(y_test)

        # The neural network entry is a bound method (it builds, trains and
        # scores itself); every other entry is an estimator with fit/predict.
        self.models = {
            "Neural Network": self._neural_network,
            "XGBoost": XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42),
            "Linear Regression": LinearRegression(),
            "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
            "Decision Tree": DecisionTreeRegressor(random_state=42),
        }
        self.results = {}

    def apply_diffusion(self, n_synthetic=1000, random_state=None):
        """Append ``n_synthetic`` Gaussian rows to the scaled training data.

        Each synthetic feature is drawn from a normal distribution with that
        column's mean and standard deviation; the synthetic target is drawn
        independently from a normal distribution with the target's mean and
        standard deviation. Calling this method again appends further rows.

        Args:
            n_synthetic: Number of synthetic rows to append.
            random_state: Optional integer seed. When given, the rows are drawn
                from a dedicated generator so the augmentation is repeatable;
                when ``None`` (the default) NumPy's global generator is used,
                exactly as before this parameter existed.
        """
        rng = np.random if random_state is None else np.random.RandomState(random_state)

        # Create synthetic data
        synthetic_X = rng.normal(
            self.X_train.mean(axis=0),
            self.X_train.std(axis=0),
            size=(n_synthetic, self.X_train.shape[1]),
        )
        synthetic_y = rng.normal(
            self.y_train.mean(), self.y_train.std(), size=n_synthetic
        ).reshape(-1, 1)  # Column shape so it stacks under y_train

        # Stack the synthetic data under the original data
        self.X_train = np.vstack([self.X_train, synthetic_X])
        self.y_train = np.vstack([self.y_train, synthetic_y])

    def evaluate_models(self):
        """Fit every registered model and record its test MSE and R² in ``self.results``."""
        # The stored targets are (n, 1) columns; estimators expect a 1-D target
        # and emit a DataConversionWarning otherwise, so flatten once.
        y_train_flat = self.y_train.ravel()
        y_test_flat = self.y_test.ravel()

        for name, model in self.models.items():
            if name == "Neural Network":
                preds, mse, r2 = model()
            else:
                model.fit(self.X_train, y_train_flat)
                preds = model.predict(self.X_test)
                mse = mean_squared_error(y_test_flat, preds)
                r2 = r2_score(y_test_flat, preds)

            self.results[name] = {
                "MSE": mse,
                "R²": r2
            }

    def _neural_network(self):
        """Build, train and score a small dense network.

        Returns:
            Tuple ``(preds, mse, r2)``: flattened test predictions and the test
            MSE and R², all in standardized target units.
        """
        # Imported locally so the class does not depend on a file-level import
        # of Input being present.
        from tensorflow.keras.layers import Input

        # Declare the input shape with an explicit Input layer; passing
        # `input_shape=` to the first Dense layer triggers a Keras UserWarning.
        model = Sequential([
            Input(shape=(self.X_train.shape[1],)),
            Dense(64, activation="relu"),
            Dense(32, activation="relu"),
            Dense(1)
        ])
        model.compile(optimizer="adam", loss="mse")

        # Train the model
        model.fit(self.X_train, self.y_train, epochs=50, batch_size=32, verbose=0)

        # Predictions and evaluation
        preds = model.predict(self.X_test).flatten()
        mse = mean_squared_error(self.y_test, preds)
        r2 = r2_score(self.y_test, preds)
        return preds, mse, r2

    def display_results(self):
        """Print ``self.results`` as a markdown table, one row per model."""
        print("| Model             | MSE               | R² Score            |")
        print("|-------------------|--------------------|---------------------|")
        for name, metrics in self.results.items():
            print(f"| {name:<17} | {metrics['MSE']:<18} | {metrics['R²']:<20} |")

# Run the augmentation benchmark on the train/test split created earlier.
# X_train, X_test, y_train, y_test must already exist; the UNSCALED splits are
# passed because the constructor fits its own feature and target scalers.
# The constructor converts its inputs with np.array, so pandas objects are fine.
diffusion_model = DiffusionModeling(X_train, X_test, y_train, y_test)
# Append the default number of synthetic rows to the scaled training data.
diffusion_model.apply_diffusion()
# Fit every registered model and collect its test MSE / R².
diffusion_model.evaluate_models()
# Print the collected metrics as a markdown table.
diffusion_model.display_results()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step 


  return fit_method(estimator, *args, **kwargs)


| Model             | MSE               | R² Score            |
|-------------------|--------------------|---------------------|
| Neural Network    | 0.7722477566664627 | -0.01387517408572525 |
| XGBoost           | 0.49583239694288234 | 0.34902736404460555  |
| Linear Regression | 0.7228698261861444 | 0.050952541410440366 |
| Random Forest     | 0.5020833049634731 | 0.3408206189904951   |
| Decision Tree     | 0.7741825650174489 | -0.01641536165732349 |


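- After the "diffusion" augmentation step, in which synthetic Gaussian samples are added to the training set to help the models capture data trends, XGBoost, Random Forest and Decision Tree improved compared with their untuned baselines, while the Neural Network and Linear Regression got worse (lower R²).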

# Challenges

- Limited computational resources.
- Limited data.
- We did not have a Pakistan-specific platform dataset, which is why we used edX to create a proof of concept for education platforms in Pakistan, which we noted may have similar variables for their online course marketplaces.