<a href="https://colab.research.google.com/github/AmmarJamshed/saved-work/blob/main/ML_EDu_enrollment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
# Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

In [16]:
# Load the Edx dataset
df = pd.read_csv('/content/edx_courses.csv')

# Displaying first few rows of the dataset
print(df.head())

# Selecting relevant features for prediction
# Assume these columns exist in the dataset based on your description
features = ['course_effort', 'length', 'price', 'Level', 'subject', 'course_type']
target = 'enrolled_students'  # This is the target variable we're predicting


                                               title  \
0                                How to Learn Online   
1  Programming for Everybody (Getting Started wit...   
2            CS50's Introduction to Computer Science   
3                                 The Analytics Edge   
4  Marketing Analytics: Marketing Measurement Str...   

                                             summary n_enrolled  \
0  Learn essential strategies for successful onli...    124,980   
1  This course is a "no prerequisite" introductio...    293,864   
2  An introduction to the intellectual enterprise...  2,442,271   
3  Through inspiring examples and stories, discov...    129,555   
4     This course is part of a MicroMasters® Program     81,140   

                           course_type                            institution  \
0              Self-paced on your time                                    edX   
1              Self-paced on your time             The University of Michigan   
2              Se

In [17]:
# Fill missing values in 'n_enrolled' with 0 (assuming courses with no enrollment data have 0 enrollments)
df['n_enrolled'] = df['n_enrolled'].str.replace(',', '').fillna(0).astype(int)

# Continue with the rest of the data cleaning steps

# Convert 'course_length' to numeric (extracting the number of weeks)
df['course_length'] = df['course_length'].str.extract('(\d+)').astype(float)

# Extract the minimum effort per week from 'course_effort'
df['course_effort'] = df['course_effort'].str.extract('(\d+)').astype(float)

# Extract the numeric part of 'price' (we'll use the price of the certificate if applicable)
df['price'] = df['price'].str.extract('(\d+)').astype(float)

# Handle missing values in price (assume free for missing prices)
df['price'].fillna(0, inplace=True)

# Create dummy variables for categorical features
df = pd.get_dummies(df, columns=['Level', 'subject', 'course_type'], drop_first=True)

# Prepare feature set and target
features = ['course_effort', 'course_length', 'price'] + [col for col in df.columns if 'Level_' in col or 'subject_' in col or 'course_type_' in col]
X = df[features]
y = df['n_enrolled']

# Check the cleaned dataset
X.head(), y.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['price'].fillna(0, inplace=True)


(   course_effort  course_length  price  Level_Intermediate  \
 0            2.0            2.0   49.0               False   
 1            2.0            7.0   49.0               False   
 2            6.0           12.0   90.0               False   
 3           10.0           13.0  199.0                True   
 4            5.0            4.0  249.0               False   
 
    Level_Introductory  subject_Art & Culture  subject_Biology & Life Sciences  \
 0                True                  False                            False   
 1                True                  False                            False   
 2                True                  False                            False   
 3               False                  False                            False   
 4                True                  False                            False   
 
    subject_Business & Management  subject_Chemistry  subject_Communication  \
 0                          False              

In [18]:
# Split the dataset into training and testing sets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the feature variables
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize the Linear Regression model
model = LinearRegression()

# Train the model on the training set
model.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test_scaled)

# Evaluate the model using Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)
mse, y_pred[:5], y_test[:5]  # Return the MSE and first 5 predictions vs actual values for review

(1.8097291534993742e+36,
 array([55866.14200614, 14906.14200614, 48698.14200614, 90682.14200614,
        47162.14200614]),
 199    66764
 789        0
 174    37126
 467     9503
 66     48938
 Name: n_enrolled, dtype: int64)