In [1]:
# Record the interpreter version this notebook was executed with (shell escape).
!python3 --version

Python 3.12.4


In [51]:
import pandas as pd

import pickle

from sklearn.feature_extraction import DictVectorizer

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import mean_squared_error

import mlflow

In [3]:
# Load the raw course-engagement CSV into a DataFrame and display it.
df = pd.read_csv(filepath_or_buffer='./data/online_course_engagement_data.csv')
df

Unnamed: 0,UserID,CourseCategory,TimeSpentOnCourse,NumberOfVideosWatched,NumberOfQuizzesTaken,QuizScores,CompletionRate,DeviceType,CourseCompletion
0,5618,Health,29.979719,17,3,50.365656,20.860773,1,0
1,4326,Arts,27.802640,1,5,62.615970,65.632415,1,0
2,5849,Arts,86.820485,14,2,78.458962,63.812007,1,1
3,4992,Science,35.038427,17,10,59.198853,95.433162,0,1
4,3866,Programming,92.490647,16,0,98.428285,18.102478,0,0
...,...,...,...,...,...,...,...,...,...
8995,8757,Health,37.445225,14,4,54.469359,32.990704,1,0
8996,894,Science,48.631443,7,7,59.413257,0.254625,0,0
8997,6323,Health,38.212512,3,3,69.508297,70.188159,1,0
8998,3652,Health,70.048665,13,10,79.655182,72.975225,1,1


In [4]:
# (rows, columns) of the loaded frame.
df.shape

(9000, 9)

In [5]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
df.describe()

Unnamed: 0,UserID,TimeSpentOnCourse,NumberOfVideosWatched,NumberOfQuizzesTaken,QuizScores,CompletionRate,DeviceType,CourseCompletion
count,9000.0,9000.0,9000.0,9000.0,9000.0,9000.0,9000.0,9000.0
mean,4498.894556,50.163822,10.024667,5.090556,74.706028,50.340146,0.500667,0.396444
std,2596.849433,28.49175,6.029878,3.157762,14.378383,28.950977,0.500027,0.489186
min,1.0,1.00523,0.0,0.0,50.005119,0.009327,0.0,0.0
25%,2251.75,25.440548,5.0,2.0,62.283451,25.653614,0.0,0.0
50%,4483.5,49.818417,10.0,5.0,74.743294,50.264124,1.0,0.0
75%,6751.25,75.069924,15.0,8.0,87.022663,75.572493,1.0,1.0
max,9000.0,99.992558,20.0,10.0,99.994984,99.979711,1.0,1.0


In [15]:
# Training split: the first 6000 rows by position, in file order (no shuffling).
df_train = df.iloc[:6000]
df_train

Unnamed: 0,UserID,CourseCategory,TimeSpentOnCourse,NumberOfVideosWatched,NumberOfQuizzesTaken,QuizScores,CompletionRate,DeviceType,CourseCompletion
0,5618,Health,29.979719,17,3,50.365656,20.860773,1,0
1,4326,Arts,27.802640,1,5,62.615970,65.632415,1,0
2,5849,Arts,86.820485,14,2,78.458962,63.812007,1,1
3,4992,Science,35.038427,17,10,59.198853,95.433162,0,1
4,3866,Programming,92.490647,16,0,98.428285,18.102478,0,0
...,...,...,...,...,...,...,...,...,...
5995,2080,Science,25.605039,7,6,57.301319,23.932510,1,0
5996,1017,Health,37.116992,0,4,93.741189,70.426685,0,1
5997,8695,Science,92.607477,3,9,71.935087,54.513815,1,1
5998,6350,Arts,13.789208,8,5,59.714599,13.730824,0,0


In [54]:
# Persist the training split to disk. The DataFrame index is written as well
# (pandas default), so the CSV gains a leading unnamed index column.
df_train.to_csv(path_or_buf='./data/online_course_engagement_train_data.csv')

In [16]:
# Validation split: every row from position 6000 onward, in file order.
df_val = df.iloc[6000:]
df_val

Unnamed: 0,UserID,CourseCategory,TimeSpentOnCourse,NumberOfVideosWatched,NumberOfQuizzesTaken,QuizScores,CompletionRate,DeviceType,CourseCompletion
6000,4465,Arts,62.738969,19,6,59.439384,46.700103,1,0
6001,746,Arts,12.262434,10,6,99.656095,91.740781,0,1
6002,5747,Science,51.139651,12,6,83.683911,4.592798,1,1
6003,2948,Programming,3.030581,13,8,83.971620,64.534325,1,1
6004,8971,Arts,96.493438,13,1,72.146767,79.496045,0,1
...,...,...,...,...,...,...,...,...,...
8995,8757,Health,37.445225,14,4,54.469359,32.990704,1,0
8996,894,Science,48.631443,7,7,59.413257,0.254625,0,0
8997,6323,Health,38.212512,3,3,69.508297,70.188159,1,0
8998,3652,Health,70.048665,13,10,79.655182,72.975225,1,1


In [55]:
# Persist the validation split to disk. The DataFrame index is written as well
# (pandas default), so the CSV gains a leading unnamed index column.
df_val.to_csv(path_or_buf='./data/online_course_engagement_val_data.csv')

In [17]:
# List the column names of the loaded frame.
df.columns

Index(['UserID', 'CourseCategory', 'TimeSpentOnCourse',
       'NumberOfVideosWatched', 'NumberOfQuizzesTaken', 'QuizScores',
       'CompletionRate', 'DeviceType', 'CourseCompletion'],
      dtype='object')

In [18]:
# Peek at the course-category column.
df['CourseCategory']

0            Health
1              Arts
2              Arts
3           Science
4       Programming
           ...     
8995         Health
8996        Science
8997         Health
8998         Health
8999         Health
Name: CourseCategory, Length: 9000, dtype: object

In [19]:
# Peek at the device-type column.
df.loc[:, 'DeviceType']

0       1
1       1
2       1
3       0
4       0
       ..
8995    1
8996    0
8997    1
8998    1
8999    0
Name: DeviceType, Length: 9000, dtype: int64

In [20]:
# Feature columns fed to the model, grouped by how they are meant to be treated.
# NOTE(review): DeviceType appears to hold integers; DictVectorizer only one-hot
# encodes string values, so it will likely be passed through as a plain numeric
# feature -- confirm this is intended.
categorical = [
    'DeviceType',
    'CourseCategory',
]
numerical = [
    'TimeSpentOnCourse',
    'NumberOfVideosWatched',
    'NumberOfQuizzesTaken',
]

In [21]:
# One {column: value} dict per training row, restricted to the feature columns.
train_dict = df_train[[*categorical, *numerical]].to_dict(orient='records')

In [22]:
# One {column: value} dict per validation row, restricted to the feature columns.
val_dict = df_val[[*categorical, *numerical]].to_dict(orient='records')

In [23]:
# Unfitted DictVectorizer with default settings; it turns the per-row feature
# dicts into a feature matrix.
dv = DictVectorizer()

In [24]:
# Learn the feature vocabulary from the training records only.
X_train = dv.fit_transform(train_dict)
# Reuse the fitted vocabulary for validation. Calling fit_transform here would
# re-fit `dv` on validation data, so the validation matrix would no longer be
# guaranteed to share the training feature layout, and any later use of `dv`
# (e.g. pickling it for inference) would carry the validation-fitted state.
X_val = dv.transform(val_dict)

In [25]:
# Label column and the matching label arrays for each split.
target = 'CourseCompletion'
y_train = df_train[target].to_numpy()
y_val = df_val[target].to_numpy()

In [40]:
# Logistic-regression classifier; the iteration cap is raised to 10,000.
logr = LogisticRegression(max_iter=10_000)

In [46]:
# Fit the classifier on the training matrix and labels.
logr.fit(X=X_train, y=y_train)

In [47]:
# Predicted class labels for the validation rows.
y_pred = logr.predict(X=X_val)
y_pred

array([1, 0, 0, ..., 0, 1, 0])

In [57]:
# Mean squared error between true and predicted validation labels.
# NOTE(review): if both arrays hold only 0/1 labels, this equals the
# misclassification rate (1 - accuracy) -- confirm that is the intended metric.
mean_squared_error(y_true=y_val, y_pred=y_pred)

0.30566666666666664

In [49]:
# Serialize the vectorizer and the classifier together as one (dv, logr) tuple,
# so whoever loads the file gets the feature encoder alongside the model.
with open('models/log_reg.bin', 'wb') as model_file:
    pickle.dump((dv, logr), model_file)

In [52]:
# Point MLflow at the local tracking server, then select the experiment that
# subsequent runs are recorded under.
mlflow.set_tracking_uri(uri="http://127.0.0.1:5000")
mlflow.set_experiment(experiment_name="online-course-engagement-prediction-experiment-1")

2024/07/10 13:47:37 INFO mlflow.tracking.fluent: Experiment with name 'online-course-engagement-prediction-experiment-1' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/1', creation_time=1720615657983, experiment_id='1', last_update_time=1720615657983, lifecycle_stage='active', name='online-course-engagement-prediction-experiment-1', tags={}>

In [59]:
# Train a logistic-regression model inside one MLflow run and record its
# inputs, hyperparameters, validation metric and serialized artifact.
with mlflow.start_run():
    mlflow.set_tag("Developer", "Agnes")
    mlflow.log_param("train-data-path", "./data/online_course_engagement_train_data.csv")
    # Was "./data/./data/..." -- a duplicated prefix that pointed at a
    # non-existent location.
    mlflow.log_param("valid-data-path", "./data/online_course_engagement_val_data.csv")

    # Log hyperparameters from the estimator itself so the recorded values can
    # never drift from what was actually trained.
    logr = LogisticRegression(C=1, max_iter=10000)
    mlflow.log_param("C", logr.C)
    mlflow.log_param("max_iter", logr.max_iter)
    logr.fit(X_train, y_train)

    y_pred = logr.predict(X_val)
    # mean_squared_error returns the MSE; take the square root so the logged
    # value matches the "rmse" metric name.
    rmse = mean_squared_error(y_val, y_pred) ** 0.5
    mlflow.log_metric("rmse", rmse)

    # Re-serialize the vectorizer with the model fitted in THIS run before
    # logging, so the stored artifact corresponds to the logged metric rather
    # than to whatever an earlier cell left on disk.
    with open("models/log_reg.bin", "wb") as f_out:
        pickle.dump((dv, logr), f_out)
    mlflow.log_artifact(local_path="models/log_reg.bin", artifact_path="models_pickle")