In [1]:
import pandas as pd
from datetime import datetime

- Number of courses enrolled in: This could indicate the student's level of commitment and interest.
- Last active time: Indicates how recently the student engaged with the platform, potentially reflecting their current interest.
- Demographic Information: Age, gender, location: Depending on privacy concerns and available data, these attributes might provide insights into user behavior.

In [2]:
student_info_df = pd.read_csv('data/raw/365_student_info.csv')
course_ratings_df = pd.read_csv('data/raw/365_course_ratings.csv')
student_engagement_df = pd.read_csv('data/raw/365_student_engagement.csv')
student_purchases_df = pd.read_csv('data/raw/365_student_purchases.csv')

In [3]:
student_engagement_df.date_engaged = pd.to_datetime(student_engagement_df.date_engaged)
student_info_df.date_registered = pd.to_datetime(student_info_df.date_registered)
student_purchases_df.date_purchased = pd.to_datetime(student_purchases_df.date_purchased)

In [4]:
# We assume that if the course rating is above 4, it means that a student participated in it.

In [5]:
def num_course_rolled(student_id: str):
    rating_threshold = 4
    return course_ratings_df[(course_ratings_df.student_id == student_id) & (course_ratings_df.course_rating >= rating_threshold)].course_id.nunique()

course_ratings_df['enrolled_in'] = course_ratings_df.student_id.map(num_course_rolled)

In [6]:
df = pd.merge(student_info_df.student_id, course_ratings_df[['student_id', 'enrolled_in']], on='student_id', how='left')
df = df.fillna(0)
df = df.drop_duplicates(subset='student_id')

df

Unnamed: 0,student_id,rolled_in
0,258798,0.0
1,258799,0.0
2,258800,0.0
3,258801,0.0
4,258802,0.0
...,...,...
36438,295511,0.0
36439,295512,0.0
36440,295513,0.0
36441,295514,0.0


In [7]:
student_info_df

Unnamed: 0,student_id,student_country,date_registered
0,258798,IN,2022-01-01
1,258799,CO,2022-01-01
2,258800,CA,2022-01-01
3,258801,IN,2022-01-01
4,258802,US,2022-01-01
...,...,...,...
35225,295511,CM,2022-10-20
35226,295512,CA,2022-10-20
35227,295513,SA,2022-10-20
35228,295514,NL,2022-10-20


In [8]:
df = student_engagement_df.drop_duplicates('student_id', keep='last')[['student_id', 'date_engaged']]

df['latest_date_recorded'] = datetime(2022, 10, 20).date()
df.latest_date_recorded = pd.to_datetime(df.latest_date_recorded)

df['days_since_inactivity'] = (df.latest_date_recorded - df.date_engaged).dt.days

df

Unnamed: 0,student_id,date_engaged,latest_date_recorded,days_since_inactivity
22,275178,2022-05-27,2022-10-20,146
27,275180,2022-05-27,2022-10-20,146
31,275184,2022-05-27,2022-10-20,146
51,258804,2022-01-01,2022-10-20,292
57,258811,2022-01-13,2022-10-20,280
...,...,...,...,...
65366,295185,2022-10-20,2022-10-20,0
65367,282935,2022-10-20,2022-10-20,0
65368,292048,2022-10-20,2022-10-20,0
65369,295443,2022-10-20,2022-10-20,0


In [16]:
df = student_engagement_df.groupby('student_id').size().reset_index()
df

Unnamed: 0,student_id,0
0,258798,1
1,258800,29
2,258803,19
3,258804,1
4,258807,1
...,...,...
18339,295509,1
18340,295510,1
18341,295511,1
18342,295512,1


In [15]:
student_purchases_df[student_purchases_df.student_id == 284482]

Unnamed: 0,purchase_id,student_id,purchase_type,date_purchased
2338,21724,284482,Monthly,2022-08-19
2614,22324,284482,Monthly,2022-09-19
2623,22339,284482,Annual,2022-09-19
