# Data exploration

In [1]:
import pandas as pd
import numpy as np

In [4]:
# Read the raw e-learning event log (columns: Date, UserID, CourseID, Event)
# and show it as the cell output.
dataset_path = "../data/elearning_dataset.csv"
dfg = pd.read_csv(dataset_path)
dfg

Unnamed: 0,Date,UserID,CourseID,Event
0,2021-01-10 10:13:00,4048,455,view_course
1,2021-01-10 10:14:12,6162,742,view_course
2,2021-01-10 10:14:52,7852,652,view_course
3,2021-01-10 10:15:40,6162,742,rundown
4,2021-01-10 10:16:35,7852,652,teacher_profile
...,...,...,...,...
52404,2021-03-09 15:16:09,1143,3609,detailed_description
52405,2021-03-09 15:17:52,1273,245,view_course
52406,2021-03-09 15:19:43,1273,245,detailed_description
52407,2021-03-09 15:20:44,1273,245,institution


In [36]:
# Basic facts about the event log.
# Only the last expression of a cell is rendered, so the intermediate values
# are printed explicitly instead of being evaluated and thrown away.

# Distinct event types present in the log
print("Event types:", dfg["Event"].unique().tolist())
# Number of distinct users (7747 when this notebook was first run)
print("Distinct users:", dfg["UserID"].nunique())
# Number of distinct courses (2076 when this notebook was first run)
print("Distinct courses:", dfg["CourseID"].nunique())

# count / unique / top / freq summary of the object-typed columns
dfg.describe(include='O')

Unnamed: 0,Event
count,52409
unique,5
top,view_course
freq,27001


## Data cleaning

In [12]:
# Almost no cleaning is needed - the data is already clean.
# info() lists the dtype and the non-null count of every column, which is
# what the statement above is based on.
dfg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52409 entries, 0 to 52408
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Date      52409 non-null  object
 1   UserID    52409 non-null  int64 
 2   CourseID  52409 non-null  int64 
 3   Event     52409 non-null  object
dtypes: int64(2), object(2)
memory usage: 1.6+ MB


## Data preparation

In [19]:
# Time handling matters a lot here: parse the timestamps once, then derive
# the month and day-of-month attributes from the parsed values.
event_time = pd.to_datetime(dfg['Date'])
dfg['Date'] = event_time
dfg["Month"] = event_time.dt.month
dfg['Day'] = event_time.dt.day

In [32]:
# Count the non-null entries of every column per event type. Passing a list
# to agg() yields two-level columns such as ('Date', 'count').
groupbyEvent = dfg.groupby("Event").agg(["count"])
# Event names, in the order they appear in the grouped index
groupbyEvent.index.tolist()

['detailed_description',
 'institution',
 'rundown',
 'teacher_profile',
 'view_course']

# Data visualization

In [35]:
from pyecharts import options as opts
from pyecharts.charts import Bar

# Categories (event names) and bar heights (rows per event, taken from the
# ('Date', 'count') column of the grouped frame).
event_names = groupbyEvent.index.values.tolist()
event_counts = groupbyEvent[('Date', 'count')].values.tolist()

# Same chart as a fluent chain would build, written as separate statements.
event_bar = Bar()
event_bar.add_xaxis(event_names)
event_bar.add_yaxis("Event", event_counts)
event_bar.set_global_opts(
    # Tilt the axis labels so the long event names do not overlap
    xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=-15)),
    title_opts=opts.TitleOpts(title="用户事件分布", subtitle="-"),
)

# render() writes the chart to an HTML file; `c` keeps whatever render() returns.
c = event_bar.render("用户事件分布.html")

# Recommender

## Behavioural Implicit Ratings

The implicit rating ${IR}_{(i,u)}$ of item $i$ for user $u$ is computed with the formula introduced during the lecture:

$${IR}_{(i,u)} = \left(w_1 \cdot {\#event}_1\right)+\left(w_2 \cdot {\#event}_2\right)+\dots+\left(w_n \cdot {\#event}_n\right)$$

1. Create an empty user-course matrix (one row per user, one column per course, every cell initially NaN)

In [39]:
# One row per distinct user and one column per distinct course, both in order
# of first appearance in the log. No data is passed, so every cell starts as NaN.
user_ids = dfg["UserID"].unique()
course_ids = dfg["CourseID"].unique()
ucMatrix = pd.DataFrame(index=user_ids, columns=course_ids)
ucMatrix

Unnamed: 0,455,742,652,3694,86,1704,1804,433,3253,1103,...,2188,3554,869,3332,36,3227,3425,428,2702,3459
4048,,,,,,,,,,,...,,,,,,,,,,
6162,,,,,,,,,,,...,,,,,,,,,,
7852,,,,,,,,,,,...,,,,,,,,,,
5960,,,,,,,,,,,...,,,,,,,,,,
3823,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7674,,,,,,,,,,,...,,,,,,,,,,
3729,,,,,,,,,,,...,,,,,,,,,,
2933,,,,,,,,,,,...,,,,,,,,,,
257,,,,,,,,,,,...,,,,,,,,,,


2. Assign a weight to each event type

In [40]:
# Weight contributed to a user-course rating by one occurrence of each event type.
eventWeights = dict(
    detailed_description=25,
    institution=25,
    teacher_profile=40,
    rundown=50,
    view_course=10,
)

In [38]:
# Show the event log again; it now also carries the Month and Day columns
# derived from Date during data preparation.
dfg

Unnamed: 0,Date,UserID,CourseID,Event,Month,Day
0,2021-01-10 10:13:00,4048,455,view_course,1,10
1,2021-01-10 10:14:12,6162,742,view_course,1,10
2,2021-01-10 10:14:52,7852,652,view_course,1,10
3,2021-01-10 10:15:40,6162,742,rundown,1,10
4,2021-01-10 10:16:35,7852,652,teacher_profile,1,10
...,...,...,...,...,...,...
52404,2021-03-09 15:16:09,1143,3609,detailed_description,3,9
52405,2021-03-09 15:17:52,1273,245,view_course,3,9
52406,2021-03-09 15:19:43,1273,245,detailed_description,3,9
52407,2021-03-09 15:20:44,1273,245,institution,3,9


Compute the implicit rating (IR) for each user-course combination, then
populate the user-course matrix `ucMatrix` with the IR values.

In [41]:
# Build the implicit-rating matrix in one vectorised pass over the evidence
# instead of visiting the rows one by one and updating a single cell each time.
# The matrix is recomputed from `dfg` rather than added onto whatever
# `ucMatrix` currently holds, so re-running this cell (even after the
# normalization step) cannot double-count the weights.

# Translate every logged event into its weight.
rowWeights = dfg['Event'].map(eventWeights)

# An event without an entry in `eventWeights` must fail loudly (KeyError)
# rather than silently drop out of the sums.
unweightedEvents = dfg.loc[rowWeights.isna(), 'Event'].unique().tolist()
if unweightedEvents:
    raise KeyError(f"No weight defined for event(s): {unweightedEvents}")

# IR(user, course) = sum of the weights of all events logged for that pair;
# unstack() turns the (UserID, CourseID) index into a user x course table.
implicitRatings = (
    rowWeights
    .groupby([dfg['UserID'], dfg['CourseID']])
    .sum()
    .unstack()
)

# Keep the row/column order of the existing matrix; user-course pairs without
# any evidence stay NaN. The result is float64 instead of object dtype.
ucMatrix = implicitRatings.reindex(index=ucMatrix.index, columns=ucMatrix.columns)

In [45]:
# Display the user-course matrix to inspect the implicit ratings stored in it.
ucMatrix

Unnamed: 0,455,742,652,3694,86,1704,1804,433,3253,1103,...,2188,3554,869,3332,36,3227,3425,428,2702,3459
4048,0.666667,,,,,,,,,,...,,,,,,,,,,
6162,,4,,,,,,,,,...,,,,,,,,,,
7852,,,3.33333,,,,,,,,...,,,,,,,,,,
5960,,,,5.66667,,,,,,,...,,,,,,,,,,
3823,,,,,0.666667,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7674,,,,,,,,,,,...,,,,,,,,,,
3729,,,,,,,,,,,...,,,,,,,,,,
2933,,,,,,,,,,,...,,,,,,,,,,
257,,,,,,,,,,,...,,,,,,,,,,


Update the user-course matrix by normalizing its values to the range 0 to 10: divide every rating by the largest rating in the matrix and multiply by 10.

In [44]:
# Scale by the largest rating (NaN cells ignored) so the maximum becomes 10.
highestRating = np.nanmax(ucMatrix.values)
ucMatrix = ucMatrix / highestRating * 10