- 数据可视化探索分析
- 数据切割成测试集和训练集(28或者K-fold crossvalidation)需要吗？
- 训练集 User-based CF 和Item-based CF
- 用训练集的user展示
- 用测试集进行评估性能

# Data exploring

In [1]:
import pandas as pd
import numpy as np

In [57]:
dfg = pd.read_csv("../data/elearning_dataset.csv")
dfg

Unnamed: 0,Date,UserID,CourseID,Event
0,2021-01-10 10:13:00,4048,455,view_course
1,2021-01-10 10:14:12,6162,742,view_course
2,2021-01-10 10:14:52,7852,652,view_course
3,2021-01-10 10:15:40,6162,742,rundown
4,2021-01-10 10:16:35,7852,652,teacher_profile
...,...,...,...,...
52404,2021-03-09 15:16:09,1143,3609,detailed_description
52405,2021-03-09 15:17:52,1273,245,view_course
52406,2021-03-09 15:19:43,1273,245,detailed_description
52407,2021-03-09 15:20:44,1273,245,institution


In [36]:
# detect basic information
dfg.Event.unique()

len(dfg.UserID.unique()) #7747个用户
len(dfg.CourseID.unique()) # 2076个课程
dfg.describe(include = 'O')

Unnamed: 0,Event
count,52409
unique,5
top,view_course
freq,27001


## Data cleaning

In [12]:
# 基本不需要清洗，都是干净的
dfg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52409 entries, 0 to 52408
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Date      52409 non-null  object
 1   UserID    52409 non-null  int64 
 2   CourseID  52409 non-null  int64 
 3   Event     52409 non-null  object
dtypes: int64(2), object(2)
memory usage: 1.6+ MB


## Data preparing

In [19]:
#时间处理特别重要,获取月日属性
#dfg['Date'] = pd.to_datetime(dfg['Date'])
#dfg["Month"] = dfg['Date'].dt.month
#dfg['Day'] = dfg['Date'].dt.day

In [58]:
groupbyEvent = dfg.groupby("Event").agg(['count'])
groupbyEvent.index.values.tolist()
#groupbyEvent[('Date', 'count')].values

['detailed_description',
 'institution',
 'rundown',
 'teacher_profile',
 'view_course']

# Data visualization

In [59]:
from pyecharts import options as opts
from pyecharts.charts import Bar

c = (
    Bar()
    .add_xaxis(groupbyEvent.index.values.tolist())
    .add_yaxis("Event",groupbyEvent[('Date', 'count')].values.tolist())
    .set_global_opts(
        xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=-15)),
        title_opts=opts.TitleOpts(title="用户事件分布", subtitle="-"),
    )
    .render("用户事件分布.html")
)

# Recommender

## Behavioural Implicit Ratings

Using the formula introduced during lecture

$${IR}_(i,u) = \left(w_1*{\#event}_1\right)+\left(w_2*{\#event}_2\right)+\dots+\left(w_n*{\#event}_n\right)$$

1. Create a user-course binary matrix

In [60]:
ucMatrix = pd.DataFrame(columns = dfg.CourseID.unique(),index = dfg.UserID.unique())
ucMatrix

Unnamed: 0,455,742,652,3694,86,1704,1804,433,3253,1103,...,2188,3554,869,3332,36,3227,3425,428,2702,3459
4048,,,,,,,,,,,...,,,,,,,,,,
6162,,,,,,,,,,,...,,,,,,,,,,
7852,,,,,,,,,,,...,,,,,,,,,,
5960,,,,,,,,,,,...,,,,,,,,,,
3823,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7674,,,,,,,,,,,...,,,,,,,,,,
3729,,,,,,,,,,,...,,,,,,,,,,
2933,,,,,,,,,,,...,,,,,,,,,,
257,,,,,,,,,,,...,,,,,,,,,,


2. Give a weight to each of event

In [61]:
eventWeights = {
    'detailed_description': 25,
    'institution': 25,
    'teacher_profile': 40,
    'rundown': 50,
    'view_course': 10}

In [62]:
dfg

Unnamed: 0,Date,UserID,CourseID,Event
0,2021-01-10 10:13:00,4048,455,view_course
1,2021-01-10 10:14:12,6162,742,view_course
2,2021-01-10 10:14:52,7852,652,view_course
3,2021-01-10 10:15:40,6162,742,rundown
4,2021-01-10 10:16:35,7852,652,teacher_profile
...,...,...,...,...
52404,2021-03-09 15:16:09,1143,3609,detailed_description
52405,2021-03-09 15:17:52,1273,245,view_course
52406,2021-03-09 15:19:43,1273,245,detailed_description
52407,2021-03-09 15:20:44,1273,245,institution


Compute the Implicit Rating for each users-courses combination.
Populate the users-courses matrix `ucMatrix` with the IR values.

In [41]:
# # Iterate the evidence
# for index,row in dfg.iterrows():
#     # select the user and items involved
#     currentUser = row['UserID']
#     currentCourse = row['CourseID']
#     currentEvent = row['Event']
#     # Extract the appropriate weight for the event
#     w = eventWeights[currentEvent]
#     # Find the value eventually stored for the current users-courses combination
#     currentValue = ucMatrix.at[currentUser, currentCourse]
#     if np.isnan(currentValue):
#         currentValue = 0        
#     # Compute the new value and update the user-item matrix
#     updatedValue = currentValue + w #+ (1 * w)
#     ucMatrix.at[currentUser, currentCourse] = updatedValue

Limit the number of relevant events to a specific threshold (e.g. 10).

In [50]:
# 给每个用户建立个计数器
# users = dfg.UserID.unique()
# counter = {
#     'detailed_description':0, 'institution':0, 'teacher_profile':0, 'rundown':0, 'view_course':0
# }
# users_counter = {user:counter for user in users}
# users_counter[4048]['detailed_description']

0

In [None]:
# # Exercise 2-2
# # throw a threshold to filter out more related events
# threshold = 10
# for index,row in dfg.iterrows():
#     # select the user and items involved
#     currentUser = row['UserID']
#     currentCourse = row['CourseID']
#     currentEvent = row['Event']
#     # Extract the appropriate weight for the event
#     w = eventWeights[currentEvent]
#     # Find the value eventually stored for the current users-courses combination
#     currentValue = ucMatrix.at[currentUser, currentCourse]
#     if np.isnan(currentValue):
#         currentValue = 0  
#     if users_counter[currentUser][currentEvent] > threshold:
#         pass
#     else:
#     # Compute the new value and update the user-item matrix
#         updatedValue = currentValue + w #+ (1 * w)
#         ucMatrix.at[currentUser, currentCourse] = updatedValue

In [45]:
ucMatrix

Unnamed: 0,455,742,652,3694,86,1704,1804,433,3253,1103,...,2188,3554,869,3332,36,3227,3425,428,2702,3459
4048,0.666667,,,,,,,,,,...,,,,,,,,,,
6162,,4,,,,,,,,,...,,,,,,,,,,
7852,,,3.33333,,,,,,,,...,,,,,,,,,,
5960,,,,5.66667,,,,,,,...,,,,,,,,,,
3823,,,,,0.666667,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7674,,,,,,,,,,,...,,,,,,,,,,
3729,,,,,,,,,,,...,,,,,,,,,,
2933,,,,,,,,,,,...,,,,,,,,,,
257,,,,,,,,,,,...,,,,,,,,,,


Update the user-item matrix by normalizing the values between 0 and 10

In [44]:
ucMatrix = ucMatrix/np.nanmax(ucMatrix.values)*10

## Behavioural Implicit Ratings with Decay
$${IRDecay}_{(i,u)} = \sum_{i=1}^n w_i*{\#event}_i*d\left({\#event}_i\right) = \left(w_1*{\#event}_1*d\left({\#event}_1\right)\right)+\left(w_2*{\#event}_2*d\left({\#event}_2\right)\right)+\dots+\left(w_n*{\#event}_n*d\left({\#event}_n\right)\right)$$

In [73]:
# 看是不是需要根据数据的时间作调整，因为数据比较旧了
import datetime
from datetime import date, timedelta, datetime
def compute_decay(eventDate, decayDays):
    age = (date.today() - datetime.strptime(eventDate, '%Y-%m-%d %H:%M:%S').date()) // timedelta(days=decayDays)
    #print("Age of event:", age)
    decay = 1/age #simple decay
    #print("Decay factor:", decay)
    
    return decay

In [72]:
print(date.today())
#2021-01-10 10:13:00
print(date.today() - datetime.strptime("2021-01-10 10:13:00", '%Y-%m-%d %H:%M:%S').date())

2021-04-14
94 days, 0:00:00


In [51]:
dfg

Unnamed: 0,Date,UserID,CourseID,Event,Month,Day
0,2021-01-10 10:13:00,4048,455,view_course,1,10
1,2021-01-10 10:14:12,6162,742,view_course,1,10
2,2021-01-10 10:14:52,7852,652,view_course,1,10
3,2021-01-10 10:15:40,6162,742,rundown,1,10
4,2021-01-10 10:16:35,7852,652,teacher_profile,1,10
...,...,...,...,...,...,...
52404,2021-03-09 15:16:09,1143,3609,detailed_description,3,9
52405,2021-03-09 15:17:52,1273,245,view_course,3,9
52406,2021-03-09 15:19:43,1273,245,detailed_description,3,9
52407,2021-03-09 15:20:44,1273,245,institution,3,9


In [74]:
for index,row in dfg.iterrows():
    # select the user and items involved
    currentUser = row['UserID']
    currentCourse = row['CourseID']
    currentEvent = row['Event']
    eventDate = row['Date']
    thresholdDays = 2
    # Extract the appropriate weight for the event
    w = eventWeights[currentEvent]
    # Find the value eventually stored for the current users-courses combination
    currentValue = ucMatrix.at[currentUser, currentCourse]
    if np.isnan(currentValue):
        currentValue = 0        
    # Compute the new value and update the user-item matrix
    updatedValue = currentValue + w #+ (1 * w)
    ucMatrix.at[currentUser, currentCourse] = updatedValue
    # Extract the appropriate weight for the event
    w = eventWeights[currentEvent]
    # Find the value eventually stored for the current users-courses combination
    currentValue = ucMatrix.at[currentUser, currentCourse]
    if np.isnan(currentValue):
        currentValue = 0        
    # Compute the new value and update the user-item matrix
    updatedValue = currentValue + w * compute_decay(eventDate,thresholdDays)
    ucMatrix.at[currentUser, currentCourse] = updatedValue

# User-based Collaborative Filtering

## Step 1: Compute Similarity between the active user and the rest of the users
For any user existed in the dataset.

For our needs we need to find one without all the ratings already filled 

- Normalise the matrix 
- Update the users-courses matrix by normalizing the values between 0 and 10.

In [75]:
ucMatrixNorm = ucMatrix.apply(
    lambda x: ((x - np.nanmin(ucMatrix.values))/(np.nanmax(ucMatrix.values) - np.nanmin(ucMatrix.values)))*10
    )

In [113]:
ucMatrixNorm

Unnamed: 0,455,742,652,3694,86,1704,1804,433,3253,1103,...,2188,3554,869,3332,36,3227,3425,428,2702,3459
4048,0.675126,,,,,,,,,,...,,,,,,,,,,
6162,,3.44745,,,,,,,,,...,,,,,,,,,,
7852,,,2.75796,,,,,,,,...,,,,,,,,,,
5960,,,,5.171175,,,,,,,...,,,,,,,,,,
3823,,,,,0.0,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7674,,,,,,,,,,,...,,,,,,,,,,
3729,,,,,,,,,,,...,,,,,,,,,,
2933,,,,,,,,,,,...,,,,,,,,,,
257,,,,,,,,,,,...,,,,,,,,,,


In [115]:
# 查看某一用户有多少用户评价
ucMatrixNorm.loc[:4048].isnull().sum(axis=1)

4048    2073
dtype: int64

In [116]:
ucMatrixNorm.isnull().sum(axis=1)

4048    2073
6162    2071
7852    2070
5960    2072
3823    2075
        ... 
7674    2075
3729    2075
2933    2075
257     2075
6639    2075
Length: 7747, dtype: int64

In [84]:
#currentUser = userId
currentUser = 6162
ucMatrixNorm = ucMatrixNorm.astype(float)

In [86]:
cuDf = ucMatrixNorm.loc[currentUser]
cuDf

455         NaN
742     3.44745
652         NaN
3694        NaN
86          NaN
         ...   
3227        NaN
3425        NaN
428         NaN
2702        NaN
3459        NaN
Name: 6162, Length: 2076, dtype: float64

In [87]:
# 计算其他行跟当前所需预测的用户的pearson相关系数
corrDf = ucMatrixNorm.corrwith(cuDf, axis=1, method='pearson')

  c = cov(x, y, rowvar)
  c *= np.true_divide(1, fact)


In [88]:
# 进行排序
corrDf.sort_values(ascending=False, inplace=True)
corrDf

7008    1.0
6162    1.0
4048    NaN
7852    NaN
5960    NaN
       ... 
7674    NaN
3729    NaN
2933    NaN
257     NaN
6639    NaN
Length: 7747, dtype: float64

In [90]:
corrDf[corrDf>0.3]
corrDf.drop(labels=[currentUser], inplace=True)
# select the top-k with k
corrDf = corrDf.head(3)
#把当前用户没有看过的课的信息筛选出来，也就是需要预测的课
toPredict = cuDf[cuDf.isna()]
toPredict.index
#把准备用来预测的用户信息拿出来，就是最相似的那几个用户
ratings = ucMatrixNorm.loc[corrDf.index]
#找出需要预测的课程信息在相似用户的评价
ratingsToPredict = ratings[toPredict.index]
#选一个手段去预测，加权？平均


KeyError: '[6162] not found in axis'

In [94]:
ratingsToPredict.mean()

455     0.675126
652     2.757960
3694         NaN
86           NaN
1704         NaN
          ...   
3227         NaN
3425         NaN
428          NaN
2702         NaN
3459         NaN
Length: 2071, dtype: float64

# Item-based Collaborative Filtering
## Step 1: Compute Similarity between an item and the rest of the items

In [121]:
#设定好User和Item
#Item-based CF Find items similar to items that you already like
currentUser = 6162
currentCourse = 3425
ucMatrixNorm

Unnamed: 0,455,742,652,3694,86,1704,1804,433,3253,1103,...,2188,3554,869,3332,36,3227,3425,428,2702,3459
4048,0.675126,,,,,,,,,,...,,,,,,,,,,
6162,,3.44745,,,,,,,,,...,,,,,,,,,,
7852,,,2.75796,,,,,,,,...,,,,,,,,,,
5960,,,,5.171175,,,,,,,...,,,,,,,,,,
3823,,,,,0.0,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7674,,,,,,,,,,,...,,,,,,,,,,
3729,,,,,,,,,,,...,,,,,,,,,,
2933,,,,,,,,,,,...,,,,,,,,,,
257,,,,,,,,,,,...,,,,,,,,,,


Find all the co-rated items
convert all the values in the matrix as floating point numbers (previusly were object type)


In [128]:
ucMatrixNorm = ucMatrixNorm.astype(float)
# Drop users that didn't rate actual item
#选中一个一个用户，那个用户也评价了这个课程
#即删除这个课程中没有评价的用户记录
ucMatrixSelection = ucMatrixNorm.dropna(subset = [currentCourse])
ucMatrixSelection
# Drop items that are not co-rated with the actual one
#删除掉评价了课程的用户没有评价的课程信息
ucMatrixSelection = ucMatrixSelection.dropna(axis = 1)
ucMatrixSelection

Unnamed: 0,3423,1951,3425
6242,4.55479,2.764641,3.574462


In [132]:
#Compute the average rating for the current user
# 计算
cuAvgRating = ucMatrixNorm.loc[[currentUser]].dropna(axis=1).mean(axis=1)
cuAvgRating

6162    2.441352
dtype: float64

In [130]:
ouAvgRating = ucMatrixSelection.mean(axis=1)
ouAvgRating

6242    3.631298
dtype: float64