In [1]:
import numpy as np
import pandas as pd
import scipy.stats as st
import os
import matplotlib.pyplot as plt

In [2]:
# Master lists: the lesson catalogue (has an `ItemId` column) and the sampled users.
lesson_ids, user_ids = map(pd.read_csv, ('all_lesson_items.csv', 'sample_user_ids.csv'))

# First export of the four activity tables.
# NOTE(review): presumably these cover the 'L'-prefixed items - confirm.
(lesson_bookmarks,
 lesson_comment_votes,
 lesson_comments,
 lesson_impressions) = map(pd.read_csv, (
    'lesson_bookmarks.csv',
    'lesson_comment_votes.csv',
    'lesson_comments.csv',
    'lesson_impressions.csv',
))

# Second export of the same four activities; stacked onto the first set further down.
# NOTE(review): presumably these cover the 'U'-prefixed items - confirm.
(user_lesson_bookmarks,
 user_lesson_comment_votes,
 user_lesson_comments,
 user_lesson_impressions) = map(pd.read_csv, (
    'user_lesson_bookmarks.csv',
    'user_lesson_comment_votes.csv',
    'user_lesson_comments.csv',
    'user_lesson_impressions.csv',
))

In [3]:
# Size of the lesson catalogue as (rows, columns).
lesson_ids.shape

(1319, 1)

In [4]:
# Size of the sampled user list as (rows, columns).
user_ids.shape

(10024, 1)

In [5]:
# Stack the two exports of each activity into a single table per activity.
# Row labels of the inputs are kept as they are (no ignore_index).
item_bookmarks, item_comment_votes, item_comments, item_impressions = (
    pd.concat([first_export, second_export])
    for first_export, second_export in (
        (lesson_bookmarks, user_lesson_bookmarks),
        (lesson_comment_votes, user_lesson_comment_votes),
        (lesson_comments, user_lesson_comments),
        (lesson_impressions, user_lesson_impressions),
    )
)

In [6]:
# Two halves of the full user x lesson grid, both with a fresh 0..N-1 index:
#   df  - every user id repeated once per lesson (all rows of a user are contiguous)
#   df2 - the complete lesson list tiled once per user
df = pd.DataFrame(
    user_ids.values.repeat(len(lesson_ids), axis=0),
    columns=['UserId'],
)
df2 = pd.concat(
    [lesson_ids for _ in range(len(user_ids))],
    ignore_index=True,
)

In [7]:
# Pair each repeated user id with a lesson id. Assignment aligns on the index,
# and both frames carry the same default 0..N-1 index, so rows match by position.
df['LessonId'] = df2.ItemId
# Quick visual check of the first few (user, lesson) pairs.
df.head()

Unnamed: 0,UserId,LessonId
0,2080,U322
1,2080,U634
2,2080,U642
3,2080,U643
4,2080,U649


In [8]:
# Left-join each activity table onto the grid, in the original order, so every
# (user, lesson) pair is kept and pairs without a matching activity row get NaN.
for activity_table in (item_bookmarks, item_comment_votes, item_comments, item_impressions):
    df = df.merge(activity_table, how='left', on=['UserId', 'LessonId'])

In [9]:
# Eyeball 10 random (user, lesson) rows. No random_state is passed, so the
# rows shown change on every run.
df.sample(10)

Unnamed: 0,UserId,LessonId,Bookmark,Votes,Comments,Views
4190432,8861666,L2930,,,,
10066775,11460795,U708,,,,
9804002,11340283,L2712,,,,
10530598,11678802,L2452,,,,
9086257,11011031,L2492,,,,
5048926,9237925,L2378,,,,
6178623,9731566,U4192,,,,
3507450,8195185,U1062,,,,
10441839,11637060,L2251,,,,
6443440,9849916,U7666,,,,


In [10]:
# Sanity check: the row count should still equal len(user_ids) * len(lesson_ids);
# extra rows would mean a left merge matched duplicate (UserId, LessonId) keys.
df.shape

(13221656, 6)

In [11]:
# NaN here comes from the left merges finding no activity row for the pair;
# count it as zero activity. This mutates df in place.
df.fillna(0,inplace=True)

In [12]:
# Lessons that no user in the grid has viewed: catalogue ids minus the ids of
# every row whose Views count is non-zero.
set(lesson_ids['ItemId'].values).difference(df.loc[df['Views'] != 0, 'LessonId'])

{'U1010', 'U16839'}

In [13]:
# Activity columns of df that are analysed and later blended into the rating.
columns = ['Views', 'Comments', 'Votes', 'Bookmark']

# Percentile levels reported for each activity's non-zero counts.
percentiles = [50,75,80,90,95,99,99.5,99.9]

# NOTE(review): not referenced by any cell visible in this notebook; the
# threshold table is built from range(1,21) instead. Confirm whether still needed.
thresholds = np.arange(1,11)

In [14]:
# Per activity, keep only the strictly positive counts so that the percentile
# statistics describe pairs that actually interacted.
non_zero_arrays = {
    activity: df.loc[df[activity] > 0, activity].values
    for activity in columns
}

# Number of (user, lesson) pairs with at least one event of each activity.
for column in columns:
    positive_counts = non_zero_arrays[column]
    print("Total User-{}:\t {}".format(column, len(positive_counts)))

Total User-Views:	 267805
Total User-Comments:	 809
Total User-Votes:	 4484
Total User-Bookmark:	 2912


In [15]:
# Report, per activity, the count found at each percentile level of the
# non-zero counts.
print('{:<30} {:<30} {:<30}'.format('Activity', 'Percentile', 'Threshold'))
for column in columns:
    print('------------------------------------------------------------------------')
    # One vectorised call yields the value for every requested level.
    cut_offs = np.percentile(non_zero_arrays[column], percentiles)
    for percentile, cut_off in zip(percentiles, cut_offs):
        print('{:<30} {:<30} {:<30.0f}'.format(column, percentile, cut_off))
        

Activity                       Percentile                     Threshold                     
------------------------------------------------------------------------
Views                          50                             1                             
Views                          75                             2                             
Views                          80                             2                             
Views                          90                             3                             
Views                          95                             4                             
Views                          99                             7                             
Views                          99.5                           9                             
Views                          99.9                           14                            
------------------------------------------------------------------------
Comments         

In [16]:
# Inverse view of the percentile report: for counts 1..20, the percentage of
# non-zero observations that are <= that count ('weak' percentile), rounded to
# 3 decimals. The result is stored in `thres[column][count]` for the rating step.
print('{:<30} {:<30} {:<30}'.format('Activity', 'Threshold', 'Percentile'))
thres = {}
for column in columns:
    print('------------------------------------------------------------------------')
    observed = non_zero_arrays[column]
    thres[column] = per_count = {}
    for threshold in range(1, 21):
        score = np.around(st.percentileofscore(observed, threshold, kind='weak'), 3)
        per_count[threshold] = score
        print('{:<30} {:<30} {:<30.3f}'.format(column, threshold, score))

Activity                       Threshold                      Percentile                    
------------------------------------------------------------------------
Views                          1                              69.110                        
Views                          2                              86.912                        
Views                          3                              93.565                        
Views                          4                              96.511                        
Views                          5                              97.938                        
Views                          6                              98.729                        
Views                          7                              99.175                        
Views                          8                              99.438                        
Views                          9                              99.611                      

In [17]:
def calc_rating(col_name, table=None):
    """Build a converter from a raw activity count to a 0-100 percentile score.

    Parameters
    ----------
    col_name : str
        Activity whose threshold table should be used, e.g. 'Views'.
    table : dict, optional
        Mapping ``{col_name: {count: percentile}}``. When omitted, the
        notebook-level ``thres`` dict is used; it is looked up each time the
        converter is called, so a rebuilt ``thres`` is picked up automatically.

    Returns
    -------
    callable
        ``rate(x)`` returning 0 for a zero or missing (NaN/None) count, 100 for
        a count larger than the number of tabulated thresholds, and the
        tabulated percentile otherwise.

    Raises
    ------
    KeyError
        From ``rate(x)`` when ``x`` is within range but is not a key of the
        table (e.g. a fractional or negative count).
    """
    def rate(x):
        # A missing count means "no activity", the same meaning fillna(0) gives it;
        # without this guard NaN fell through to the dict lookup and raised KeyError.
        if pd.isna(x) or x == 0:
            return 0
        lookup = (thres if table is None else table)[col_name]
        # Assumes the table is keyed by the contiguous counts 1..len(lookup).
        if x > len(lookup):
            return 100
        # Float counts such as 2.0 hash equal to the int key 2, so this works
        # for the float columns produced by the merges.
        return lookup[x]

    return rate

In [18]:
# Blend the four per-activity percentile scores (each 0-100) with weights
# Views 0.2, Comments 0.4, Votes 0.1, Bookmark 0.3, then divide by 10 to land
# on a 0-10 rating. Terms are added in the same order as before so the
# floating-point results are unchanged.
df['Rating'] = (
    0.2 * df['Views'].map(calc_rating('Views'))
    + 0.4 * df['Comments'].map(calc_rating('Comments'))
    + 0.1 * df['Votes'].map(calc_rating('Votes'))
    + 0.3 * df['Bookmark'].map(calc_rating('Bookmark'))
) / 10

In [19]:
# How many (user, lesson) pairs share each distinct rating, lowest rating first.
df['Rating'].value_counts().sort_index()

0.00000    12951745
0.63671        1150
0.80575         343
0.87021         118
0.90633          48
0.93087          41
0.94692          19
0.95517           7
0.96231          10
0.96789           5
0.97324           3
0.97636           1
0.97971           1
0.98372           2
0.99153           2
1.00000           1
1.38220      182941
1.73824       46243
1.87130       17002
1.93022        7479
1.95876        3536
1.97458        1956
1.98350        1099
1.98876         621
1.99222         422
1.99438         252
1.99606         204
1.99714         131
1.99792          91
1.99850          69
             ...   
6.70431           1
6.72922           1
6.74176           1
6.80063           1
6.80609           2
6.82287           1
6.84187           1
6.84947           1
6.85855           1
6.88603           1
6.89066           1
6.89733           1
6.89891           1
6.90988           1
6.94644           1
6.94739           1
6.97112           1
6.97884           1
6.97888           1


In [20]:
# (number of distinct rating values, total number of (user, lesson) rows)
len(np.unique(df.Rating)), len(df.Rating)

(421, 13221656)

In [21]:
# Inspect the pairs with a rating above 8 on the 0-10 scale.
df.loc[df['Rating'] > 8]

Unnamed: 0,UserId,LessonId,Bookmark,Votes,Comments,Views,Rating
647352,1891892,L2472,1.0,2.0,2.0,1.0,8.93083
730326,2146163,L2153,1.0,11.0,1.0,11.0,9.22582
888701,2628397,L2446,1.0,17.0,1.0,3.0,9.11355
1208598,3570926,U15173,1.0,0.0,1.0,4.0,8.18362
1254493,3696102,U7378,1.0,0.0,1.0,4.0,8.18362
1676701,4811660,U4537,1.0,1.0,1.0,12.0,8.88725
1688904,4839869,L1035,1.0,1.0,1.0,2.0,8.62835
2692531,6938762,U4631,1.0,1.0,1.0,4.0,8.82033
2906508,7308651,L1603,1.0,10.0,3.0,3.0,9.75554
3305129,7917906,L2465,1.0,0.0,1.0,3.0,8.1247


In [22]:
# Keep only the identifying columns and the rating; .copy() detaches the
# result so later edits to final_df cannot touch df.
final_df = df.loc[:, ['UserId', 'LessonId', 'Rating']].copy()

In [23]:
# Export step, currently disabled: uncommenting the line below writes only the
# rows with a positive rating to 'rating_matrix.csv', without the index column.
# final_df[final_df.Rating>0].to_csv('rating_matrix.csv',index=False)