In [1]:
import numpy as np
import scipy
import pandas as pd
import math
import random
import sklearn
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse.linalg import svds
import matplotlib.pyplot as plt

In [2]:
# Load the shared-articles file and keep only the rows whose eventType is
# exactly 'CONTENT SHARED'.
articles_df = pd.read_csv('shared_articles.csv').query("eventType == 'CONTENT SHARED'")

In [3]:
# Load the raw user/article interaction log.
# NOTE(review): confirm that 'users_interactiontea.csv' is the intended file name.
interaction_df = pd.read_csv(filepath_or_buffer='users_interactiontea.csv')

In [4]:
# Preview the first five interaction rows (head() defaults to n=5).
interaction_df.head()

Unnamed: 0,timestamp,eventType,contentId,personId,sessionId,userAgent,userRegion,userCountry
0,1465413032,VIEW,a18,-8.85e+18,1.26e+18,,,
1,1465412560,VIEW,a41,-1.03e+18,3.62e+18,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2...,NY,US
2,1465416190,VIEW,a34,-1.13e+18,2.63e+18,,,
3,1465413895,FOLLOW,a25,3.44e+17,-3.17e+18,,,
4,1465412290,VIEW,a63,-4.45e+17,5.61e+18,,,


In [7]:
# Tag every interaction row with interactCnt, the number of distinct contentId
# values seen for that row's personId, then keep only the rows of users with at
# least 5 distinct articles.
#
# A grouped transform replaces the former groupby(...).apply(df.assign(...)):
# it is vectorised, keeps the original row order, and never depends on the
# grouping column being passed into the applied frame (pandas 2.2 deprecates
# that behaviour, and without it the personId column would be lost here).
# Rows whose personId is missing get a NaN count and are filtered out, matching
# what the grouped apply produced.
interaction_df_over5 = (interaction_df
  .assign(interactCnt = lambda d: d.groupby('personId')['contentId'].transform('nunique'))
  .loc[lambda d: d['interactCnt'] >= 5]
)

In [8]:
# Numeric weight associated with each eventType value.
event_type_strength = {
    'VIEW': 1.0,
    'LIKE': 2.0,
    'BOOKMARK': 2.5,
    'FOLLOW': 3.0,
    'COMMENT CREATED': 4.0,
}

# Add an eventStrength column holding the weight of each row's eventType.
# The plain dict lookup raises KeyError for an event type missing from the table.
interaction_df['eventStrength'] = interaction_df['eventType'].apply(
    lambda kind: event_type_strength[kind]
)

In [9]:
# Number of interaction rows left after the >= 5 distinct-articles filter.
len(interaction_df_over5)

9146

In [10]:
# Collapse the filtered interaction log to one row per (personId, contentId):
#   eventStrength - sum of the event weights for that pair
#   eventScore    - log10(1 + eventStrength), which dampens large sums
#
# eventStrength is derived here from eventType rather than read from
# interaction_df_over5. That frame is built before the eventStrength column is
# added to interaction_df, so on a fresh top-to-bottom run it does not carry the
# column and the aggregation would fail with a KeyError. If the column already
# exists it is simply recomputed from the same weight table.
interaction_full_df = (
  interaction_df_over5
    .assign(eventStrength = lambda d: d['eventType'].map(event_type_strength))
    .groupby(['personId', 'contentId'], as_index=False)['eventStrength']
    .sum()
    .assign(eventScore = lambda d: np.log10(1+d['eventStrength']))
)

interaction_full_df.head(10)

Unnamed: 0,personId,contentId,eventStrength,eventScore
0,-9.22e+18,a1,1.0,0.30103
1,-9.22e+18,a23,1.0,0.30103
2,-9.22e+18,a30,1.0,0.30103
3,-9.22e+18,a58,1.0,0.30103
4,-9.22e+18,a66,1.0,0.30103
5,-9.17e+18,a14,1.0,0.30103
6,-9.17e+18,a20,1.0,0.30103
7,-9.17e+18,a21,1.0,0.30103
8,-9.17e+18,a23,1.0,0.30103
9,-9.17e+18,a4,1.0,0.30103


In [11]:
# Rank articles by the total eventStrength accumulated across all users,
# strongest first, and expose the result as a two-column frame
# (contentId, eventStrength) with a fresh positional index.
item_popularity = (
    interaction_full_df
    .groupby('contentId')['eventStrength']
    .agg('sum')
    .sort_values(ascending=False)
    .reset_index()
)

# Show the ten highest-ranked articles.
item_popularity.head(n=10)

Unnamed: 0,contentId,eventStrength
0,a11,198.0
1,a58,195.0
2,a61,189.0
3,a27,187.0
4,a4,184.0
5,a50,181.5
6,a66,181.5
7,a2,181.0
8,a44,181.0
9,a65,180.0


In [12]:
# Count the distinct values in each column of the popularity table.
item_popularity.nunique(axis=0)

contentId        70
eventStrength    51
dtype: int64