# Import Library

In [4]:
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load Dataset

In [5]:
# Relative paths of the two CSV files.
# NOTE(review): the following cell rebinds these same two names to the loaded
# DataFrames, so after it has run they no longer hold path strings.
shared_articles = 'archive/shared_articles.csv'
users_interactions = 'archive/users_interactions.csv'

In [6]:
# Read both CSV files into DataFrames.
# Each name holds a path string on entry and is rebound to the DataFrame read
# from it, so this cell is not re-runnable on its own: re-run the path cell
# first, otherwise read_csv receives a DataFrame instead of a path.
shared_articles = pd.read_csv(shared_articles)
users_interactions = pd.read_csv(users_interactions)

# Shared Articles Dataset

In [7]:
# Keep only the rows whose event is an article being shared.
shared_articles = shared_articles.loc[shared_articles['eventType'].eq('CONTENT SHARED')]
shared_articles

Unnamed: 0,timestamp,eventType,contentId,authorPersonId,authorSessionId,authorUserAgent,authorRegion,authorCountry,contentType,url,title,text,lang
1,1459193988,CONTENT SHARED,-4110354420726924665,4340306774493623681,8940341205206233829,,,,HTML,http://www.nytimes.com/2016/03/28/business/dea...,"Ethereum, a Virtual Currency, Enables Transact...",All of this work is still very early. The firs...,en
2,1459194146,CONTENT SHARED,-7292285110016212249,4340306774493623681,8940341205206233829,,,,HTML,http://cointelegraph.com/news/bitcoin-future-w...,Bitcoin Future: When GBPcoin of Branson Wins O...,The alarm clock wakes me at 8:00 with stream o...,en
3,1459194474,CONTENT SHARED,-6151852268067518688,3891637997717104548,-1457532940883382585,,,,HTML,https://cloudplatform.googleblog.com/2016/03/G...,Google Data Center 360° Tour,We're excited to share the Google Data Center ...,en
4,1459194497,CONTENT SHARED,2448026894306402386,4340306774493623681,8940341205206233829,,,,HTML,https://bitcoinmagazine.com/articles/ibm-wants...,"IBM Wants to ""Evolve the Internet"" With Blockc...",The Aite Group projects the blockchain market ...,en
5,1459194522,CONTENT SHARED,-2826566343807132236,4340306774493623681,8940341205206233829,,,,HTML,http://www.coindesk.com/ieee-blockchain-oxford...,IEEE to Talk Blockchain at Cloud Computing Oxf...,One of the largest and oldest organizations fo...,en
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3117,1487946604,CONTENT SHARED,9213260650272029784,3609194402293569455,7144190892417579456,Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebK...,SP,BR,HTML,https://startupi.com.br/2017/02/liga-ventures-...,"Conheça a Liga IoT, plataforma de inovação abe...","A Liga Ventures, aceleradora de startups espec...",pt
3118,1487947067,CONTENT SHARED,-3295913657316686039,6960073744377754728,-8193630595542572738,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3...,GA,US,HTML,https://thenextweb.com/apps/2017/02/14/amazon-...,Amazon takes on Skype and GoToMeeting with its...,"Amazon has launched Chime, a video conferencin...",en
3119,1488223224,CONTENT SHARED,3618271604906293310,1908339160857512799,-183341653743161643,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0...,SP,BR,HTML,https://code.org/about/2016,Code.org 2016 Annual Report,"February 9, 2017 - We begin each year with a l...",en
3120,1488300719,CONTENT SHARED,6607431762270322325,-1393866732742189886,2367029511384577082,Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...,MG,BR,HTML,https://www.bloomberg.com/news/articles/2017-0...,JPMorgan Software Does in Seconds What Took La...,"At JPMorgan Chase & Co., a learning machine is...",en


# Users Interactions Dataset

In [8]:
# Preview the raw user-interaction log.
users_interactions

Unnamed: 0,timestamp,eventType,contentId,personId,sessionId,userAgent,userRegion,userCountry
0,1465413032,VIEW,-3499919498720038879,-8845298781299428018,1264196770339959068,,,
1,1465412560,VIEW,8890720798209849691,-1032019229384696495,3621737643587579081,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2...,NY,US
2,1465416190,VIEW,310515487419366995,-1130272294246983140,2631864456530402479,,,
3,1465413895,FOLLOW,310515487419366995,344280948527967603,-3167637573980064150,,,
4,1465412290,VIEW,-7820640624231356730,-445337111692715325,5611481178424124714,,,
...,...,...,...,...,...,...,...,...
72307,1485190425,LIKE,-6590819806697898649,-9016528795238256703,8614469745607949425,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4...,MG,BR
72308,1485190425,VIEW,-5813211845057621660,102305705598210278,5527770709392883642,Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...,SP,BR
72309,1485190072,VIEW,-1999468346928419252,-9196668942822132778,-8300596454915870873,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,SP,BR
72310,1485190434,VIEW,-6590819806697898649,-9016528795238256703,8614469745607949425,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4...,MG,BR


# Exploratory Data Analysis

### Shared Articles Dataset

In [9]:
# Column dtypes and non-null counts of the shared-articles data.
shared_articles.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3047 entries, 1 to 3121
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   timestamp        3047 non-null   int64 
 1   eventType        3047 non-null   object
 2   contentId        3047 non-null   int64 
 3   authorPersonId   3047 non-null   int64 
 4   authorSessionId  3047 non-null   int64 
 5   authorUserAgent  669 non-null    object
 6   authorRegion     669 non-null    object
 7   authorCountry    669 non-null    object
 8   contentType      3047 non-null   object
 9   url              3047 non-null   object
 10  title            3047 non-null   object
 11  text             3047 non-null   object
 12  lang             3047 non-null   object
dtypes: int64(4), object(9)
memory usage: 333.3+ KB


Terdapat missing value pada kolom authorUserAgent, authorRegion, dan authorCountry.

In [10]:
# Summary statistics for every column, numeric and non-numeric alike.
shared_articles.describe(include='all')

Unnamed: 0,timestamp,eventType,contentId,authorPersonId,authorSessionId,authorUserAgent,authorRegion,authorCountry,contentType,url,title,text,lang
count,3047.0,3047,3047.0,3047.0,3047.0,669,669,669,3047,3047,3047,3047,3047
unique,,1,,,,114,19,5,3,3023,3003,3013,5
top,,CONTENT SHARED,,,,Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...,SP,BR,HTML,http://www.popsci.com/byzantine-science-deceiv...,Dries Buytaert,Bug Type 3: Missing Specifications Description...,en
freq,,3047,,,,70,524,602,3027,4,4,4,2211
mean,1468865000.0,,1.969568e+16,4.198685e+17,1.694961e+17,,,,,,,,
std,7573604.0,,5.376353e+18,4.390382e+18,5.391587e+18,,,,,,,,
min,1459194000.0,,-9.222795e+18,-9.120686e+18,-9.212055e+18,,,,,,,,
25%,1462401000.0,,-4.67342e+18,-1.570135e+18,-4.656768e+18,,,,,,,,
50%,1467176000.0,,3.455744e+16,-7.092877e+17,3.910429e+17,,,,,,,,
75%,1473944000.0,,4.716572e+18,3.609194e+18,4.821078e+18,,,,,,,,


In [11]:
# Number of missing values per column.
shared_articles.isna().sum()

timestamp             0
eventType             0
contentId             0
authorPersonId        0
authorSessionId       0
authorUserAgent    2378
authorRegion       2378
authorCountry      2378
contentType           0
url                   0
title                 0
text                  0
lang                  0
dtype: int64

In [12]:
# Number of rows that repeat an earlier row in every column.
shared_articles.duplicated().sum()

0

### Users Interactions Dataset

In [13]:
# Column dtypes and non-null counts of the interaction log.
users_interactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72312 entries, 0 to 72311
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   timestamp    72312 non-null  int64 
 1   eventType    72312 non-null  object
 2   contentId    72312 non-null  int64 
 3   personId     72312 non-null  int64 
 4   sessionId    72312 non-null  int64 
 5   userAgent    56918 non-null  object
 6   userRegion   56907 non-null  object
 7   userCountry  56918 non-null  object
dtypes: int64(4), object(4)
memory usage: 4.4+ MB


In [14]:
# Summary statistics for every column, numeric and non-numeric alike.
users_interactions.describe(include='all')

Unnamed: 0,timestamp,eventType,contentId,personId,sessionId,userAgent,userRegion,userCountry
count,72312.0,72312,72312.0,72312.0,72312.0,56918,56907,56918
unique,,5,,,,1090,71,23
top,,VIEW,,,,Android - Native Mobile App,SP,BR
freq,,61086,,,,6761,40164,51396
mean,1470103000.0,,-3.033423e+16,1.252026e+16,3.421273e+16,,,
std,7258130.0,,5.344755e+18,5.022333e+18,5.344355e+18,,,
min,1457964000.0,,-9.222795e+18,-9.223122e+18,-9.222505e+18,,,
25%,1464876000.0,,-4.726309e+18,-3.596627e+18,-4.613476e+18,,,
50%,1468343000.0,,1.893099e+16,-1.088422e+17,5.029492e+16,,,
75%,1474461000.0,,4.441012e+18,3.766319e+18,4.667962e+18,,,


In [15]:
# Number of missing values per column.
users_interactions.isna().sum()

timestamp          0
eventType          0
contentId          0
personId           0
sessionId          0
userAgent      15394
userRegion     15405
userCountry    15394
dtype: int64

Terdapat beberapa missing value pada kolom userAgent, userRegion, dan userCountry.

In [16]:
# Number of rows that repeat an earlier row in every column.
users_interactions.duplicated().sum()

11

Terdapat data yang duplikat pada dataset user interactions.

# Univariate Analysis

Karena beberapa fitur kurang relevan untuk model content-based filtering, hanya fitur yang dianggap relevan yang dipilih untuk dianalisis lebih lanjut.

### Shared Articles Dataset

In [17]:
# Shared-article columns singled out for the univariate analysis.
# NOTE(review): this list is not referenced by any cell visible here; the
# cells below name their columns directly.
features = ['eventType','contentId','contentType','url','title','text','lang']

In [18]:
# Number of articles (non-null `text` values) per event type.
shared_articles.groupby('eventType')[['text']].count()

Unnamed: 0_level_0,text
eventType,Unnamed: 1_level_1
CONTENT SHARED,3047


In [19]:
# Number of articles (non-null `text` values) per contentId.
shared_articles.groupby('contentId')[['text']].count()

Unnamed: 0_level_0,text
contentId,Unnamed: 1_level_1
-9222795471790223670,1
-9216926795620865886,1
-9194572880052200111,1
-9192549002213406534,1
-9190737901804729417,1
...,...
9213260650272029784,1
9215261273565326920,1
9217155070834564627,1
9220445660318725468,1


In [20]:
# Number of articles (non-null `text` values) per content type.
shared_articles.groupby('contentType')[['text']].count()

Unnamed: 0_level_0,text
contentType,Unnamed: 1_level_1
HTML,3027
RICH,10
VIDEO,10


In [21]:
# Number of articles (non-null `text` values) sharing each title.
shared_articles.groupby('title')[['text']].count()

Unnamed: 0_level_0,text
title,Unnamed: 1_level_1
"""""Good Luck with That!"": Teaching Machines to Detect Sarcasm"" by Xinyi Ou",1
"""5G está para IoT como 4G esteve para o Smartphone"", saiba as apostas da Intel para 2017",1
"""Consultorias promovem a desvalorização do nosso negócio"" - Meio & Mensagem",1
"""Disrupção não depende da tecnologia, depende das pessoas"": as lições do cofundador do Waze - Startupi",1
"""Eat, sleep, code, repeat"" is such bullshit - Signal v. Noise",1
...,...
ŷhat | Classifying handwritten digits using TensorFlow,1
​Google digs deeper on machine learning with new European research lab,1
​Google's AI software is moving into your iPhone,1
​Kubernetes 1.4: One DevOps tool to rule all the containers | ZDNet,1


In [22]:
# Number of articles (non-null `text` values) per language code.
shared_articles.groupby('lang')[['text']].count()

Unnamed: 0_level_0,text
lang,Unnamed: 1_level_1
en,2211
es,2
ja,2
la,3
pt,829


Berdasarkan kolom lang, artikel berbahasa 'es', 'ja', dan 'la' jumlahnya sangat sedikit (masing-masing 2, 2, dan 3 artikel).

### Users Interactions Dataset

In [23]:
# Interaction-log columns singled out for the univariate analysis.
# NOTE(review): this list is not referenced by any cell visible here.
df_features = ['eventType','contentId']

In [24]:
# Number of logged interactions per event type.
users_interactions.groupby('eventType')[['timestamp']].count()

Unnamed: 0_level_0,timestamp
eventType,Unnamed: 1_level_1
BOOKMARK,2463
COMMENT CREATED,1611
FOLLOW,1407
LIKE,5745
VIEW,61086


Jenis-jenis interaksi pada kolom eventType dapat dikaitkan dengan bobot. Sebagai contoh, artikel yang diikuti (FOLLOW) oleh pengguna menunjukkan ketertarikan yang lebih besar dibandingkan dengan artikel yang hanya dilihat (VIEW).

In [25]:
# Number of logged interactions per contentId.
users_interactions.groupby('contentId')[['timestamp']].count()

Unnamed: 0_level_0,timestamp
contentId,Unnamed: 1_level_1
-9222795471790223670,26
-9216926795620865886,21
-9194572880052200111,29
-9192549002213406534,56
-9190737901804729417,9
...,...
9213260650272029784,11
9215261273565326920,30
9217155070834564627,16
9220445660318725468,52


# Data Preprocessing

In [26]:
# Build two narrower frames holding only the columns used from here on:
# article content from the shared articles, and who did what from the
# interaction log.
df_sa = shared_articles.loc[:, ['contentId', 'title', 'text', 'lang']]
df_ui = users_interactions.loc[:, ['contentId', 'eventType', 'personId']]

### Menghapus data duplikat

In [27]:
# Drop rows that repeat an earlier row in every selected column.
# For df_ui this collapses repeated (contentId, eventType, personId) triples,
# so a user's repeated interactions of one type with one article are kept once.
df_sa = df_sa.drop_duplicates()
df_ui = df_ui.drop_duplicates()

In [28]:
# Row count and dtypes of the article frame after de-duplication.
df_sa.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3047 entries, 1 to 3121
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   contentId  3047 non-null   int64 
 1   title      3047 non-null   object
 2   text       3047 non-null   object
 3   lang       3047 non-null   object
dtypes: int64(1), object(3)
memory usage: 119.0+ KB


In [29]:
# Row count and dtypes of the interaction frame after de-duplication.
df_ui.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50938 entries, 0 to 72311
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   contentId  50938 non-null  int64 
 1   eventType  50938 non-null  object
 2   personId   50938 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 1.6+ MB


### Mengubah eventType pada dataset users interactions 

In [30]:
# Numeric weight assigned to each interaction type.
event_type_strength = dict([
    ('VIEW', 1.0),
    ('LIKE', 2.0),
    ('BOOKMARK', 3.0),
    ('FOLLOW', 4.0),
    ('COMMENT CREATED', 5.0),
])

# Look the weight up row by row; an event type missing from the table raises
# KeyError rather than producing a missing rating.
df_ui['rating'] = [event_type_strength[event] for event in df_ui['eventType']]

In [31]:
# Summary statistics of the numeric columns, including the new rating.
df_ui.describe()

Unnamed: 0,contentId,personId,rating
count,50938.0,50938.0,50938.0
mean,-4.172038e+16,-3.33222e+16,1.392163
std,5.357533e+18,5.055287e+18,0.908675
min,-9.222795e+18,-9.223122e+18,1.0
25%,-4.754224e+18,-3.620818e+18,1.0
50%,-1038011000000000.0,-1.088422e+17,1.0
75%,4.558204e+18,3.803524e+18,1.0
max,9.222265e+18,9.210531e+18,5.0


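Di Deskdrop, pengguna dapat melihat artikel yang sama berkali-kali dan berinteraksi dengannya dengan cara yang berbeda (misalnya menyukai atau berkomentar). Oleh karena itu, untuk memodelkan minat pengguna terhadap suatu artikel, semua interaksi pengguna dengan artikel tersebut digabungkan menjadi satu nilai melalui penjumlahan bobot jenis interaksinya. Karena baris duplikat sudah dihapus sebelumnya, setiap jenis interaksi dihitung paling banyak satu kali per pasangan pengguna–artikel, sehingga nilai maksimumnya adalah 15.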

In [32]:
# Collapse to one row per (personId, contentId) pair whose rating is the sum
# of the weights of that user's interactions with that article.
df_ui = df_ui.groupby(['personId', 'contentId'])['rating'].sum().reset_index()

In [33]:
# (rows, columns) after aggregating to one row per user-article pair.
df_ui.shape

(40710, 3)

In [34]:
# Aggregated ratings from highest to lowest, to inspect their range.
df_ui['rating'].sort_values(ascending=False)

4499     15.0
9651     15.0
2852     15.0
15155    15.0
40285    15.0
         ... 
14552     1.0
14551     1.0
14550     1.0
14549     1.0
40709     1.0
Name: rating, Length: 40710, dtype: float64

### Menggabungkan kedua dataset berdasarkan contentId

In [35]:
# Full outer join on contentId: rows without a match on the other side are
# kept, with NaN in the columns that come from the missing side.
df = df_sa.merge(df_ui, how='outer', on='contentId')
df

Unnamed: 0,contentId,title,text,lang,personId,rating
0,-4110354420726924665,"Ethereum, a Virtual Currency, Enables Transact...",All of this work is still very early. The firs...,en,2.873028e+18,1.0
1,-7292285110016212249,Bitcoin Future: When GBPcoin of Branson Wins O...,The alarm clock wakes me at 8:00 with stream o...,en,8.414731e+18,1.0
2,-6151852268067518688,Google Data Center 360° Tour,We're excited to share the Google Data Center ...,en,-7.267770e+18,3.0
3,-6151852268067518688,Google Data Center 360° Tour,We're excited to share the Google Data Center ...,en,-1.443637e+18,1.0
4,-6151852268067518688,Google Data Center 360° Tour,We're excited to share the Google Data Center ...,en,-1.032019e+18,3.0
...,...,...,...,...,...,...
40773,-1172724258904585136,,,,-1.443637e+18,1.0
40774,-8418620743404378592,,,,-2.050699e+18,1.0
40775,-6451309518266745024,,,,-1.578288e+18,1.0
40776,-6451309518266745024,,,,4.340307e+18,1.0


# Content-based filtering

Untuk kasus model content-based filtering, akan digunakan artikel dengan bahasa 'en'.

In [36]:
# Non-null counts of every column per language in the merged frame.
df.groupby('lang').count()

Unnamed: 0_level_0,contentId,title,text,personId,rating
lang,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
en,26255,26255,26255,26193,26193
es,6,6,6,6,6
ja,23,23,23,23,23
la,26,26,26,26,26
pt,14440,14440,14440,14434,14434


In [37]:
# Keep English articles only; rows whose lang is missing are dropped as well.
df = df.loc[df['lang'].eq('en')]

In [38]:
# Preview the English-only merged frame.
df

Unnamed: 0,contentId,title,text,lang,personId,rating
0,-4110354420726924665,"Ethereum, a Virtual Currency, Enables Transact...",All of this work is still very early. The firs...,en,2.873028e+18,1.0
1,-7292285110016212249,Bitcoin Future: When GBPcoin of Branson Wins O...,The alarm clock wakes me at 8:00 with stream o...,en,8.414731e+18,1.0
2,-6151852268067518688,Google Data Center 360° Tour,We're excited to share the Google Data Center ...,en,-7.267770e+18,3.0
3,-6151852268067518688,Google Data Center 360° Tour,We're excited to share the Google Data Center ...,en,-1.443637e+18,1.0
4,-6151852268067518688,Google Data Center 360° Tour,We're excited to share the Google Data Center ...,en,-1.032019e+18,3.0
...,...,...,...,...,...,...
40745,-3295913657316686039,Amazon takes on Skype and GoToMeeting with its...,"Amazon has launched Chime, a video conferencin...",en,6.960074e+18,1.0
40746,3618271604906293310,Code.org 2016 Annual Report,"February 9, 2017 - We begin each year with a l...",en,1.908339e+18,1.0
40747,6607431762270322325,JPMorgan Software Does in Seconds What Took La...,"At JPMorgan Chase & Co., a learning machine is...",en,-1.393867e+18,1.0
40748,4109618890343020064,The 2017 Acquia Partners of the Year,The Acquia Partner Awards Program is comprised...,en,-3.954277e+18,1.0


### Menjumlahkan seluruh rating user terhadap suatu konten untuk evaluasi

In [39]:
# One row per title: keep the first text seen for that title and add up the
# ratings given by all users.
df = df.groupby('title', as_index=False).agg(
    text=('text', 'first'),
    rating=('rating', 'sum'),
)

In [40]:
# Summary statistics of the per-title frame (titles, texts, summed ratings).
df.describe(include='all')

Unnamed: 0,title,text,rating
count,2173,2173,2173.0
unique,2173,2169,
top,"""""Good Luck with That!"": Teaching Machines to ...",Create a FREE account to: Get eight free artic...,
freq,1,3,
mean,,,20.936954
std,,,27.672166
min,,,0.0
25%,,,5.0
50%,,,12.0
75%,,,27.0


### Modeling

In [41]:
# English stop-word list from NLTK. The corpus is a separate download, and
# NLTK raises LookupError when it is missing, so fetch it on first use to keep
# the notebook runnable in a fresh environment.
try:
    stopwords_list = stopwords.words('english')
except LookupError:
    nltk.download('stopwords', quiet=True)
    stopwords_list = stopwords.words('english')

# TF-IDF over word unigrams and bigrams. Terms found in fewer than 0.3 % or in
# more than 50 % of the documents are ignored, and at most 5000 terms are kept.
vectorizer = TfidfVectorizer(analyzer='word',
                     ngram_range=(1, 2),
                     min_df=0.003,
                     max_df=0.5,
                     max_features=5000,
                     stop_words=stopwords_list)

In [42]:
# Each document is the article title followed by its text, separated by a space.
tfidf_matrix = vectorizer.fit_transform(df['title'].str.cat(df['text'], sep=" "))

In [43]:
# Pairwise cosine similarity between all TF-IDF document vectors.
cosine_sim = cosine_similarity(tfidf_matrix)
# Label both axes with the article titles so a title can be used to look up
# its similarity scores; rows and columns are in the same order as `df`.
cosine_sim_df = pd.DataFrame(cosine_sim, index=df['title'], columns=df['title'])

In [44]:
def get_recommendations(title, similarity_data=cosine_sim_df, items=df[['title','rating']], k=10):
    """Return up to `k` items most similar to `title`, most similar first.

    Parameters
    ----------
    title
        Column label of the query item in `similarity_data`.
    similarity_data : pandas.DataFrame
        Item-by-item similarity matrix. Its rows must be in the same order as
        its columns, because row positions are mapped back to column labels.
    items : pandas.DataFrame
        Item attributes joined onto the result through the column(s) it
        shares with the candidate list (``title`` for the default).
    k : int
        Maximum number of recommendations; values below 0 are treated as 0.

    Returns
    -------
    pandas.DataFrame
        Rows of `items` for the most similar items, excluding `title` itself.

    Raises
    ------
    KeyError
        If `title` is not a column of `similarity_data`.

    Notes
    -----
    The defaults for `similarity_data` and `items` are bound when this cell
    runs; re-run it after rebuilding `cosine_sim_df` or `df`.
    """
    k = max(k, 0)
    scores = similarity_data.loc[:, title].to_numpy()

    # Fetch one extra candidate because the query item normally ranks first
    # and is removed below; never ask for more items than exist.
    n_candidates = min(k + 1, scores.size)

    # Full descending sort, so every returned position really is the next
    # most similar item. The previous partial partition only ordered the top
    # k-1 positions while k+1 were read. `stable` keeps ties in column order.
    ranked = np.argsort(-scores, kind='stable')[:n_candidates]

    closest = similarity_data.columns[ranked]
    closest = closest.drop(title, errors='ignore')
    return pd.DataFrame(closest).merge(items).head(k)

### Mendapatkan rekomendasi

In [45]:
# Example query: recommendations for one article, using the default k.
get_recommendations("An operating model for company-wide agile development")

Unnamed: 0,title,rating
0,Embracing Agile,188.0
1,"Agile is Dead, Long Live Continuous Delivery -...",24.0
2,12 Agile principles,1.0
3,Scrum Community - Scrum Alliance,64.0
4,Big IT Rising,50.0
5,Organizing for digital acceleration: Making a ...,28.0
6,How enterprise architects can help ensure succ...,57.0
7,The new tech talent you need to succeed in dig...,34.0
8,Do you want Crappy Agile?,43.0
9,Five questions boards should ask about IT in a...,58.0


# Evaluasi

### Precision@k

In [46]:
# Precision@k: the share of the k recommended items counted as relevant.
# An item counts as relevant when its summed rating exceeds `threshold`.
# NOTE(review): 21 is presumably chosen near the mean summed rating shown by
# df.describe() above; confirm.
k = 10
threshold = 21

# Pass k explicitly so the list length and the denominator below always agree.
sc = get_recommendations('An operating model for company-wide agile development', k=k)
ratings = sc['rating'].to_numpy()

# Divide by k, not by the number of rows returned: missing recommendations
# count as misses.
(ratings > threshold).sum() / k


0.9