In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load the three competition tables in one pass: article metadata, the view log, and the submission template.
article_info, view_log, sample_sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/dacon-data/article_info.csv",
        "../input/dacon-data/view_log.csv",
        "../input/dacon-data/sample_submission.csv",
    )
)

In [4]:
article_info.head()

Unnamed: 0,articleID,Title,Content,Format,Language,userID,userCountry,userRegion
0,ARTICLE_0000,19 Tips For Everyday Git Use,I've been using git full time for the past 4 y...,HTML,en,USER_0683,,
1,ARTICLE_0001,Intel buys computer vision startup Itseez to i...,Intel has acquired computer vision and machine...,HTML,en,USER_1129,,
2,ARTICLE_0002,Practical End-to-End Testing with Protractor,One of the reasons AngularJS is so great to wo...,HTML,en,USER_0256,,
3,ARTICLE_0003,Corporate venture growth in Brazil is another ...,Despite recent positive news and a renewed int...,HTML,en,USER_1304,,
4,ARTICLE_0004,Cross-channel user experiences with Drupal (aw...,"Last year around this time, I wrote that The B...",HTML,en,USER_0336,,


In [5]:
view_log.head()

Unnamed: 0,userID,articleID,userRegion,userCountry
0,USER_0000,ARTICLE_0661,NY,US
1,USER_0000,ARTICLE_2316,NY,US
2,USER_0000,ARTICLE_1345,NY,US
3,USER_0000,ARTICLE_1089,NY,US
4,USER_0000,ARTICLE_1484,NY,US


In [6]:
sample_sub.head()

Unnamed: 0,userID,articleID
0,USER_0000,ARTICLE_0000
1,USER_0000,ARTICLE_0001
2,USER_0000,ARTICLE_0002
3,USER_0000,ARTICLE_0003
4,USER_0000,ARTICLE_0004


### Baseline

In [39]:
from sklearn.metrics.pairwise import cosine_similarity

import matplotlib.pyplot as plt
import seaborn as sns

In [12]:
# user-article matrix
# Count the rows of view_log for every (userID, articleID) pair, then pivot articleID
# into columns; pairs that never occur are filled with 0.
view_counts = view_log.groupby(['userID', 'articleID']).size()
user_article_matrix = view_counts.unstack(level='articleID', fill_value=0)

```python
df.groupby(['class', 'survived']).size()
# groupby(...).size()는 GroupBy 객체의 메서드로, 각 그룹의 행 개수를 NaN을 포함하여 Series로 반환
# count()도 비슷한 값을 반환하지만, 컬럼별로 NaN값을 제외하고 개수를 반환
```

In [13]:
user_article_matrix

articleID,ARTICLE_0000,ARTICLE_0001,ARTICLE_0002,ARTICLE_0003,ARTICLE_0004,ARTICLE_0005,ARTICLE_0006,ARTICLE_0007,ARTICLE_0008,ARTICLE_0009,...,ARTICLE_2998,ARTICLE_2999,ARTICLE_3000,ARTICLE_3001,ARTICLE_3002,ARTICLE_3003,ARTICLE_3004,ARTICLE_3005,ARTICLE_3006,ARTICLE_3007
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
USER_0000,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
USER_0001,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
USER_0002,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
USER_0003,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
USER_0004,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
USER_1416,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
USER_1417,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
USER_1418,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
USER_1419,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
# user similarity
# Pairwise cosine similarity between the rows (users) of user_article_matrix.
# The result is a square user-by-user NumPy array with 1.0 on the diagonal.
user_similarity = cosine_similarity(user_article_matrix)

$$similarity = Cosine(\Theta ) = \frac{A \cdot B}{|A||B|} = \frac{\sum_{i=1}^nA_i \times B_i}{\sqrt{\sum_{i=1}^n(A_i)^2} \times {\sqrt{\sum_{i=1}^n(B_i)^2}}}$$

In [19]:
user_similarity

array([[1.        , 0.        , 0.        , ..., 0.02571722, 0.        ,
        0.01028689],
       [0.        , 1.        , 0.        , ..., 0.        , 0.00847884,
        0.02581989],
       [0.        , 0.        , 1.        , ..., 0.        , 0.06495046,
        0.        ],
       ...,
       [0.02571722, 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.00847884, 0.06495046, ..., 0.        , 1.        ,
        0.        ],
       [0.01028689, 0.02581989, 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [48]:
user_similarity.shape

(1415, 1415)

In [21]:
# recommendation score
# Similarity-weighted sum of every user's view counts, normalised per user by the
# total absolute similarity of that user to all users.
similarity_totals = np.abs(user_similarity).sum(axis=1, keepdims=True)  # column vector, one row per user
weighted_views = user_similarity.dot(user_article_matrix)
user_predicted_scores = weighted_views / similarity_totals

```python
user_similarity.dot(user_article_matrix)
# dot()는 행렬 곱 함수: (user x user) 유사도 행렬과 (user x article) 행렬을 곱해, 유사도로 가중합한 조회 점수를 계산

np.abs(user_similarity).sum(axis=1)
# 각 user의 유사도의 절대값의 합을 계산

np.array([np.abs(user_similarity).sum(axis=1)]).T
# 그 값을 열 벡터로 변환.
## T는 transpose를 의미
```

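Q. user_article_matrix의 모든 값들이 0인데, 어떻게 user_similarity.dot(user_article_matrix) 한 값이 0이 아니야?  
A. 실제로 모든 값이 0인 것은 아니다. 위 출력은 일부 행과 열만 보여 주고(`...`로 생략), 행렬이 매우 희소(sparse)해서 화면에 보이는 부분이 전부 0일 뿐이다. user_article_matrix는 view_log의 조회 기록으로 만들었으므로 각 user 행에는 0이 아닌 값이 최소 하나 있고, 따라서 유사도와 행렬 곱의 결과도 0이 아닌 값을 갖는다.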

In [22]:
user_predicted_scores

array([[0.01635677, 0.03922991, 0.00452033, ..., 0.00937059, 0.02394942,
        0.02622418],
       [0.01945815, 0.00598531, 0.0004365 , ..., 0.01538668, 0.02710833,
        0.00398137],
       [0.03393534, 0.01010633, 0.00108347, ..., 0.06788127, 0.03008129,
        0.0098929 ],
       ...,
       [0.01438303, 0.00844972, 0.00021909, ..., 0.03515472, 0.08179751,
        0.00730041],
       [0.0373851 , 0.00141859, 0.00021209, ..., 0.02351579, 0.06250373,
        0.00406303],
       [0.01883932, 0.02046381, 0.00180259, ..., 0.01480964, 0.02338439,
        0.02576106]])

In [51]:
user_predicted_scores.shape
# (1415,1415) x (1415, 2879)

(1415, 2879)

In [25]:
# Debug peek: print the ranking and the top-5 articles for the first user only, then stop.
article_ids = user_article_matrix.columns
for row_pos, user in enumerate(user_article_matrix.index):
    print("idx: ", row_pos)
    print("user: ", user)
    # Column positions ordered from the highest predicted score to the lowest.
    ranked_positions = user_predicted_scores[row_pos].argsort()[::-1]
    top5recommend = list(article_ids[ranked_positions][:5])
    print("sorted_indices: ", ranked_positions)
    print("top5recommend: ", top5recommend)
    break

idx:  0
user:  USER_0000
sorted_indices:  [ 390  635 1498 ... 2496 2493  657]
top5recommend:  ['ARTICLE_0411', 'ARTICLE_0664', 'ARTICLE_1568', 'ARTICLE_1230', 'ARTICLE_2255']


```python
user_predicted_scores[idx].argsort()[::-1]
# argsort()는 배열을 오름차순으로 정렬했을 때의 인덱스를 반환하는 numpy 함수이며, [::-1]로 뒤집어 점수가 높은 순서의 인덱스를 얻는다
```

In [26]:
# Top-5 recommendations per user. Articles the user has already viewed are NOT filtered out.
article_ids = user_article_matrix.columns
recommendations = []
for row_pos, user in enumerate(user_article_matrix.index):
    # Ascending argsort reversed -> column positions from highest to lowest score.
    ranked_positions = user_predicted_scores[row_pos].argsort()[::-1]
    best_five = article_ids[ranked_positions][:5]
    # One [user, article] pair per recommended article, best first.
    recommendations.extend([user, article] for article in best_five)

In [29]:
recommendations[:10]

[['USER_0000', 'ARTICLE_0411'],
 ['USER_0000', 'ARTICLE_0664'],
 ['USER_0000', 'ARTICLE_1568'],
 ['USER_0000', 'ARTICLE_1230'],
 ['USER_0000', 'ARTICLE_2255'],
 ['USER_0001', 'ARTICLE_2868'],
 ['USER_0001', 'ARTICLE_2406'],
 ['USER_0001', 'ARTICLE_2493'],
 ['USER_0001', 'ARTICLE_2865'],
 ['USER_0001', 'ARTICLE_2045']]

In [30]:
# submission
# Build the (userID, articleID) table from the per-user top-5 lists.
top_recommendations = pd.DataFrame(recommendations, columns=['userID', 'articleID'])

# Pair rows on (userID, k-th slot of that user) instead of relying on raw row labels,
# so every recommendation lands on a row that belongs to the same user even when
# sample_sub is ordered or indexed differently from the recommendation list.
recs_by_slot = top_recommendations.assign(slot=top_recommendations.groupby('userID').cumcount())
sub_slots = sample_sub[['userID']].assign(slot=sample_sub.groupby('userID').cumcount())
# A left merge keeps the row order of sample_sub.
aligned = sub_slots.merge(recs_by_slot, on=['userID', 'slot'], how='left', validate='one_to_one')

# Fail loudly rather than writing NaN article IDs into the submission file.
unmatched = aligned['articleID'].isna()
if unmatched.any():
    example_users = list(aligned.loc[unmatched, 'userID'].unique()[:5])
    raise ValueError(
        f"{int(unmatched.sum())} submission rows have no recommendation, e.g. users {example_users}"
    )

# Positional assignment is safe here because `aligned` follows sample_sub's row order.
sample_sub['articleID'] = aligned['articleID'].to_numpy()

In [31]:
sample_sub.head()

Unnamed: 0,userID,articleID
0,USER_0000,ARTICLE_0411
1,USER_0000,ARTICLE_0664
2,USER_0000,ARTICLE_1568
3,USER_0000,ARTICLE_1230
4,USER_0000,ARTICLE_2255


In [33]:
# Write the filled-in submission template to disk; index=False keeps the row index out of the CSV.
sample_sub.to_csv('baseline_submission.csv', index=False)

> **Rank** : 52  
> **Score** : 0.29302