In [1]:
import pandas as pd
import numpy as np

In [2]:
article_info = pd.read_csv("../input/dacon-data/article_info.csv")
view_log = pd.read_csv("../input/dacon-data/view_log.csv")
sample_sub = pd.read_csv("../input/dacon-data/sample_submission.csv")

In [3]:
article_info.head()

Unnamed: 0,articleID,Title,Content,Format,Language,userID,userCountry,userRegion
0,ARTICLE_0000,19 Tips For Everyday Git Use,I've been using git full time for the past 4 y...,HTML,en,USER_0683,,
1,ARTICLE_0001,Intel buys computer vision startup Itseez to i...,Intel has acquired computer vision and machine...,HTML,en,USER_1129,,
2,ARTICLE_0002,Practical End-to-End Testing with Protractor,One of the reasons AngularJS is so great to wo...,HTML,en,USER_0256,,
3,ARTICLE_0003,Corporate venture growth in Brazil is another ...,Despite recent positive news and a renewed int...,HTML,en,USER_1304,,
4,ARTICLE_0004,Cross-channel user experiences with Drupal (aw...,"Last year around this time, I wrote that The B...",HTML,en,USER_0336,,


In [4]:
view_log.head(5)

Unnamed: 0,userID,articleID,userRegion,userCountry
0,USER_0000,ARTICLE_0661,NY,US
1,USER_0000,ARTICLE_2316,NY,US
2,USER_0000,ARTICLE_1345,NY,US
3,USER_0000,ARTICLE_1089,NY,US
4,USER_0000,ARTICLE_1484,NY,US


In [5]:
sample_sub.head()

Unnamed: 0,userID,articleID
0,USER_0000,ARTICLE_0000
1,USER_0000,ARTICLE_0001
2,USER_0000,ARTICLE_0002
3,USER_0000,ARTICLE_0003
4,USER_0000,ARTICLE_0004


## Word2Vec

___

### Tutorial

**TfidfVectorizer**  
***Tf*** : Term Frequency. 하나의 문서(문장)에서 특정 단어가 등장하는 횟수  
***Idf*** : Inverse Document Frequency. 특정 단어가 몇 개의 문서(문장)에서 등장하는지를 수치화 한 것이 Df. 그것의 역수가 idf다. 역수 개념을 사용하는 이유는, 적은 문서(문장)에 등장할수록 큰 숫자가 되게 하고,반대로 많은 문서(문장)에 등장할수록 숫자를 작아지게 함으로써 여러 문서(문장)에 의미 없이 사용되는 단어의 가중치를 줄이기 위해서다. $$idf(d,t) = log(\frac{n}{1+df(t)})$$  
***Tf-idf*** 수치는 Tf값과 Idf값을 고하여 구한다. 해당 연산을 거친 최종 Tf-idf 값은 0과 1사이로 만들어진다.

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
corpus = ['you know I want your love', 'I like you', 'what should I do']
tfidfv = TfidfVectorizer().fit(corpus)
print(tfidfv.vocabulary_)
print("="*100)
print(tfidfv.transform(corpus).toarray())
print("="*100)
print(tfidfv.fit_transform(corpus).toarray()) # fit(학습)과 transform(변환)을 한번에

{'you': 7, 'know': 1, 'want': 5, 'your': 8, 'love': 3, 'like': 2, 'what': 6, 'should': 4, 'do': 0}
[[0.         0.46735098 0.         0.46735098 0.         0.46735098
  0.         0.35543247 0.46735098]
 [0.         0.         0.79596054 0.         0.         0.
  0.         0.60534851 0.        ]
 [0.57735027 0.         0.         0.         0.57735027 0.
  0.57735027 0.         0.        ]]
[[0.         0.46735098 0.         0.46735098 0.         0.46735098
  0.         0.35543247 0.46735098]
 [0.         0.         0.79596054 0.         0.         0.
  0.         0.60534851 0.        ]
 [0.57735027 0.         0.         0.         0.57735027 0.
  0.57735027 0.         0.        ]]


TfidfVectorizer.fit(text)를 통해 text가 가지고 있는 모든 단어를 BoW로 구성하고, 이 단어들에 대해 Tf-idf 값을 계산한 뒤 각 단어의 인덱스 위치에 Tf-idf 값이 들어간 벡터가 만들어진다. 이 과정에서 CountVectorizer와 마찬가지로 I, a 등 한 글자 단어는 사라진다. 특정 단어를 가지고 있지 않다면 Tf = 0 이므로 Tf-idf도 0으로 표현됨을 알 수 있다.

In [7]:
from sklearn.metrics.pairwise import linear_kernel
tfidf_matrix = tfidfv.fit_transform(corpus)
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
print(cosine_sim)

[[1.         0.21516051 0.        ]
 [0.21516051 1.         0.        ]
 [0.         0.         1.        ]]


embedding을 마치고 나면, 위와 같이 문서(문장)간 유사도도 수치화할 수 있다.

---

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
import gensim

import warnings
warnings.filterwarnings(action='ignore')

In [9]:
article_info.columns

Index(['articleID', 'Title', 'Content', 'Format', 'Language', 'userID',
       'userCountry', 'userRegion'],
      dtype='object')

In [10]:
view_log = pd.merge(view_log, article_info[['articleID', 'Title']], how = 'left', on = 'articleID')

In [11]:
view_log.head(50)

Unnamed: 0,userID,articleID,userRegion,userCountry,Title
0,USER_0000,ARTICLE_0661,NY,US,Facebook says its new AI is almost as good as ...
1,USER_0000,ARTICLE_2316,NY,US,Study Shows Women and Minorities Are Punished ...
2,USER_0000,ARTICLE_1345,NY,US,Google's head of advertising talks ad blocking...
3,USER_0000,ARTICLE_1089,NY,US,7 Questions to Ask Before Your Next Digital Tr...
4,USER_0000,ARTICLE_1484,NY,US,Acquia Engage Awards Finalists Announced
5,USER_0000,ARTICLE_1033,NY,US,Google believes its superior AI will be the ke...
6,USER_0000,ARTICLE_1033,NY,US,Google believes its superior AI will be the ke...
7,USER_0000,ARTICLE_2255,NY,US,Equal Pay Day in the spotlight this year
8,USER_0000,ARTICLE_1260,NY,US,Amazon goes open source with machine-learning ...
9,USER_0000,ARTICLE_0090,NY,US,The emerging Darwinian approach to analytics a...


In [12]:
view_log[view_log['userID'] == 'USER_0000']

Unnamed: 0,userID,articleID,userRegion,userCountry,Title
0,USER_0000,ARTICLE_0661,NY,US,Facebook says its new AI is almost as good as ...
1,USER_0000,ARTICLE_2316,NY,US,Study Shows Women and Minorities Are Punished ...
2,USER_0000,ARTICLE_1345,NY,US,Google's head of advertising talks ad blocking...
3,USER_0000,ARTICLE_1089,NY,US,7 Questions to Ask Before Your Next Digital Tr...
4,USER_0000,ARTICLE_1484,NY,US,Acquia Engage Awards Finalists Announced
5,USER_0000,ARTICLE_1033,NY,US,Google believes its superior AI will be the ke...
6,USER_0000,ARTICLE_1033,NY,US,Google believes its superior AI will be the ke...
7,USER_0000,ARTICLE_2255,NY,US,Equal Pay Day in the spotlight this year
8,USER_0000,ARTICLE_1260,NY,US,Amazon goes open source with machine-learning ...
9,USER_0000,ARTICLE_0090,NY,US,The emerging Darwinian approach to analytics a...


In [13]:
len(view_log[view_log['userID'] == 'USER_0000'].Title.unique())

25

In [14]:
agg_view_log = view_log.groupby(['userID'])['Title'].agg({'unique'})
agg_view_log.head()

Unnamed: 0_level_0,unique
userID,Unnamed: 1_level_1
USER_0000,[Facebook says its new AI is almost as good as...
USER_0001,[These Are The 10 Most Purchased Brands in the...
USER_0002,[Par de alianças impede que seu parceiro assis...
USER_0003,[A importância dos filmes de mulherzinha - Cap...
USER_0004,[Governo brasileiro cria manual para contrataç...


In [15]:
for user_sentence in agg_view_log['unique'].values:
    print(user_sentence)
    break

['Facebook says its new AI is almost as good as humans at understanding context'
 'Study Shows Women and Minorities Are Punished for Speaking Up About Workplace Diversity'
 "Google's head of advertising talks ad blocking, mobile and micropayments"
 '7 Questions to Ask Before Your Next Digital Transformation'
 'Acquia Engage Awards Finalists Announced'
 'Google believes its superior AI will be the key to its future'
 'Equal Pay Day in the spotlight this year'
 "Amazon goes open source with machine-learning tech, competing with Google's TensorFlow - GeekWire"
 'The emerging Darwinian approach to analytics and augmented intelligence'
 'Vídeo impactante faz você sentir na pele o preconceito contra LGBTs'
 'Google is discontinuing Google+ Hangouts On Air on September 12, pushes users to YouTube Live'
 'Airbnb bets on local with user-generated guidebooks and new neighborhood/home matching'
 'Google I/O 2016 Preview: Machine Learning, Virtual Reality And Android N - ARC'
 "Shopping app Spring

In [16]:
len(user_sentence)

25

**Word2Vec 적용**

In [17]:
# int 형식은 Word2Vec에서 학습이 안되어서 String으로 변환해줘야 함
sentence = []
for user_sentence in agg_view_log['unique'].values:
    sentence.append(list(map(str, user_sentence)))

In [18]:
len(view_log['Title'].unique())

2879

In [19]:
len(sentence)

1415

```
의문  
word2vec을 학습시킬 때, 여기서는 각 user가 본 기사들을 하나의 리스트로 만들어서 sentence 리스트트의 한 요소로 추가해서 학습시켰는데  
이것과  
view_log['Title'].unique()의 값들 즉, 중복 제거한 기사 타이틀들을 학습시키는 것과의 차이점은?  
  
word2vec이 비슷한 의미를 가진 단어일 수록 embedding vector가 가까운 위치에 분포하도록 해야하니까 동일 user가 본 기사들을 묶어서 학습시켜야하는건가?
```

In [20]:
# Word2Vec 학습
from gensim.models import Word2Vec
embedding_model = Word2Vec(sentence, vector_size=20, window=5, min_count=1, workers=4, epochs=200, sg=1)

*'TypeError: Word2Vec.__init__() got an unexpected keyword argument'*  
`size` -> `vector_size`  
`iter` -> `epochs`

In [21]:
embedding_model.wv.most_similar(positive=["Google's head of advertising talks ad blocking, mobile and micropayments"], topn=10)

[('Management theory is becoming a compendium of dead ideas',
  0.8026271462440491),
 ('Do you want Crappy Agile?', 0.7741585969924927),
 ('PayPal is shutting down its Windows Phone, BlackBerry, and Amazon apps',
  0.7632311582565308),
 ("Mike Birbiglia's 6 Tips for Making It Small in Hollywood. Or Anywhere.",
  0.7544146776199341),
 ("Feeding 10 Billion People - Land O'Lakes & CTP", 0.7530540823936462),
 ('Que novas oportunidades poderemos criar com Blockchain? - CIO',
  0.7481043934822083),
 ('Use Poka-Yoke Technique to Improve Software Quality', 0.7476027607917786),
 ('Big IT Rising', 0.7372493147850037),
 ('The Startup Pivot Pyramid - How to Pivot Your Startup the Right Away Using Growth Marketing Mindset',
  0.7208951711654663),
 ("Breaking: Apple's Search Ads go LIVE!", 0.720363974571228)]

In [24]:
len(sample_sub)/5

1415.0

In [50]:
grouped_titles = view_log.groupby('userID')['Title'].apply(list).reset_index()
print(grouped_titles.head())

      userID                                              Title
0  USER_0000  [Facebook says its new AI is almost as good as...
1  USER_0001  [These Are The 10 Most Purchased Brands in the...
2  USER_0002  [Par de alianças impede que seu parceiro assis...
3  USER_0003  [A importância dos filmes de mulherzinha - Cap...
4  USER_0004  [Governo brasileiro cria manual para contrataç...


In [58]:
f_df = grouped_titles[grouped_titles['userID'] == 'USER_0000']['Title']

for index, value in f_df.items():
    print(value)

['Facebook says its new AI is almost as good as humans at understanding context', 'Study Shows Women and Minorities Are Punished for Speaking Up About Workplace Diversity', "Google's head of advertising talks ad blocking, mobile and micropayments", '7 Questions to Ask Before Your Next Digital Transformation', 'Acquia Engage Awards Finalists Announced', 'Google believes its superior AI will be the key to its future', 'Google believes its superior AI will be the key to its future', 'Equal Pay Day in the spotlight this year', "Amazon goes open source with machine-learning tech, competing with Google's TensorFlow - GeekWire", 'The emerging Darwinian approach to analytics and augmented intelligence', 'Equal Pay Day in the spotlight this year', 'Vídeo impactante faz você sentir na pele o preconceito contra LGBTs', 'Google is discontinuing Google+ Hangouts On Air on September 12, pushes users to YouTube Live', 'Airbnb bets on local with user-generated guidebooks and new neighborhood/home matc

In [62]:
for idx, row in grouped_titles.iterrows():
    print(row['Title'])
    break

['Facebook says its new AI is almost as good as humans at understanding context', 'Study Shows Women and Minorities Are Punished for Speaking Up About Workplace Diversity', "Google's head of advertising talks ad blocking, mobile and micropayments", '7 Questions to Ask Before Your Next Digital Transformation', 'Acquia Engage Awards Finalists Announced', 'Google believes its superior AI will be the key to its future', 'Google believes its superior AI will be the key to its future', 'Equal Pay Day in the spotlight this year', "Amazon goes open source with machine-learning tech, competing with Google's TensorFlow - GeekWire", 'The emerging Darwinian approach to analytics and augmented intelligence', 'Equal Pay Day in the spotlight this year', 'Vídeo impactante faz você sentir na pele o preconceito contra LGBTs', 'Google is discontinuing Google+ Hangouts On Air on September 12, pushes users to YouTube Live', 'Airbnb bets on local with user-generated guidebooks and new neighborhood/home matc

### user가 본 기사들의 vector 평균으로 추천하기

In [74]:
all_recommended_items = []
for idx, row in grouped_titles.iterrows():
    sentence_vectors = []
    for sentence in row['Title']:
        sentence_vectors.append(embedding_model.wv[sentence])
    average_vector = np.mean(sentence_vectors, axis=0)
    similar_items = embedding_model.wv.similar_by_vector(average_vector, topn=5)
    recommended_items = [item[0] for item in similar_items]
    all_recommended_items.append((row['userID'], recommended_items))

In [81]:
recommended_df = pd.DataFrame(all_recommended_items, columns=['userID', 'Title'])
recommended_df.set_index('userID', inplace=True)
print(recommended_df.head())

                                                       Title
userID                                                      
USER_0000  [Google's head of advertising talks ad blockin...
USER_0001  [7 Vital SEO Trends for Google Rankings in 201...
USER_0002  [Código Google: Introdução da próxima geração ...
USER_0003  [Building A Better Workforce: 5 Talent Trends ...
USER_0004  [The Continuous Delivery Maturity Model, GitLa...


### Method 1 : Using 'merge'

In [89]:
flat_recommended_items = []
for idx, items in recommended_df['Title'].items():
    for item in items:
        flat_recommended_items.append((idx, item))

## Flatten the recommended items into a DataFrame
flat_recommended_df = pd.DataFrame(flat_recommended_items, columns=['userID', 'Title'])

# Mmerge to get aritlceID
final_recommended_df = flat_recommended_df.merge(article_info[['articleID', 'Title']], on='Title', how='left')

In [90]:
final_recommended_df.head(10)

Unnamed: 0,userID,Title,articleID
0,USER_0000,Google's head of advertising talks ad blocking...,ARTICLE_1345
1,USER_0000,Airbnb bets on local with user-generated guide...,ARTICLE_1769
2,USER_0000,Requiem for a Dream (2000),ARTICLE_0430
3,USER_0000,The Spotify Tribe,ARTICLE_1616
4,USER_0000,Embracing Agile,ARTICLE_0411
5,USER_0001,7 Vital SEO Trends for Google Rankings in 2017...,ARTICLE_2087
6,USER_0001,How Coca-Cola Is Harvesting Innovation Energy ...,ARTICLE_2958
7,USER_0001,The Ultimate Guide to SEO Reporting: Starting ...,ARTICLE_0142
8,USER_0001,Ray Kurzweil: The world isn't getting worse - ...,ARTICLE_1568
9,USER_0001,These Are The 10 Most Purchased Brands in the ...,ARTICLE_1408


In [91]:
# Optionally, you can pivot back to the original structure if needed
pivot_recommended_df = final_recommended_df.pivot_table(index='userID', values='articleID', aggfunc=list)
pivot_recommended_df.head(10)

Unnamed: 0_level_0,articleID
userID,Unnamed: 1_level_1
USER_0000,"[ARTICLE_1345, ARTICLE_1769, ARTICLE_0430, ART..."
USER_0001,"[ARTICLE_2087, ARTICLE_2958, ARTICLE_0142, ART..."
USER_0002,"[ARTICLE_2803, ARTICLE_2674, ARTICLE_1108, ART..."
USER_0003,"[ARTICLE_2545, ARTICLE_0958, ARTICLE_1918, ART..."
USER_0004,"[ARTICLE_1947, ARTICLE_2999, ARTICLE_1489, ART..."
USER_0005,"[ARTICLE_2589, ARTICLE_2223, ARTICLE_0336, ART..."
USER_0006,"[ARTICLE_1474, ARTICLE_2389, ARTICLE_1717, ART..."
USER_0007,"[ARTICLE_0878, ARTICLE_0699, ARTICLE_0979, ART..."
USER_0008,"[ARTICLE_2012, ARTICLE_1924, ARTICLE_2541, ART..."
USER_0009,"[ARTICLE_1587, ARTICLE_0590, ARTICLE_2165, ART..."


### Method 2 : Using 'set_index' and 'loc'

In [92]:
# Set article Title as index in article_info
article_info.set_index('Title', inplace=True)

In [112]:
# Function to get articleID for a given Title
def get_article_ids(titles):
    try:
        return [article_info.loc[title]['articleID'] for title in titles]
    except KeyError:
        return []

In [113]:
recommended_with_ids = []

for idx, row in recommended_df.iterrows():
    item_ids = get_article_ids(row['Title'])
    recommended_with_ids.append((idx, item_ids))

# Convert to DataFrame
recommended_with_ids_df = pd.DataFrame(recommended_with_ids, columns=['userID', 'articleID'])
recommended_with_ids_df.set_index('userID', inplace=True)

recommended_with_ids_df.head(10)

Unnamed: 0_level_0,articleID
userID,Unnamed: 1_level_1
USER_0000,"[ARTICLE_1345, ARTICLE_1769, ARTICLE_0430, ART..."
USER_0001,"[ARTICLE_2087, ARTICLE_2958, ARTICLE_0142, ART..."
USER_0002,"[ARTICLE_2803, ARTICLE_2674, ARTICLE_1108, ART..."
USER_0003,"[ARTICLE_2545, ARTICLE_0958, ARTICLE_1918, ART..."
USER_0004,"[ARTICLE_1947, ARTICLE_2999, ARTICLE_1489, ART..."
USER_0005,"[ARTICLE_2589, ARTICLE_2223, ARTICLE_0336, ART..."
USER_0006,"[ARTICLE_1474, ARTICLE_2389, ARTICLE_1717, ART..."
USER_0007,"[ARTICLE_0878, ARTICLE_0699, ARTICLE_0979, ART..."
USER_0008,"[ARTICLE_2012, ARTICLE_1924, ARTICLE_2541, ART..."
USER_0009,"[ARTICLE_1587, ARTICLE_0590, ARTICLE_2165, ART..."


In [119]:
recommended_with_ids_df['articleID']['USER_0000']

['ARTICLE_1345',
 'ARTICLE_1769',
 'ARTICLE_0430',
 'ARTICLE_1616',
 'ARTICLE_0411']

In [121]:
exploded_df = recommended_with_ids_df['articleID'].explode('articleID').reset_index(drop=True)
exploded_df.columns = ['userID', 'articleID']
exploded_df.head()

0    ARTICLE_1345
1    ARTICLE_1769
2    ARTICLE_0430
3    ARTICLE_1616
4    ARTICLE_0411
Name: articleID, dtype: object

In [83]:
# submission
top_recommendations = pd.DataFrame(recommendations, columns = ['userID', 'articleID'])

sample_sub['articleID'] = top_recommendations['articleID']

NameError: name 'recommendations' is not defined

recommended_with_ids_df의 정체를 모르겠다.. 'userID'와 'articleID'를 두 열로 가지는 데이터프레임은 아닌 것 같은데...

---

In [126]:
final_recommended_df[['userID', 'articleID']]

Unnamed: 0,userID,articleID
0,USER_0000,ARTICLE_1345
1,USER_0000,ARTICLE_1769
2,USER_0000,ARTICLE_0430
3,USER_0000,ARTICLE_1616
4,USER_0000,ARTICLE_0411
...,...,...
7070,USER_1420,ARTICLE_1711
7071,USER_1420,ARTICLE_0714
7072,USER_1420,ARTICLE_0030
7073,USER_1420,ARTICLE_2622


In [128]:
final_recommended_df.to_csv('word2vec(1)_submission.csv', index=False)

> **Rank** : -  
> **Score** : 0.22702