In [12]:
import numpy as np
import scipy
import pandas as pd
import math
import random
import sklearn
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse.linalg import svds
import matplotlib.pyplot as plt

In [13]:
# Target user id: recommendations further down are produced for this id.
UserID = 3_610_000_000_000_000_000

In [14]:
# Load the tea catalogue from CSV; the bare name at the end of the cell
# renders the resulting frame.
articles_df = pd.read_csv('teaable.csv')

articles_df

Unnamed: 0,차품목,효능번호,카페인,향기준,contentId,"맛,향",personId
0,Bengal Spice,1,X,2,a1,"계피, 생강",1
1,BerryBlossom White,1,O,1,a2,과일향(베리향),2
2,Black Cherry Berry,3,X,1,a3,"과일향(체리, 베리)",3
3,Breakfast in Paris Black Tea,1,O,14,a4,"라벤더꽃향, 시트러스향",4
4,ButterScotch Blondie,3,O,4,a5,"버터카라멜향, 바닐라향",5
...,...,...,...,...,...,...,...
65,요기티 Yogi Tea 스토맥 이즈 티 16티백,4,X,23,a66,"감초, 카다몬씨앗, 회향씨, 페퍼민트잎, 생강뿌리, 후추",66
66,요기티 Yogi Tea 진저 티 16티백,24,,2,a67,허브 약초 생강,67
67,요기티 Yogi Tea 콜드 시즌 티 16티백,4,X,234,a68,"생강뿌리, 감초, 유칼립투스잎, 오렌지껍질, 박하잎, 레몬그라스, 바질잎, 카아몬씨...",68
68,요기티 Yogi Tea 트롯 컴포트 티 허니 레몬 16티백,4,X,1,a69,"누릅나무껍질, 꿀 레몬",69


In [15]:
# Load the user-interaction log from CSV and preview its first rows.
interaction_df = pd.read_csv('users_interactiontea.csv')
interaction_df.head()

Unnamed: 0,timestamp,eventType,contentId,personId,sessionId,userAgent,userRegion,userCountry
0,1465413032,VIEW,a18,-8.85e+18,1.26e+18,,,
1,1465412560,VIEW,a41,-1.03e+18,3.62e+18,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2...,NY,US
2,1465416190,VIEW,a34,-1.13e+18,2.63e+18,,,
3,1465413895,FOLLOW,a25,3.44e+17,-3.17e+18,,,
4,1465412290,VIEW,a63,-4.45e+17,5.61e+18,,,


In [16]:
# Numeric weight for each eventType value found in the interaction log.
event_type_strength = {
    'VIEW': 1.0,
    'LIKE': 2.0,
    'BOOKMARK': 2.5,
    'FOLLOW': 3.0,
    'COMMENT CREATED': 4.0,
}

# Translate every row's eventType into its weight; an event type missing
# from the table above raises KeyError.
interaction_df['eventStrength'] = interaction_df['eventType'].apply(
    lambda event_name: event_type_strength[event_name]
)

In [17]:
# Keep only users who interacted with at least 5 distinct items.
# transform('nunique') broadcasts each user's distinct-item count back onto
# that user's rows in one vectorized pass, replacing the per-group
# apply/assign (slow, and deprecated in recent pandas because the applied
# function also received the grouping column).
interaction_df_over5 = (interaction_df
  .assign(interactCnt = lambda d: d.groupby('personId')['contentId'].transform('nunique'))
  .loc[lambda d: d['interactCnt'] >= 5]
)

In [18]:
# Collapse the log to one row per (personId, contentId) pair by summing the
# event weights of that pair.
interaction_full_df = interaction_df_over5.groupby(
    ['personId', 'contentId'], as_index=False
)['eventStrength'].sum()

# eventScore is the log2-smoothed version of the summed strength.
interaction_full_df = interaction_full_df.assign(
    eventScore=lambda pairs: np.log2(1 + pairs['eventStrength'])
)

interaction_full_df.head(10)

Unnamed: 0,personId,contentId,eventStrength,eventScore
0,-9.22e+18,a1,1.0,1.0
1,-9.22e+18,a23,1.0,1.0
2,-9.22e+18,a30,1.0,1.0
3,-9.22e+18,a58,1.0,1.0
4,-9.22e+18,a66,1.0,1.0
5,-9.17e+18,a14,1.0,1.0
6,-9.17e+18,a20,1.0,1.0
7,-9.17e+18,a21,1.0,1.0
8,-9.17e+18,a23,1.0,1.0
9,-9.17e+18,a4,1.0,1.0


In [19]:
import nltk

# Download the NLTK 'stopwords' and 'punkt' packages.
# NOTE(review): no visible cell passes stop words or a tokenizer to the
# vectorizer — confirm these downloads are still needed.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hahaj\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hahaj\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [20]:


# TF-IDF configuration for the flavour/aroma text of each tea.
vectorizer = TfidfVectorizer(
    max_features=1000,   # cap on vocabulary size
    max_df=0.5,          # upper document-frequency bound (proportion)
    min_df=0.003,        # lower document-frequency bound (proportion)
    ngram_range=(1, 2),  # unigrams and bigrams
    analyzer='word',
)

In [22]:
# Item ids in catalogue row order: row i of tfidf_matrix is built from the
# same catalogue row as item_ids[i].
item_ids = articles_df['contentId'].tolist()
tfidf_matrix = vectorizer.fit_transform(articles_df['맛,향'])

# get_feature_names() was removed in scikit-learn 1.2. Prefer
# get_feature_names_out() (converted to a plain list, as before) and fall
# back to the legacy method only on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    tfidf_feature_names = vectorizer.get_feature_names_out().tolist()
else:
    tfidf_feature_names = vectorizer.get_feature_names()

In [23]:
# Show the sparse matrix summary (shape and number of stored elements).
tfidf_matrix

<70x189 sparse matrix of type '<class 'numpy.float64'>'
	with 281 stored elements in Compressed Sparse Row format>

In [24]:
def get_item_profile(item_id):
    """Return the TF-IDF row of ``item_id`` as a one-row slice of ``tfidf_matrix``.

    The row position is looked up in the notebook-level ``item_ids`` list;
    ``list.index`` raises ValueError when the id is not present.
    """
    row = item_ids.index(item_id)
    return tfidf_matrix[row:row + 1]

def get_item_profiles(ids):
    """Stack the TF-IDF rows of ``ids`` vertically, in the order given."""
    rows = []
    for item_id in ids:
        rows.append(get_item_profile(item_id))
    return scipy.sparse.vstack(rows)

def build_user_profile(person_id, interaction_indexed_df):
    """Build one user's normalized profile vector from the items they interacted with.

    Parameters
    ----------
    person_id
        Index label of the user in ``interaction_indexed_df``.
    interaction_indexed_df : pandas.DataFrame
        Interactions indexed by person id, with ``contentId`` and
        ``eventStrength`` columns.

    Returns
    -------
    numpy.ndarray
        Array of shape (1, n_features): the eventStrength-weighted average of
        the user's item TF-IDF rows, passed through
        ``sklearn.preprocessing.normalize``.

    Raises
    ------
    KeyError
        If ``person_id`` is not in the index.
    """
    # Imported explicitly so the function does not rely on another import
    # having loaded the sklearn.preprocessing submodule as a side effect.
    from sklearn.preprocessing import normalize

    # A list selector always yields a DataFrame. With a scalar label a user
    # with exactly one row would come back as a Series, and 'contentId' would
    # then be a single string that gets iterated character by character.
    interaction_person_df = interaction_indexed_df.loc[[person_id]]
    user_item_profiles = get_item_profiles(interaction_person_df['contentId'])

    user_item_strengths = np.array(interaction_person_df['eventStrength']).reshape(-1, 1)

    # Average of the item rows, weighted by interaction strength.
    weighted_sum = user_item_profiles.multiply(user_item_strengths).sum(axis=0)

    # Summing a sparse matrix gives np.matrix, which recent scikit-learn
    # releases refuse; convert to a plain (1, n_features) ndarray first.
    user_item_strengths_weighted_avg = (
        np.asarray(weighted_sum).reshape(1, -1) / np.sum(user_item_strengths)
    )

    user_profile_norm = normalize(user_item_strengths_weighted_avg)
    return user_profile_norm

def build_user_profiles():
    """Build a ``{person_id: profile}`` mapping for every user in ``interaction_full_df``.

    Interactions whose ``contentId`` is absent from ``articles_df`` are
    dropped before the profiles are computed.
    """
    known_items = articles_df['contentId']
    in_catalogue = interaction_full_df['contentId'].isin(known_items)
    indexed = interaction_full_df.loc[in_catalogue].set_index('personId')

    return {
        person_id: build_user_profile(person_id, indexed)
        for person_id in indexed.index.unique()
    }

In [25]:
# Build the profile of every eligible user and show how many were produced.
user_profiles = build_user_profiles()
len(user_profiles)

397

In [26]:
# Flatten the selected user's profile into a plain list of per-token weights.
myprofile = user_profiles[UserID].flatten().tolist()

# Pair each token with its weight, sort by descending weight and keep the
# first 20 entries.
A = pd.DataFrame(
    sorted(
        zip(tfidf_feature_names, myprofile),
        key=lambda pair: -pair[1],
    )[:20],
    columns=['token', 'relevance'],
)
A

Unnamed: 0,token,relevance
0,과일향,0.34686
1,계피,0.312246
2,시트러스향,0.20696
3,바닐라향,0.188931
4,복숭아향,0.187116
5,홍차향,0.181298
6,캐모마일향,0.180022
7,복숭아,0.175093
8,생강,0.173872
9,베리향,0.156172


In [27]:
# Hold out 20% of the (person, item) rows, stratified by personId, with a
# fixed seed so the split is repeatable.
interaction_train, interaction_test = train_test_split(
    interaction_full_df,
    test_size=0.2,
    random_state=42,
    stratify=interaction_full_df['personId'],
)

In [28]:
# Index the training interactions by personId for label-based lookup.
interaction_train_indexed = interaction_train.set_index('personId')

In [33]:
class ContentBasedRecommender:
    """Recommend items whose TF-IDF profile is closest to a user's profile.

    Similarity scoring reads the notebook-level ``user_profiles`` mapping and
    ``tfidf_matrix``; both must exist before recommendations are requested.
    """

    MODEL_NAME = 'Content-Based'

    def __init__(self, item_ids, items_df=None):
        """Store the item ids and, optionally, the item catalogue.

        Parameters
        ----------
        item_ids : list
            Item ids used to translate similarity column positions into ids;
            expected to follow the row order of ``tfidf_matrix``.
        items_df : pandas.DataFrame, optional
            Catalogue with a ``contentId`` column; only needed for
            ``recommend_items(..., verbose=True)``.
        """
        self.item_ids = item_ids
        self.items_df = items_df

    def get_model_name(self):
        """Return the display name of this model."""
        return self.MODEL_NAME

    def _get_similar_items_to_user_profile(self, person_id, topn=1000):
        """Return ``(item_id, similarity)`` pairs sorted by descending similarity.

        Raises KeyError when ``person_id`` has no entry in ``user_profiles``.
        """
        # Cosine similarity between the user's profile and every item profile.
        cosine_similarities = cosine_similarity(user_profiles[person_id], tfidf_matrix)

        # Positions of the `topn` most similar items.
        similar_indices = cosine_similarities.argsort().flatten()[-topn:]

        # Use the ids handed to the constructor instead of a notebook global,
        # then order the pairs by descending similarity.
        similar_items = sorted(
            [(self.item_ids[i], cosine_similarities[0, i]) for i in similar_indices],
            key=lambda x: -x[1]
        )

        return similar_items

    @staticmethod
    def get_items_interacted(person_id, interaction_df):
        """Return the set of ``contentId`` values ``person_id`` interacted with.

        ``interaction_df`` must be indexed by person id. Declared as a static
        method because it takes no ``self``; without the decorator the call
        failed when made on an instance.
        """
        interacted_items = interaction_df.loc[person_id]['contentId']

        # A user with a single row yields a scalar instead of a Series.
        if isinstance(interacted_items, pd.Series):
            return set(interacted_items)
        return {interacted_items}

    def recommend_items(self, user_id, items_to_ignore=None, topn=10, verbose=False):
        """Return the ``topn`` most similar items for ``user_id``.

        Parameters
        ----------
        user_id
            Key of the user in ``user_profiles``.
        items_to_ignore : container, optional
            Item ids to exclude (for example items already interacted with).
        topn : int
            Maximum number of rows returned.
        verbose : bool
            When True, catalogue columns from ``items_df`` are merged in.

        Returns
        -------
        pandas.DataFrame
            ``contentId`` and ``recStrength`` columns; in verbose mode
            ``recStrength``, ``contentId`` and the catalogue detail columns.

        Raises
        ------
        Exception
            If ``verbose`` is True but no ``items_df`` was supplied.
        """
        ignored = () if items_to_ignore is None else items_to_ignore
        similar_items = self._get_similar_items_to_user_profile(user_id)

        # Drop the items the caller asked to exclude.
        similar_items_filtered = [pair for pair in similar_items if pair[0] not in ignored]

        recommendations = (
            pd.DataFrame(similar_items_filtered, columns=['contentId', 'recStrength'])
              .head(topn)
        )

        if verbose:
            if self.items_df is None:
                raise Exception('"items_df" is required in verbose mode')
            merged = recommendations.merge(
                self.items_df, how='left', left_on='contentId', right_on='contentId'
            )
            # Prefer the title/url/lang columns when the catalogue has them;
            # otherwise show every catalogue column instead of failing with a
            # KeyError on columns that do not exist.
            detail_cols = [c for c in ('title', 'url', 'lang') if c in merged.columns]
            if not detail_cols:
                detail_cols = [
                    c for c in merged.columns if c not in ('recStrength', 'contentId')
                ]
            recommendations = merged.loc[:, ['recStrength', 'contentId'] + detail_cols]

        return recommendations

In [34]:
# Instantiate the recommender with the catalogue's item ids and the catalogue itself.
content_based_model = ContentBasedRecommender(item_ids, articles_df)

In [35]:
# Print the top-10 recommendations for UserID as plain text.
print(content_based_model.recommend_items( UserID, topn=10, verbose=False))

  contentId  recStrength
0       a58     0.346860
1       a28     0.335571
2        a1     0.313791
3       a13     0.310045
4        a2     0.306085
5       a53     0.302607
6       a51     0.282655
7       a48     0.269115
8       a56     0.267978
9       a30     0.266913
