In [312]:
import numpy as np
import scipy
import pandas as pd
import math
import random
import sklearn
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse.linalg import svds
import matplotlib.pyplot as plt

In [313]:

# personId of the user whose profile is looked up in `user_profiles` further down.
UserID =8240000000000000000

In [314]:
# Load the item (tea) table from CSV and display it.
articles_df = pd.read_csv('teaable.csv')

articles_df

Unnamed: 0,차품목,효능번호,카페인,향기준,contentId,"맛,향",personId
0,Bengal Spice,1,X,2,a1,"계피, 생강",1
1,BerryBlossom White,1,O,1,a2,과일향(베리향),2
2,Black Cherry Berry,3,X,1,a3,"과일향(체리, 베리)",3
3,Breakfast in Paris Black Tea,1,O,14,a4,"라벤더꽃향, 시트러스향",4
4,ButterScotch Blondie,3,O,4,a5,"버터카라멜향, 바닐라향",5
...,...,...,...,...,...,...,...
65,요기티 Yogi Tea 스토맥 이즈 티 16티백,4,X,23,a66,"감초, 카다몬씨앗, 회향씨, 페퍼민트잎, 생강뿌리, 후추",66
66,요기티 Yogi Tea 진저 티 16티백,24,,2,a67,허브 약초 생강,67
67,요기티 Yogi Tea 콜드 시즌 티 16티백,4,X,234,a68,"생강뿌리, 감초, 유칼립투스잎, 오렌지껍질, 박하잎, 레몬그라스, 바질잎, 카아몬씨...",68
68,요기티 Yogi Tea 트롯 컴포트 티 허니 레몬 16티백,4,X,1,a69,"누릅나무껍질, 꿀 레몬",69


In [315]:
# Load the user-interaction log from CSV and preview its first rows.
interaction_df = pd.read_csv('users_interactiontea.csv')
interaction_df.head()

Unnamed: 0,timestamp,eventType,contentId,personId,sessionId,userAgent,userRegion,userCountry
0,1465413032,VIEW,a18,-8.85e+18,1.26e+18,,,
1,1465412560,VIEW,a41,-1.03e+18,3.62e+18,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2...,NY,US
2,1465416190,VIEW,a34,-1.13e+18,2.63e+18,,,
3,1465413895,FOLLOW,a25,3.44e+17,-3.17e+18,,,
4,1465412290,VIEW,a63,-4.45e+17,5.61e+18,,,


In [316]:
# Numeric weight for each interaction type; a higher value is treated as a
# stronger signal when strengths are summed later on.
event_type_strength = {
    'VIEW': 1.0,
    'LIKE': 2.0,
    'BOOKMARK': 2.5,
    'FOLLOW': 3.0,
    'COMMENT CREATED': 4.0,
}

# Translate every eventType into its weight. An event type missing from the
# table raises KeyError rather than silently producing NaN.
interaction_df['eventStrength'] = interaction_df['eventType'].apply(
    event_type_strength.__getitem__
)

In [317]:

# Keep only interactions from users who touched at least 5 distinct items.
#
# `transform('nunique')` broadcasts each user's distinct-item count back onto
# that user's rows in one vectorized pass. The previous
# `groupby(...).apply(lambda df: df.assign(...))` form ran Python code per user
# and relied on `apply` operating on the grouping column, which pandas 2.2
# deprecates (the `personId` column would no longer be part of the result).
#
# NOTE: rows keep the order of `interaction_df` instead of being regrouped by
# personId; the per-(personId, contentId) sum computed from this frame in the
# following cell does not depend on row order.
interaction_df_over5 = (interaction_df
  .assign(interactCnt = lambda d: d.groupby('personId')['contentId'].transform('nunique'))
  .loc[lambda d: d['interactCnt'] >= 5]
)

In [318]:
# One row per (personId, contentId): sum the event strengths of that pair,
# then add eventScore = log2(1 + summed strength).
interaction_full_df = (
  interaction_df_over5
    .groupby(['personId', 'contentId'])['eventStrength']
    .sum()
    .reset_index()
    .assign(eventScore = lambda d: np.log2(d['eventStrength'] + 1))
)

interaction_full_df.head(10)

Unnamed: 0,personId,contentId,eventStrength,eventScore
0,-9.22e+18,a1,1.0,1.0
1,-9.22e+18,a23,1.0,1.0
2,-9.22e+18,a30,1.0,1.0
3,-9.22e+18,a58,1.0,1.0
4,-9.22e+18,a66,1.0,1.0
5,-9.17e+18,a14,1.0,1.0
6,-9.17e+18,a20,1.0,1.0
7,-9.17e+18,a21,1.0,1.0
8,-9.17e+18,a23,1.0,1.0
9,-9.17e+18,a4,1.0,1.0


In [319]:
import nltk

# Fetch the NLTK 'stopwords' and 'punkt' resources (the log below reports both
# as already up-to-date on the machine that produced it).
# NOTE(review): no visible cell uses either resource — confirm they are still needed.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jeony\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jeony\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [320]:
# TF-IDF over word unigrams and bigrams, capped at 1000 features.
# Float min_df / max_df are document-frequency proportions: a term is dropped
# when it appears in fewer than 0.3% or in more than 50% of the documents.
vectorizer = TfidfVectorizer(
    analyzer='word', ngram_range=(1, 2),
    min_df=0.003, max_df=0.5,
    max_features=1000,
)

In [321]:
# item_ids[i] is the contentId of row i of tfidf_matrix: both are derived from
# articles_df in the same row order.
item_ids = articles_df['contentId'].tolist()
tfidf_matrix = vectorizer.fit_transform(articles_df['맛,향'])

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer `get_feature_names_out()` (converted to a plain list, which is what
# the old method returned) and fall back only on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    tfidf_feature_names = vectorizer.get_feature_names_out().tolist()
else:
    tfidf_feature_names = vectorizer.get_feature_names()

In [322]:
tfidf_matrix


<70x189 sparse matrix of type '<class 'numpy.float64'>'
	with 281 stored elements in Compressed Sparse Row format>

In [323]:
def get_item_profile(item_id):
    """Return the one-row slice of `tfidf_matrix` that belongs to `item_id`.

    The row is located through the position of `item_id` in the notebook-level
    `item_ids` list; `list.index` raises ValueError for an unknown id.
    """
    row = item_ids.index(item_id)
    return tfidf_matrix[row:row + 1]

def get_item_profiles(ids):
    """Stack the TF-IDF rows of every id in `ids`, in order, into one sparse matrix."""
    return scipy.sparse.vstack(list(map(get_item_profile, ids)))

def build_user_profile(person_id, interaction_indexed_df):
    """Build one user's normalized content profile.

    The profile is the eventStrength-weighted average of the TF-IDF rows of the
    items the user interacted with, passed through
    `sklearn.preprocessing.normalize`.

    Parameters
    ----------
    person_id
        Index label of the user in `interaction_indexed_df`.
    interaction_indexed_df : pandas.DataFrame
        Interactions indexed by personId, with at least the columns
        'contentId' and 'eventStrength'.

    Returns
    -------
    numpy.ndarray
        Array of shape (1, n_features).
    """
    # Imported locally so the function does not depend on `sklearn.preprocessing`
    # having been loaded as a side effect of another sklearn import.
    from sklearn.preprocessing import normalize

    # Wrapping the label in a list always yields a DataFrame. A bare label
    # returns a Series when the user has exactly one row, and the column
    # lookups below would then produce scalars instead of columns.
    interaction_person_df = interaction_indexed_df.loc[[person_id]]
    user_item_profiles = get_item_profiles(interaction_person_df['contentId'])

    user_item_strengths = np.array(interaction_person_df['eventStrength']).reshape(-1, 1)

    # Weight each item's TF-IDF row by its interaction strength and sum per feature.
    weighted_sum = user_item_profiles.multiply(user_item_strengths).sum(axis=0)

    # Summing a sparse matrix yields an np.matrix, which recent scikit-learn
    # versions reject; convert to a plain (1, n_features) ndarray first.
    user_item_strengths_weighted_avg = (
        np.asarray(weighted_sum).reshape(1, -1) / np.sum(user_item_strengths)
    )

    user_profile_norm = normalize(user_item_strengths_weighted_avg)
    return user_profile_norm

def build_user_profiles():
    """Return a dict mapping each personId to its profile from `build_user_profile`.

    Only rows of `interaction_full_df` whose contentId occurs in `articles_df`
    are taken into account.
    """
    has_known_item = interaction_full_df['contentId'].isin(articles_df['contentId'])
    interaction_indexed_df = interaction_full_df[has_known_item].set_index('personId')

    return {
        person_id: build_user_profile(person_id, interaction_indexed_df)
        for person_id in interaction_indexed_df.index.unique()
    }

In [324]:
# Build the profile of every user and report how many profiles were created.
user_profiles = build_user_profiles()
len(user_profiles)

397

In [325]:
# Pair every TF-IDF token with its weight in the selected user's profile and
# show the 20 highest-weighted tokens.
myprofile = user_profiles[UserID].flatten().tolist()
A = pd.DataFrame(
    sorted(
        zip(tfidf_feature_names, myprofile),
        key=lambda pair: pair[1],
        reverse=True,
    )[:20],
    columns=['token', 'relevance'],
)
A

Unnamed: 0,token,relevance
0,복숭아,0.413176
1,스피아민트,0.413176
2,민트,0.227303
3,홍차향,0.218815
4,복숭아향,0.206588
5,계피,0.20052
6,시원한,0.147
7,계피 홍차향,0.146531
8,견과향,0.13585
9,견과향 과일향,0.13585


In [326]:
class ContentBasedRecommender:
    """Recommend the items whose TF-IDF vector is closest to a user's profile.

    The similarity computation reads the notebook-level `user_profiles` mapping
    and `tfidf_matrix`. The `item_ids` passed to the constructor must be in the
    same order as the rows of `tfidf_matrix`.
    """

    MODEL_NAME = 'Content-Based'

    def __init__(self, item_ids, items_df=None):
        """Store the row-aligned item ids and the optional item metadata frame."""
        self.item_ids = item_ids
        self.items_df = items_df

    def get_model_name(self):
        """Return the display name of this model."""
        return self.MODEL_NAME

    def _get_similar_items_to_user_profile(self, person_id, topn=1000):
        """Return up to `topn` (contentId, similarity) pairs, most similar first."""
        # Cosine similarity between the user's profile and every item vector.
        cosine_similarities = cosine_similarity(user_profiles[person_id], tfidf_matrix)

        # Positions of the `topn` most similar items (argsort is ascending).
        similar_indices = cosine_similarities.argsort().flatten()[-topn:]

        # Resolve positions through the ids given to the constructor (the
        # original read the module-global `item_ids`, silently ignoring
        # `self.item_ids`), then order by descending similarity.
        similar_items = sorted(
            [(self.item_ids[i], cosine_similarities[0, i]) for i in similar_indices],
            key=lambda x: -x[1]
        )

        return similar_items

    def recommend_items(self, user_id, items_to_ignore=[], topn=10, verbose=False):
        """Return the top `topn` recommendations for `user_id` as a DataFrame.

        Parameters
        ----------
        user_id
            Key of the user in `user_profiles`.
        items_to_ignore
            contentIds to exclude from the result. The default list is only
            read, never mutated.
        topn : int
            Maximum number of rows returned.
        verbose : bool
            When True, join the recommendations with `items_df`.

        Returns
        -------
        pandas.DataFrame
            Columns ['contentId', 'recStrength'], or
            ['recStrength', 'contentId', 'title', 'url', 'lang'] in verbose mode.

        Raises
        ------
        Exception
            If `verbose` is True and no `items_df` was supplied.
        """
        similar_items = self._get_similar_items_to_user_profile(user_id)

        # Drop the items the caller asked to ignore (e.g. already-seen items).
        similar_items_filtered = list(filter(lambda x: x[0] not in items_to_ignore, similar_items))

        recommendations = (
            pd.DataFrame(similar_items_filtered, columns=['contentId', 'recStrength'])
              .head(topn)
        )

        if verbose:
            if self.items_df is None:
                raise Exception('"items_df" is required in verbose mode')
            # NOTE(review): the articles table displayed earlier in this notebook
            # shows no 'title', 'url' or 'lang' columns — confirm that verbose
            # mode is usable with this dataset.
            recommendations = (recommendations
                .merge(self.items_df, how='left', left_on='contentId', right_on='contentId')
                .loc[:, ['recStrength', 'contentId', 'title', 'url', 'lang']]
            )

        return recommendations

In [327]:
# Instantiate the content-based model with the row-aligned item ids and the item table.
content_based_model = ContentBasedRecommender(item_ids, articles_df)

In [328]:
#print(content_based_model.recommend_items( UserID, topn=10, verbose=False))

  contentId  recStrength
0       a40     0.413176
1       a64     0.413176
2       a70     0.413176
3       a13     0.311981
4       a32     0.268758
5       a26     0.264239
6       a41     0.263894
7       a15     0.224592
8       a50     0.220188
9       a31     0.218815


In [340]:
# Evaluation of the content-based model, disabled by wrapping it in a string
# literal (the cell only echoes the string).
# NOTE(review): `model_evaluator` is not defined in any cell visible in this notebook.
'''
print('콘텐츠 기반 추천 모형을 평가합니다')
cb_global_metrics, cb_detailed_results = model_evaluator.evaluate_model(content_based_model)
print('Global Metrics:\n{}'.format(cb_global_metrics))
cb_detailed_results.head(10)
'''

"\nprint('콘텐츠 기반 추천 모형을 평가합니다')\ncb_global_metrics, cb_detailed_results = model_evaluator.evaluate_model(content_based_model)\nprint('Global Metrics:\n{}'.format(cb_global_metrics))\ncb_detailed_results.head(10)\n"

In [341]:
# Build the user x item table of event strengths; pairs with no interaction become 0.
# NOTE(review): `interaction_train` is not defined in any cell visible in this
# notebook (execution counts jump from 328 to 340) — confirm the cell creating
# it still exists, otherwise this fails on a fresh kernel.
users_items_pivot_df = (interaction_train
  .pivot(index='personId', columns='contentId', values='eventStrength')
  .fillna(0)
)

In [342]:
users_items_pivot_df.iloc[:5, :5]

contentId,a1,a10,a11,a12,a13
personId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
-9.22e+18,1.0,0.0,0.0,0.0,0.0
-9.17e+18,0.0,0.0,0.0,0.0,0.0
-9.16e+18,0.0,0.0,0.0,0.0,1.0
-9.11e+18,0.0,0.0,0.0,0.0,0.0
-9.06e+18,0.0,0.0,0.0,0.0,0.0


In [343]:

# Extract the pivot table as a plain NumPy array and preview the first 10 rows.
users_items_pivot_matrix = users_items_pivot_df.values
users_items_pivot_matrix[:10]
# `.values` is an attribute, not a method: writing `.values()` raises
# "TypeError: 'numpy.ndarray' object is not callable".


array([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
        0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 1., 0., 1.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
        4., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
        0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0

In [344]:
# Number of latent factors to keep when factorizing the user-item matrix.
NUMBER_OF_FACTORS_MF = 15

# Factorize the user-item matrix with a truncated SVD (k singular triplets).
U, sigma, Vt = svds(users_items_pivot_matrix, k=NUMBER_OF_FACTORS_MF)

In [345]:
U.shape # (n_users, NUMBER_OF_FACTORS_MF); the earlier "(1140, 15)" note did not match this dataset
Vt.shape # (NUMBER_OF_FACTORS_MF, n_items); the earlier "(15, 2926)" note did not match this dataset

# Expand the 1-D array of singular values into a diagonal matrix so it can be
# used in the matrix products that rebuild the predictions.
sigma_mat = np.diag(sigma)
sigma_mat.shape # (15, 15)


(15, 15)

In [346]:
# Rebuild the low-rank approximation of the user-item matrix: (U . Sigma) . Vt.
all_user_predicted_ratings = (U @ sigma_mat) @ Vt
all_user_predicted_ratings

array([[ 0.26746268,  0.06704313,  0.06368426, ...,  0.19455052,
         0.08644696,  0.1927236 ],
       [ 0.13392597, -0.01762061,  0.10159447, ...,  0.30209132,
        -0.06555318,  0.23225892],
       [ 0.05919402,  0.08537293,  0.13488527, ...,  0.20250097,
        -0.00692742, -0.33672833],
       ...,
       [-0.07052776, -0.134407  , -0.03031464, ...,  0.2308758 ,
         0.260586  ,  0.59194663],
       [ 0.24338753, -0.19691057,  0.08066608, ...,  0.42185333,
         0.06913447,  0.07397796],
       [ 0.0561989 ,  0.02030964,  0.08036277, ...,  0.11285926,
        -0.09593488, -0.05293074]])

In [347]:
# Label the reconstructed matrix and transpose it so that rows are items
# (contentId) and columns are users (personId).
#
# The user labels are taken from the pivot table itself: the prediction rows
# come from factorizing `users_items_pivot_df.values`, so they are in the same
# order as `users_items_pivot_df.index`. The previous `index=user_ids` relied on
# a name that no visible cell defines. Passing a plain list keeps the resulting
# column axis unnamed, as in the output recorded below.
cf_preds_df = (
  pd.DataFrame(all_user_predicted_ratings,
               columns=users_items_pivot_df.columns,
               index=list(users_items_pivot_df.index))
    .transpose()
)

In [348]:
cf_preds_df.iloc[:5, :5]

Unnamed: 0_level_0,-9.220000e+18,-9.170000e+18,-9.160000e+18,-9.110000e+18,-9.060000e+18
contentId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
a1,0.267463,0.133926,0.059194,0.091688,-0.14063
a10,0.067043,-0.017621,0.085373,0.096314,0.246719
a11,0.063684,0.101594,0.134885,0.003129,0.016602
a12,-0.025175,0.018149,-0.007568,-0.008729,0.331574
a13,-0.045895,0.176139,0.027708,-0.00846,0.096299


In [349]:
class CFRecommender:
    """Serve recommendations from a precomputed item x user prediction table."""

    MODEL_NAME = 'Collaborative Filtering'

    def __init__(self, cf_predictions_df, items_df=None):
        """Store the prediction table and the optional item metadata frame.

        `cf_predictions_df` is read with one column per user and an index named
        'contentId' (see `recommend_items`).
        """
        self.cf_predictions_df = cf_predictions_df
        self.items_df = items_df

    def get_model_name(self):
        """Return the display name of this model."""
        return self.MODEL_NAME

    def recommend_items(self, user_id, items_to_ignore=[], topn=10, verbose=False):
        """Return the top `topn` predicted items for `user_id` as a DataFrame.

        Parameters
        ----------
        user_id
            Column label of the user in `cf_predictions_df`.
        items_to_ignore
            contentIds to exclude from the result. The default list is only
            read, never mutated.
        topn : int
            Maximum number of rows returned.
        verbose : bool
            When True, join the recommendations with `items_df`.

        Returns
        -------
        pandas.DataFrame
            Columns ['contentId', 'recStrength'], or
            ['recStrength', 'contentId', 'title', 'url', 'lang'] in verbose mode.

        Raises
        ------
        Exception
            If `verbose` is True and no `items_df` was supplied.
        """
        # Take this user's predictions and order them from highest to lowest.
        sorted_user_prediction = (self.cf_predictions_df
            .loc[:, user_id]
            .sort_values(ascending=False)
            .reset_index()
            .rename(columns={user_id: 'recStrength'})
        )

        recommendations = (sorted_user_prediction
            .loc[lambda d: ~d['contentId'].isin(items_to_ignore)]
            .sort_values('recStrength', ascending=False)
            .head(topn)
        )

        if verbose:
            # The attribute is `items_df`; the former `self.item_df` raised
            # AttributeError whenever verbose mode was requested.
            if self.items_df is None:
                raise Exception('"items_df" is required in verbose mode')

            recommendations = (recommendations
                .merge(self.items_df, how='left', left_on='contentId', right_on='contentId')
                .loc[:, ['recStrength', 'contentId', 'title', 'url', 'lang']]
            )

        return recommendations

In [350]:
# Instantiate the collaborative-filtering model with the prediction table and the item table.
cf_recommender_model = CFRecommender(cf_preds_df, articles_df)

In [351]:
# Evaluate the collaborative-filtering (SVD matrix factorization) model.
# NOTE(review): `model_evaluator` is not defined in any cell visible in this notebook.
print('협업 필터링(SVD 행렬분해) 모형을 평가합니다')
cf_global_metrics, cf_detailed_results = model_evaluator.evaluate_model(cf_recommender_model)
print('Global Metrics:\n{}'.format(cf_global_metrics))
cf_detailed_results.head(10)

# Pasted reference output (NOT produced by the run recorded below, which ends
# in a TypeError; the user count also differs from the 397 profiles built above):
# Evaluating the collaborative-filtering (SVD matrix factorization) model
# 1139 users processed
# Global Metrics:
# {'model_name': 'Collaborative Filtering', 'recall@5': 0.26553311173612887, 'recall@10': 0.39798005625159805}


협업 필터링(SVD 행렬분해) 모형을 평가합니다


TypeError: not all arguments converted during string formatting

In [352]:
class HybridRecommender:
    """Combine a content-based and a collaborative-filtering recommender.

    Items are scored by the product of the two models' `recStrength` values;
    only items returned by both models are kept (inner join on contentId).
    """

    MODEL_NAME = 'Hybrid'

    def __init__(self, cb_rec_model, cf_rec_model, items_df):
        """Store the two sub-models and the item metadata frame."""
        self.cb_rec_model = cb_rec_model
        self.cf_rec_model = cf_rec_model
        self.items_df = items_df

    def get_model_name(self):
        """Return the display name of this model."""
        return self.MODEL_NAME

    def recommend_items(self, user_id, items_to_ignore=[], topn=10, verbose=False):
        """Return the top `topn` hybrid recommendations for `user_id`.

        Parameters
        ----------
        user_id
            User identifier understood by both sub-models.
        items_to_ignore
            contentIds to exclude; forwarded unchanged to both sub-models. The
            default list is only read, never mutated.
        topn : int
            Maximum number of rows returned.
        verbose : bool
            When True, join the result with `items_df`.

        Returns
        -------
        pandas.DataFrame
            Includes 'contentId', 'recStrengthCB', 'recStrengthCF' and
            'recStrengthHybrid'; in verbose mode exactly
            ['recStrengthHybrid', 'contentId', 'title', 'url', 'lang'].

        Raises
        ------
        Exception
            If `verbose` is True and `items_df` is None.
        """
        # Fail before doing any work when verbose output cannot be produced.
        if verbose and self.items_df is None:
            raise Exception('"items_df" is required in verbose mode')

        # Top 1000 content-based recommendations. Sub-models are always queried
        # with verbose=False: item details are joined once at the end, so
        # forwarding `verbose` only added redundant merges and made the hybrid
        # fail whenever a sub-model's verbose path failed.
        cb_recs = (self.cb_rec_model
            .recommend_items(user_id, items_to_ignore=items_to_ignore, verbose=False, topn=1000)
            .rename(columns={'recStrength': 'recStrengthCB'})
        )

        # Top 1000 collaborative-filtering recommendations.
        cf_recs = (self.cf_rec_model
            .recommend_items(user_id, items_to_ignore=items_to_ignore, verbose=False, topn=1000)
            .rename(columns={'recStrength': 'recStrengthCF'})
        )

        # 1) Join the two result sets on contentId.
        # 2) Compute the hybrid score as the product of the CB and CF scores.
        # 3) Sort by the hybrid score, highest first.
        recommendations = (cb_recs
            .merge(cf_recs, how='inner', left_on='contentId', right_on='contentId')
            .assign(recStrengthHybrid = lambda d: d['recStrengthCB'] * d['recStrengthCF'])
            .sort_values('recStrengthHybrid', ascending=False)
            .head(topn)
        )

        if verbose:
            recommendations = (recommendations
                .merge(self.items_df, how='left', left_on='contentId', right_on='contentId')
                .loc[:, ['recStrengthHybrid', 'contentId', 'title', 'url', 'lang']]
            )

        return recommendations


In [353]:
# Combine the content-based and collaborative-filtering models into the hybrid model.
hybrid_recommender_model = HybridRecommender(content_based_model, cf_recommender_model, articles_df)


In [311]:
# Evaluate the hybrid model.
# NOTE(review): `model_evaluator` is not defined in any cell visible in this
# notebook, and this cell's execution count (311) precedes the import cell (312).
print('하이브리드 모형을 평가합니다')
hybrid_global_metrics, hybrid_detailed_results = model_evaluator.evaluate_model(hybrid_recommender_model)
print('Global Metrics:\n{}'.format(hybrid_global_metrics))
hybrid_detailed_results.head(10)

# Pasted reference output (NOT produced by the run recorded below, which ends
# in a TypeError):
# Evaluating the hybrid model
# 1139 users processed
# Global Metrics:
# {'model_name': 'Hybrid', 'recall@5': 0.38340577857325492, 'recall@10': 0.49757095372027615}


하이브리드 모형을 평가합니다


TypeError: not all arguments converted during string formatting