In [1]:
import numpy as np
import scipy
import pandas as pd
import math
import random
import sklearn
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse.linalg import svds
import matplotlib.pyplot as plt

In [2]:

UserID =-9220000000000000000

In [3]:
articles_df = pd.read_csv('C:/Users/Administrator/teaable2500.csv')

articles_df

Unnamed: 0,차품목,효능번호,카페인,향기준,contentId,"맛,향",personId
0,Bengal Spice,1,X,2,100001,"계피, 생강",1
1,BerryBlossom White,1,O,1,100002,과일향(베리향),2
2,Black Cherry Berry,3,X,1,100003,"과일향(체리, 베리)",3
3,Breakfast in Paris Black Tea,1,O,14,100004,"라벤더꽃향, 시트러스향",4
4,ButterScotch Blondie,3,O,4,100005,"버터카라멜향, 바닐라향",5
...,...,...,...,...,...,...,...
2495,aa2496,4,O,3,102496,190aa19,2496
2496,aa2497,4,O,4,102497,401aa71,2497
2497,aa2498,3,O,1,102498,262aa31,2498
2498,aa2499,2,X,4,102499,27aa27,2499


In [4]:
interaction_df = pd.read_csv('C:/Users/Administrator/userinteract7.csv')
interaction_df.head()

Unnamed: 0.1,Unnamed: 0,timestamp,eventType,contentId,personId,sessionId,userAgent,userRegion,userCountry,eventStrength,interactCnt,gender,age(15~60)
0,866,1463138398,VIEW,102336,-9.22e+18,-4.48e+18,Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKi...,SP,BR,1.0,43,male,18
1,1475,1463656314,VIEW,102124,-9.22e+18,-7.82e+18,Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKi...,SP,BR,1.0,43,male,18
2,2916,1464113091,VIEW,101361,-9.22e+18,-2.77e+18,Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKi...,SP,BR,1.0,43,male,18
3,5309,1462283851,VIEW,100580,-9.22e+18,2.63e+18,Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKi...,SP,BR,1.0,43,male,18
4,9221,1462452127,VIEW,102394,-9.22e+18,-3.67e+18,Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKi...,SP,BR,1.0,43,male,18


In [5]:
event_type_strength = {
    'VIEW': 1.0,
    'JJIM': 3.0, 
    'comment1' : 1.0,
    'comment2' : 2.0,
    'comment3' : 3.0,
    'comment4' : 4.0,
    'comment5' : 5.0
  
}

interaction_df['eventStrength'] = (
  interaction_df
    .loc[:, 'eventType']
    .apply(lambda d: event_type_strength[d])
)

In [6]:
interaction_df_over5 = (interaction_df
  .groupby('personId', group_keys=False)
  .apply(lambda df: df.assign(interactCnt = lambda d: d['contentId'].nunique()))
  .loc[lambda d: d['interactCnt'] >= 5]
)

In [7]:
interaction_full_df = (
  interaction_df_over5
    .groupby(['personId', 'contentId'], as_index=False)['eventStrength']
    .sum()
    .assign(eventScore = lambda d: np.log2(1+d['eventStrength']))
)

interaction_full_df.head(10)

Unnamed: 0,personId,contentId,eventStrength,eventScore
0,-9.22e+18,100050,1.0,1.0
1,-9.22e+18,100205,1.0,1.0
2,-9.22e+18,100214,1.0,1.0
3,-9.22e+18,100332,1.0,1.0
4,-9.22e+18,100380,1.0,1.0
5,-9.22e+18,100416,1.0,1.0
6,-9.22e+18,100427,1.0,1.0
7,-9.22e+18,100433,1.0,1.0
8,-9.22e+18,100487,1.0,1.0
9,-9.22e+18,100498,1.0,1.0


In [8]:
interaction_train, interaction_test = train_test_split(
    interaction_full_df,
    stratify=interaction_full_df['personId'],
    test_size=0.2,
    random_state=42
)

In [9]:
# 평가 속도를 높이기 위해 personId를 기준으로 인덱스를 설정한다
interaction_full_indexed = interaction_full_df.set_index('personId')
interaction_train_indexed = interaction_train.set_index('personId')
interaction_test_indexed = interaction_test.set_index('personId')

In [10]:
def get_items_interacted(person_id, interaction_df):
    interated_items = interaction_df.loc[person_id]['contentId']
    return set(interated_items if type(interated_items) == pd.Series else [interated_items])

In [11]:
class ModelEvaluator:
    def __init__(self, n_non_interacted=100):
        self.EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS = n_non_interacted
        
    def get_non_interacted_items_sample(self, person_id, sample_size, seed=42):
        interacted_items = get_items_interacted(person_id, interaction_full_indexed)
        all_items = set(articles_df['contentId'])
        non_interacted_items = all_items - interacted_items
        
        random.seed(seed)
        non_interacted_items_sample = random.sample(non_interacted_items, sample_size)
        return set(non_interacted_items_sample)
        
    def _verify_hit_top_n(self, item_id, recommend_items, topn):
        try:
            index = next(i for i, c in enumerate(recommend_items) if c == item_id)
        except:
            index = -1
        hit = int(index in range(0, topn))
        return hit, index
    
    def evaluate_model_for_user(self, model, person_id):
        interacted_values_testset = interaction_test_indexed.loc[person_id]
        if type(interacted_values_testset['contentId']) == pd.Series:
            person_interacted_items_testset = set(interacted_values_testset['contentId'])
        else:
            person_interacted_items_testset = set([int(interacted_values_testset['contentId'])])
        
        interacted_items_count_testset = len(person_interacted_items_testset)
        
        # 특정 사용자에 대한 추천 순위 목록을 받아온다
        person_recs = model.recommend_items(
            person_id,
            items_to_ignore=get_items_interacted(person_id, interaction_train_indexed),
            topn=10000000000
        )
        
        hits_at_5_count = 0
        hits_at_10_count = 0
        
        # test셋에서 사용자가 상호작용한 모든 항목에 대해 반복한다
        for item_id in person_interacted_items_testset:
            
            # 사용자가 상호작용하지 않은 100개 항목을 샘플링한다
            non_interacted_items_sample = self.get_non_interacted_items_sample(
                person_id,
                sample_size=self.EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS,
                seed=item_id % (2**32)
            )
            
            # 현재 선택한 item_id(상호작용 있었던 항목)와 100개 랜덤 샘플을 합친다
            items_to_filter_recs = non_interacted_items_sample.union(set([item_id]))
            
            # 추천 결과물 중에서 현재 선택한 item_id와 100개 랜덤 샘플의 결과물로만 필터링한다
            valid_recs_df = person_recs[person_recs['contentId'].isin(items_to_filter_recs)]
            valid_recs = valid_recs_df['contentId'].values
            
            # 현재 선택한 item_id가 Top-N 추천 결과 안에 있는지 확인한다
            hit_at_5, index_at_5 = self._verify_hit_top_n(item_id, valid_recs, 5)
            hits_at_5_count += hit_at_5
            hit_at_10, index_at_10 = self._verify_hit_top_n(item_id, valid_recs, 10)
            hits_at_10_count += hit_at_10
            
        # Recall 값은 상호작용 있었던 항목들 중에서 관련없는 항목들과 섞였을 때 Top-N에 오른 항목들의 비율로 나타낼 수 있다
        recall_at_5 = hits_at_5_count / interacted_items_count_testset
        recall_at_10 = hits_at_10_count / interacted_items_count_testset
        
        person_metrics = {
            'hits@5_count': hits_at_5_count,
            'hits@10_count': hits_at_10_count,
            'interacted_count': interacted_items_count_testset,
            'recall@5': recall_at_5,
            'recall@10': recall_at_10
        }
        return person_metrics
    
    def evaluate_model(self, model):
        people_metrics = []
        for idx, person_id in enumerate(list(interaction_test_indexed.index.unique().values)):
            person_metrics = self.evaluate_model_for_user(model, person_id)
            person_metrics['_person_id'] = person_id
            people_metrics.append(person_metrics)

        print('{} users processed'.format(idx))
        
        detailed_result = (
            pd.DataFrame(people_metrics)
              .sort_values('interacted_count', ascending=False)
        )
        
        global_recall_at_5 = detailed_result['hits@5_count'].sum() / detailed_result['interacted_count'].sum()
        global_recall_at_10 = detailed_result['hits@10_count'].sum() / detailed_result['interacted_count'].sum()
        
        global_metrics = {
            'model_name': model.get_model_name(),
            'recall@5': global_recall_at_5,
            'recall@10': global_recall_at_10
        }
        
        return global_metrics, detailed_result
model_evaluator = ModelEvaluator(n_non_interacted=100)

In [12]:
users_items_pivot_df = (interaction_train
  .pivot(index='personId', columns='contentId', values='eventStrength')
  .fillna(0)
)

In [13]:
users_items_pivot_df.iloc[:5, :5]

contentId,100001,100002,100003,100004,100005
personId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
-9.22e+18,0.0,0.0,0.0,0.0,0.0
-9.21e+18,0.0,0.0,0.0,0.0,0.0
-9.2e+18,0.0,0.0,0.0,0.0,0.0
-9.19e+18,0.0,0.0,0.0,0.0,0.0
-9.17e+18,1.0,0.0,0.0,0.0,0.0


In [16]:
users_items_pivot_matrix = users_items_pivot_df.values
users_items_pivot_matrix[:10]


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [17]:
user_ids = list(users_items_pivot_df.index)
user_ids[:10]

[-9.22e+18,
 -9.21e+18,
 -9.2e+18,
 -9.19e+18,
 -9.17e+18,
 -9.16e+18,
 -9.12e+18,
 -9.11e+18,
 -9.06e+18,
 -9.05e+18]

In [18]:
# User-Item matrix에서 요인의 개수를 정한다
NUMBER_OF_FACTORS_MF = 15

# User-Item Matrix을 분해한다
U, sigma, Vt = svds(users_items_pivot_matrix, k=NUMBER_OF_FACTORS_MF)

In [19]:
U.shape # (1140, 15)
Vt.shape # (15, 2926)

sigma_mat = np.diag(sigma)
sigma_mat.shape # (15, 15)

(15, 15)

In [20]:
all_user_predicted_ratings = np.dot(np.dot(U, sigma_mat), Vt)
all_user_predicted_ratings


array([[ 0.02477427,  0.00602951,  0.02033127, ...,  0.01111308,
         0.00415151,  0.02243625],
       [ 0.03499375,  0.01902586,  0.01857287, ...,  0.00344874,
         0.00241708,  0.01209648],
       [ 0.00275495,  0.01420227,  0.00420251, ...,  0.00770875,
         0.0010616 , -0.00173508],
       ...,
       [ 0.03554231,  0.00952516,  0.04635364, ...,  0.01533133,
         0.01402736,  0.02276479],
       [ 0.02878391,  0.003651  ,  0.0052518 , ...,  0.01254462,
         0.00041618,  0.0049887 ],
       [ 0.00699991,  0.03070677,  0.06336153, ...,  0.08953137,
         0.00939887,  0.07696893]])

In [21]:
# 재구성한 행렬을 pandas DataFrame으로 변환한다
cf_preds_df = (
  pd.DataFrame(all_user_predicted_ratings, 
               columns=users_items_pivot_df.columns, 
               index=user_ids)
    .transpose()
)

In [22]:
cf_preds_df.iloc[:5, :5]

Unnamed: 0_level_0,-9.220000e+18,-9.210000e+18,-9.200000e+18,-9.190000e+18,-9.170000e+18
contentId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
100001,0.024774,0.034994,0.002755,0.006149,0.009665
100002,0.00603,0.019026,0.014202,0.003231,0.006342
100003,0.020331,0.018573,0.004203,0.013099,8.5e-05
100004,0.025484,0.006073,0.023094,-0.001203,0.000308
100005,0.00068,-0.009058,0.010602,-0.001228,0.045042


In [23]:
class CFRecommender:
    
    MODEL_NAME = 'Collaborative Filtering'
    
    def __init__(self, cf_predictions_df, items_df=None):
        self.cf_predictions_df = cf_predictions_df
        self.items_df = items_df
        
    def get_model_name(self):
        return self.MODEL_NAME
    
    def recommend_items(self, user_id, items_to_ignore=[], topn=10, verbose=False):
        # 사용자에 대한 예측값을 가져와서 정렬한다
        sorted_user_prediction = (self.cf_predictions_df
            .loc[:, user_id]
            .sort_values(ascending=False)
            .reset_index()
            .rename(columns={user_id: 'recStrength'})
        )
        
        recommendations = (sorted_user_prediction
            .loc[lambda d: ~d['contentId'].isin(items_to_ignore)]
            .sort_values('recStrength', ascending=False)
            .head(topn)
        )
        
        if verbose:
            if self.item_df is None:
                raise Exception('"items_df" is required in verbose mode')
            
            recommendations = (recommendations
                .merge(self.items_df, how='left', left_on='contentId', right_on='contentId')
                .loc[:, ['recStrength', 'contentId', 'title', 'url', 'lang']]
            )
            
        return recommendations

In [24]:
cf_recommender_model = CFRecommender(cf_preds_df, articles_df)

In [25]:
print('협업 필터링(SVD 행렬분해) 모형을 평가합니다')
cf_global_metrics, cf_detailed_results = model_evaluator.evaluate_model(cf_recommender_model)
print('Global Metrics:\n{}'.format(cf_global_metrics))
cf_detailed_results.head(10)

협업 필터링(SVD 행렬분해) 모형을 평가합니다
865 users processed
Global Metrics:
{'model_name': 'Collaborative Filtering', 'recall@5': 0.04972079859251893, 'recall@10': 0.0983706876768913}


Unnamed: 0,hits@5_count,hits@10_count,interacted_count,recall@5,recall@10,_person_id
137,8,24,264,0.030303,0.090909,-1.03e+18
29,15,26,235,0.06383,0.110638,-1.44e+18
174,17,27,221,0.076923,0.122172,3.61e+18
76,12,19,177,0.067797,0.107345,-2.63e+18
42,5,17,158,0.031646,0.107595,-3.6e+18
18,8,20,149,0.053691,0.134228,-2.98e+18
215,6,10,143,0.041958,0.06993,-7.09e+17
4,8,20,143,0.055944,0.13986,3.3e+18
8,3,9,127,0.023622,0.070866,3.64e+18
85,8,13,123,0.065041,0.105691,1.12e+18
