In [1]:
import pandas as pd
import math
from sklearn.model_selection import train_test_split
import random
import time
from operator import itemgetter
from gensim.models import word2vec

## 加载数据


In [2]:
data = pd.read_csv('ml-1m/ratings.dat',sep='::')
data.columns=['user_id','movie_id','rating','timestamp']
data['datetime'] = pd.to_datetime(data['timestamp'].apply(lambda x:time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(x/1000))))
data.head()

  """Entry point for launching an IPython kernel.


Unnamed: 0,user_id,movie_id,rating,timestamp,datetime
0,1,661,3,978302109,1970-01-12 15:45:02
1,1,914,3,978301968,1970-01-12 15:45:01
2,1,3408,4,978300275,1970-01-12 15:45:00
3,1,2355,5,978824291,1970-01-12 15:53:44
4,1,1197,3,978302268,1970-01-12 15:45:02


In [3]:
data['month'] = data['datetime'].dt.month
data['day'] = data['datetime'].dt.day
data['hour'] = data['datetime'].dt.hour
data['minute'] = data['datetime'].dt.minute
data['second'] = data['datetime'].dt.second
del data['timestamp']
data.head()

Unnamed: 0,user_id,movie_id,rating,datetime,month,day,hour,minute,second
0,1,661,3,1970-01-12 15:45:02,1,12,15,45,2
1,1,914,3,1970-01-12 15:45:01,1,12,15,45,1
2,1,3408,4,1970-01-12 15:45:00,1,12,15,45,0
3,1,2355,5,1970-01-12 15:53:44,1,12,15,53,44
4,1,1197,3,1970-01-12 15:45:02,1,12,15,45,2


In [4]:
data = data.sort_values(by=['user_id','datetime'])
data.head()

Unnamed: 0,user_id,movie_id,rating,datetime,month,day,hour,minute,second
2,1,3408,4,1970-01-12 15:45:00,1,12,15,45,0
6,1,2804,5,1970-01-12 15:45:00,1,12,15,45,0
20,1,720,3,1970-01-12 15:45:00,1,12,15,45,0
21,1,1270,5,1970-01-12 15:45:00,1,12,15,45,0
23,1,2340,3,1970-01-12 15:45:00,1,12,15,45,0


## 分割测试集和训练集
每个用户最后购买的5个商品划分到测试集

In [5]:
train_df = pd.DataFrame()
test_df = pd.DataFrame()
for user, df in data.groupby('user_id'):
    train_df = pd.concat([train_df,df.head(len(df)-5)],ignore_index=True)
    test_df = pd.concat([test_df,df.tail(5)], ignore_index=True)   
train_df.head()
test_df.head()

Unnamed: 0,user_id,movie_id,rating,datetime,month,day,hour,minute,second
0,1,1566,4,1970-01-12 15:53:44,1,12,15,53,44
1,1,588,4,1970-01-12 15:53:44,1,12,15,53,44
2,1,1907,4,1970-01-12 15:53:44,1,12,15,53,44
3,1,783,4,1970-01-12 15:53:44,1,12,15,53,44
4,1,1,5,1970-01-12 15:53:44,1,12,15,53,44


In [6]:
train_df["movie_id"] = train_df["movie_id"].astype('str')
test_df["movie_id"] = test_df["movie_id"].astype('str')

In [7]:
def df2_item_dict(df):
    df_group = df.groupby("user_id").agg({"movie_id": lambda x: list(x.unique())})
    df_group.columns = ["movie_list"]
    data_dict = df_group.to_dict(orient='index')
    data_dict = {k: v["movie_list"] for k, v in data_dict.items()}
    return data_dict

In [8]:
train_dict = df2_item_dict(train_df)
test_dict = df2_item_dict(test_df)

In [9]:
texts = list(train_dict.values())
texts[:1]

[['3408',
  '2804',
  '720',
  '1270',
  '2340',
  '1721',
  '3186',
  '1836',
  '1022',
  '260',
  '1207',
  '914',
  '919',
  '938',
  '1035',
  '2018',
  '3105',
  '1097',
  '150',
  '1961',
  '1962',
  '2692',
  '1028',
  '2028',
  '608',
  '661',
  '1197',
  '1287',
  '594',
  '2398',
  '2918',
  '2791',
  '2797',
  '2321',
  '2762',
  '1029',
  '531',
  '3114',
  '1246',
  '2355',
  '595',
  '2687',
  '527',
  '48',
  '1545',
  '745',
  '2294']]

## 训练数据

In [10]:
model = word2vec.Word2Vec(texts,min_count=1, workers=4,size = 64)

In [11]:
model.most_similar('745',topn=5)

  """Entry point for launching an IPython kernel.


[('1280', 0.8529601097106934),
 ('1411', 0.8438481688499451),
 ('1288', 0.825940728187561),
 ('1277', 0.8251876831054688),
 ('741', 0.8242132663726807)]

In [15]:
from operator import itemgetter
def GetRecommendation(user,n):
    seen = train_dict[user]
    recom_list = []
    for item in seen:
        if item not in model:
            continue
        recom_list.extend(model.most_similar(item, topn=n))
    recom_list = [item for item in recom_list if item[0] not in seen]
    recom_list = sorted(recom_list,key=itemgetter(1), reverse=True)
    recom = recom_list[:5]
    return recom

## 测评指标
1. Recall
2. Precision
3. Coverage
4. Popularity

In [13]:
class Eval():
    def __init__(self, train, test, GetRecommendation,N):
        self.train = train
        self.test = test
        self.GetRecommendation = GetRecommendation
        self.N =N

    def Recall(self):
        hit = 0
        all = 0
        for user,items in self.test.items():
            tu = items
            rank = self.GetRecommendation(user, self.N)
            for item,pui in rank:
                if item in tu:
                    hit += 1
            all += len(tu)
        return round(hit / (all * 1.0), 2)

    def Precision(self):
        hit = 0
        all = 0
        for user in self.test.keys():
            tu = set(self.test[user])
            rank = self.GetRecommendation(user, self.N)
            for item,pui in rank:
                if item in tu:
                    hit += 1
            all += self.N
        return round(hit / (all * 1.0), 2)
    
    def Coverage(self):
        recommend_items = set()
        all_items = set()
        for user in self.test.keys():
            for item in self.train[user]:
                all_items.add(item)
            rank = self.GetRecommendation(user, self.N)
            for item,pui in rank:
                recommend_items.add(item)
        return round(len(recommend_items) / (len(all_items) * 1.0),2)

    def Popularity(self):
        item_pop = dict()
        for user, items in self.train.items():
            for item in items:
                if item not in item_pop:
                    item_pop[item] = 0
                else:
                    item_pop[item] += 1
        ret = 0
        n = 0
        for user in self.test.keys():
            rank =self. GetRecommendation(user, self.N)
            for item,pui in rank:
                ret += math.log(1 + item_pop[item])
                n += 1
        ret /= n * 1.0
        return round(ret,2)

    def eval(self):
        metric = {'Precision': self.Precision(),
                  'Recall': self.Recall(),
                  'Coverage': self.Coverage(),
                  'Popularity': self.Popularity()}
        print('Metric:', metric)
        return metric

## 实验 

In [None]:
N=5
eval_ = Eval(train_dict,test_dict,GetRecommendation,N)
metric = eval_.eval()

  
  
