In [2]:
import pandas as pd
import numpy as np

In [3]:
social_data = pd.read_csv('Gowalla_edges.txt', sep="\t", header=None, names=['u', 'v'])
social_data

Unnamed: 0,u,v
0,0,1
1,0,2
2,0,3
3,0,4
4,0,5
...,...,...
1900649,196586,196539
1900650,196587,196540
1900651,196588,196540
1900652,196589,196547


In [4]:
social_data['u'].nunique()

196591

In [5]:
# 描述性统计
social_data.describe(include='all')

Unnamed: 0,u,v
count,1900654.0,1900654.0
mean,51007.74,51007.74
std,50105.27,50105.27
min,0.0,0.0
25%,7399.0,7399.0
50%,37340.0,37340.0
75%,82120.0,82120.0
max,196590.0,196590.0


In [6]:
#用户打卡数据
check_data = pd.read_csv('Gowalla_totalCheckins.txt', sep='\t', header=None
                         ,names=['u','time','x','y','p'])
check_data.head()

Unnamed: 0,u,time,x,y,p
0,0,2010-10-19T23:55:27Z,30.235909,-97.79514,22847
1,0,2010-10-18T22:17:43Z,30.269103,-97.749395,420315
2,0,2010-10-17T23:42:03Z,30.255731,-97.763386,316637
3,0,2010-10-17T19:26:05Z,30.263418,-97.757597,16516
4,0,2010-10-16T18:50:42Z,30.274292,-97.740523,5535878


In [7]:
#打卡数据描述性统计
check_data.describe(include='all')

Unnamed: 0,u,time,x,y,p
count,6442892.0,6442892,6442892.0,6442892.0,6442892.0
unique,,5561957,,,
top,,2010-10-08T17:50:58Z,,,
freq,,8,,,
mean,60436.42,,40.52177,-47.44338,725716.1
std,54275.04,,14.76714,66.3613,950135.9
min,0.0,,-90.0,-176.3086,8904.0
25%,11048.0,,33.40766,-97.67548,112533.0
50%,42295.0,,39.88993,-78.06955,424940.5
75%,107157.0,,51.25089,11.13241,977545.0


In [8]:
#社交数据重复情况
social_data.duplicated().sum()

0

In [9]:
#打卡数据重复情况
check_data.duplicated().sum()

601

In [10]:
#删除重复值
check_data.drop_duplicates(inplace=True)

In [11]:
#打卡数据重复情况
check_data.duplicated().sum()

0

In [12]:
#time由文本转换为时间类型
from datetime import datetime

check_data['time'] = check_data['time'].apply(
    lambda x: datetime.strptime(x[:10], '%Y-%m-%d')
)
check_data

Unnamed: 0,u,time,x,y,p
0,0,2010-10-19,30.235909,-97.795140,22847
1,0,2010-10-18,30.269103,-97.749395,420315
2,0,2010-10-17,30.255731,-97.763386,316637
3,0,2010-10-17,30.263418,-97.757597,16516
4,0,2010-10-16,30.274292,-97.740523,5535878
...,...,...,...,...,...
6442887,196578,2010-06-11,51.742988,-0.488065,906885
6442888,196578,2010-06-11,51.746492,-0.490780,965121
6442889,196578,2010-06-11,51.741916,-0.496729,1174322
6442890,196585,2010-10-08,50.105516,8.571525,471724


保存处理后的打卡数据

In [13]:
#通过适用pickle可以将对象转换为字节流，可以保持对象的完整性和跨平台兼容性，性能也很高效
#保存处理好的对象
import pickle
with open('check_data.pkl','wb') as f:
    pickle.dump(check_data,f)

In [14]:
with open('check_data.pkl','rb') as f:
    check_data=pickle.load(f)

In [15]:
check_data

Unnamed: 0,u,time,x,y,p
0,0,2010-10-19,30.235909,-97.795140,22847
1,0,2010-10-18,30.269103,-97.749395,420315
2,0,2010-10-17,30.255731,-97.763386,316637
3,0,2010-10-17,30.263418,-97.757597,16516
4,0,2010-10-16,30.274292,-97.740523,5535878
...,...,...,...,...,...
6442887,196578,2010-06-11,51.742988,-0.488065,906885
6442888,196578,2010-06-11,51.746492,-0.490780,965121
6442889,196578,2010-06-11,51.741916,-0.496729,1174322
6442890,196585,2010-10-08,50.105516,8.571525,471724


## 数据建模与可视化
1、时间上下文相关的UserCF算法


1：找到目标用户u的k个最相似用户

In [16]:
#定义用户之间相似度计算方式
import math

def user_similarity_social(u, v):
    #用户u、v的好友集合
    data_u = set(social_data.query('u == @u')['v'])
    data_v = set(social_data.query('u == @v')['v'])

    return len(data_u & data_v) / math.sqrt(len(data_u) * len(data_v))

In [17]:
#用户u的K个相似用户
def get_uk(u, K=10):

    #用户y的打卡地点集合
    u_p = set(check_data.query('u == @u')['p'])
    #跟用户u在同一地点打过卡的用户集合
    p_v = set(check_data.query('p in @u_p')['u'])

    #计算相似度
    w_df = pd.DataFrame(columns=['w'])
    for v in p_v:
        w_df.loc[v] = user_similarity_social(u, v)
    #排序取前K个，排除用户u
    return w_df['w'].sort_values(ascending=False)[1:K+1]

In [18]:
%%time
#测试
uk = get_uk(2)
uk

CPU times: total: 4min 26s
Wall time: 2min 52s


1779    0.215577
2039    0.203101
1174    0.197672
742     0.175281
590     0.171830
2292    0.170600
111     0.169438
1060    0.164228
1517    0.162230
1279    0.159814
Name: w, dtype: float64

In [19]:
uk

1779    0.215577
2039    0.203101
1174    0.197672
742     0.175281
590     0.171830
2292    0.170600
111     0.169438
1060    0.164228
1517    0.162230
1279    0.159814
Name: w, dtype: float64

In [25]:
from tqdm import tqdm

#为用户u推荐n个地点

def recommend_u(u, n=10, K=10):
    """
    """
    """
    part1 获取初步推荐列表
    """
    #K个相似用户的打卡地点
    uk = get_uk(u, K)
    v_p = set(check_data.query('u in @uk.index')['p'])
    #排除用户u打过卡的地点
    p_set = v_p - set(check_data.query('u == @u')['p'])
    """
    part2 对推荐列表排序，得到最终推荐列表
    """
    #将t0设置为2010年11月1日
    t0 = datetime.strptime('2010-11-01','%Y-%m-%d')
    #设置两个索引列，用来标记日期？
    vi_time = check_data.set_index(['u','p'])['time']

    #基于用户相似度和打卡时间计算用户u对地点的兴趣
    df = pd.DataFrame(columns=['interest'])
    alpha = 0.01

    #遍历打卡地点
    for i in tqdm(p_set):
        interest = 0
        #遍历相似用户
        for v in uk.index:
            w = uk.loc[v]
            try:
                #用户v在打卡地点i的所有打卡时间
                t_list = pd.Series(vi_time[(v,i)])
                #遍历打卡时间
                for t in t_list:
                    #预测用户对地点的兴趣度公式
                    interest += w / (1 + alpha * ((t0-t).days))
            except Exception as e:
                pass
        df.loc[i] = interest

    return df['interest'].sort_values(ascending=False)[:n]

用户兴趣度公式$$\large p(u,i)=\sum_{v\in S(u,k)}w_{uv}r_{vi}\frac{1}{1+\alpha(t_0-t_{vi})}$$

In [21]:
from tqdm import tqdm
import time

# for i in tqdm(range(10)):
#     time.sleep(1)
pbar = tqdm(list(range(10)))
for i in pbar:
    time.sleep(1)
    pbar.set_description('处理进度')

处理进度: 100%|██████████| 10/10 [00:10<00:00,  1.01s/it]


In [22]:
social_data['u']

0               0
1               0
2               0
3               0
4               0
            ...  
1900649    196586
1900650    196587
1900651    196588
1900652    196589
1900653    196590
Name: u, Length: 1900654, dtype: int64

In [26]:
%%time
recommend_u(2,10,10)

  t_list = pd.Series(vi_time[(v,i)])
  t_list = pd.Series(vi_time[(v,i)])
  t_list = pd.Series(vi_time[(v,i)])
  t_list = pd.Series(vi_time[(v,i)])
  t_list = pd.Series(vi_time[(v,i)])
  t_list = pd.Series(vi_time[(v,i)])
  t_list = pd.Series(vi_time[(v,i)])
  t_list = pd.Series(vi_time[(v,i)])
  t_list = pd.Series(vi_time[(v,i)])
  t_list = pd.Series(vi_time[(v,i)])
  t_list = pd.Series(vi_time[(v,i)])
  t_list = pd.Series(vi_time[(v,i)])
  t_list = pd.Series(vi_time[(v,i)])
  t_list = pd.Series(vi_time[(v,i)])
  t_list = pd.Series(vi_time[(v,i)])
  t_list = pd.Series(vi_time[(v,i)])
  t_list = pd.Series(vi_time[(v,i)])
  t_list = pd.Series(vi_time[(v,i)])
  t_list = pd.Series(vi_time[(v,i)])
  t_list = pd.Series(vi_time[(v,i)])
  t_list = pd.Series(vi_time[(v,i)])
  t_list = pd.Series(vi_time[(v,i)])
  t_list = pd.Series(vi_time[(v,i)])
  t_list = pd.Series(vi_time[(v,i)])
  t_list = pd.Series(vi_time[(v,i)])
  t_list = pd.Series(vi_time[(v,i)])
  t_list = pd.Series(vi_time[(v,i)])
 

CPU times: total: 5min 37s
Wall time: 3min 49s





768391    2.371898
260957    1.870215
864730    1.851068
132310    1.798772
563409    1.761171
510965    1.597069
55566     1.397757
19542     1.389170
23261     1.209592
194347    1.167814
Name: interest, dtype: float64

2.基于图的方法

In [28]:
# 新建用户打卡的小数据集
data = pd.DataFrame(columns=['u','p'])
data['u'] = [0,0,0,1,1,1,2,2,3,3,3,4,4]
data['p'] = [1,2,4,1,2,3,3,5,1,4,5,2,6]
data


Unnamed: 0,u,p
0,0,1
1,0,2
2,0,4
3,1,1
4,1,2
5,1,3
6,2,3
7,2,5
8,3,1
9,3,4


In [29]:
# 为用户u和地点p的id添加标识
data['u'] = data['u'].apply(lambda x: 'u' + str(x))
data['p'] = data['p'].apply(lambda x: 'p' + str(x))
data


Unnamed: 0,u,p
0,u0,p1
1,u0,p2
2,u0,p4
3,u1,p1
4,u1,p2
5,u1,p3
6,u2,p3
7,u2,p5
8,u3,p1
9,u3,p4


In [32]:
import tensorflow

ModuleNotFoundError: No module named 'tensorflow'