In [1]:
import os
import random

import numpy as np
import pandas as pd

In [2]:
# Location of the steam-200K CSV. Set the STEAM_200K_CSV environment variable to
# point at a local copy of the file; when it is unset, the original hard-coded
# location is used so existing setups keep working unchanged.
path = os.environ.get(
    'STEAM_200K_CSV',
    'C:/Users/18280/Desktop/RS/Recommended_System-master/L2/steam_video_games/steam-200K.csv',
)
# header=None: the first line of the file is read as data, so the five column
# names are supplied explicitly here.
df = pd.read_csv(
    path,
    header=None,
    names=['UserID', 'Game', 'Action', 'Hours', 'Not Needed'],
)
df.head()

Unnamed: 0,UserID,Game,Action,Hours,Not Needed
0,151603712,The Elder Scrolls V Skyrim,purchase,1.0,0
1,151603712,The Elder Scrolls V Skyrim,play,273.0,0
2,151603712,Fallout 4,purchase,1.0,0
3,151603712,Fallout 4,play,87.0,0
4,151603712,Spore,purchase,1.0,0


In [3]:
# Print a label line followed by the (rows, columns) shape of the loaded frame.
print('显示数据大小', df.shape, sep='\n')

显示数据大小
(200000, 5)


In [4]:
# Build Hours_Played as a single column standing in for Action + Hours:
# 0 means the game was only purchased, a value above 0 is the time played.
df['Hours_Played'] = df['Hours'].astype('float32')
# Rows with Action == 'purchase' and Hours == 1 are purchase records, so their
# Hours_Played is reset to 0.
purchase_only = df['Action'].eq('purchase') & df['Hours'].eq(1)
df.loc[purchase_only, 'Hours_Played'] = 0
# Show the first three values and the new frame shape as a quick sanity check.
print(df['Hours_Played'].head(3))
print(df.shape)

0      0.0
1    273.0
2      0.0
Name: Hours_Played, dtype: float32
(200000, 6)


In [8]:
# Sort the rows in ascending order by user, then game, then hours played.
# Each row keeps its original index label, so the index is no longer in order.
#
# The ID column is cast to an explicit 64-bit integer: the bare 'int' alias
# resolves to the platform C long, which is only 32 bits wide on Windows with
# NumPy < 2, making the column width depend on where the notebook runs.
df['UserID'] = df['UserID'].astype('int64')
df = df.sort_values(['UserID', 'Game', 'Hours_Played'], ascending=True)
df.head()

Unnamed: 0,UserID,Game,Action,Hours,Not Needed,Hours_Played
65429,5250,Alien Swarm,purchase,1.0,0,0.0
65430,5250,Alien Swarm,play,4.9,0,4.9
65423,5250,Cities Skylines,purchase,1.0,0,0.0
65424,5250,Cities Skylines,play,144.0,0,144.0
65435,5250,Counter-Strike,purchase,1.0,0,0.0


In [9]:
# Collapse every (UserID, Game) pair to one row, keeping the last occurrence.
# Per the original note, the last row of a pair is the play-time record and the
# first is the purchase record; this depends on the row order df has here.
# The Action, Hours and Not Needed columns are no longer used and are dropped.
clean_df = (
    df
    .drop_duplicates(subset=['UserID', 'Game'], keep='last')
    .drop(columns=['Action', 'Hours', 'Not Needed'])
)
print('删除重复项后的数据集为：')
print(clean_df)
# Zero-row slice: prints only the remaining column names.
print(clean_df.head(0))

删除重复项后的数据集为：
           UserID                                        Game  Hours_Played
65430        5250                                 Alien Swarm           4.9
65424        5250                             Cities Skylines         144.0
65435        5250                              Counter-Strike           0.0
65436        5250                       Counter-Strike Source           0.0
65437        5250                               Day of Defeat           0.0
65438        5250                          Deathmatch Classic           0.0
65426        5250                    Deus Ex Human Revolution          62.0
65434        5250                                      Dota 2           0.2
65439        5250                                   Half-Life           0.0
65440        5250                                 Half-Life 2           0.0
65441        5250                      Half-Life 2 Deathmatch           0.0
65442        5250                     Half-Life 2 Episode One           0.0

In [10]:
# Display an empty (zero-row) slice so that only the column headers are shown.
clean_df.iloc[:0]

Unnamed: 0,UserID,Game,Hours_Played


In [16]:
# Count the distinct users and distinct games in the de-duplicated data.
# dropna=False keeps a missing value counted as its own entry, matching what
# taking the length of .unique() would report.
n_user = clean_df['UserID'].nunique(dropna=False)
n_games = clean_df['Game'].nunique(dropna=False)
print('用户数量:%d,游戏个数为:%d'%(n_user,n_games))

用户数量:12393,游戏个数为:5155


In [17]:
# Matrix sparsity, step 1: the number of rows in clean_df.
len(clean_df)

128804

In [19]:
# Fill ratio of the user-by-game matrix: rows present in clean_df divided by
# the total number of user/game combinations. Despite the variable name, this
# is the filled share, as the printed label states.
sparsity = len(clean_df) / float(n_user * n_games)
print('用户行为矩阵的稀疏性（填充比例）为{:.2%} '.format(sparsity))

用户行为矩阵的稀疏性（填充比例）为0.20% 
