# Compute the Steam 5-core

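This notebook computes the 5-core of the Steam data set: users and items with fewer than 5 records are removed repeatedly until every remaining user and item has at least 5, and the pruned data is saved.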

In [2]:
# Minimum number of records a user or an item must have to be kept by the
# pruning loop below (the "k" of the k-core).
core_k = 5

In [1]:
import pandas as pd

## Load the Data

First we will load the input data:

In [20]:
# Read the raw user-item records; every later cell works from `steam_ui`.
steam_ui = pd.read_csv('data/steam-video-game/steam-video-game.csv')
# Print the column dtypes, row count and memory footprint as a sanity check.
steam_ui.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5094082 entries, 0 to 5094081
Data columns (total 2 columns):
user_id    object
item_id    int64
dtypes: int64(1), object(1)
memory usage: 77.7+ MB


In [21]:
# Per-user record counts, kept as a one-column frame named 'count' so it can
# be joined back onto `steam_ui` by user_id.
user_counts = steam_ui.groupby('user_id')['item_id'].count().to_frame('count')
user_counts.describe()

Unnamed: 0,count
count,70912.0
mean,71.836671
std,132.366763
min,1.0
25%,14.0
50%,40.0
75%,87.0
max,7762.0


In [22]:
# Per-item record counts, kept as a one-column frame named 'count' so it can
# be joined back onto `steam_ui` by item_id.
item_counts = steam_ui.groupby('item_id')['user_id'].count().to_frame('count')
item_counts.describe()

Unnamed: 0,count
count,10978.0
mean,464.026416
std,1793.593696
min,1.0
25%,9.0
50%,43.0
75%,220.0
max,49136.0


In [23]:
# Number of records before pruning.
steam_ui.shape[0]

5094082

## Compute the 5-core

In [24]:
# Prune `steam_ui` down to its k-core: alternately drop users and items that
# have fewer than `core_k` records until none are left.
#
# Both count tables are rebuilt from the current `steam_ui` first, so this cell
# does not rely on `user_counts` / `item_counts` from earlier cells still
# matching the frame (e.g. after an interrupted or repeated run).
user_counts = steam_ui.groupby('user_id')['item_id'].agg(['count'])
item_counts = steam_ui.groupby('item_id')['user_id'].agg(['count'])

while item_counts['count'].min() < core_k or user_counts['count'].min() < core_k:
    # Drop every record whose user has too few records.
    print('pruning users (keeping {} of {})'.format((user_counts['count'] >= core_k).sum(), len(user_counts)))
    steam_j = steam_ui.join(user_counts, on='user_id')
    steam_ui = steam_j.loc[steam_j['count'] >= core_k, ['user_id', 'item_id']]

    # Removing users lowers item counts, so recount items before pruning them.
    item_counts = steam_ui.groupby('item_id')['user_id'].agg(['count'])
    print('pruning items (keeping {} of {})'.format((item_counts['count'] >= core_k).sum(), len(item_counts)))
    steam_j = steam_ui.join(item_counts, on='item_id')
    steam_ui = steam_j.loc[steam_j['count'] >= core_k, ['user_id', 'item_id']]

    # Removing items lowers user counts; the refreshed user counts decide
    # whether another pass is needed.
    user_counts = steam_ui.groupby('user_id')['item_id'].agg(['count'])
    # Dropping an item removes only that item's own records, so the surviving
    # items keep the counts computed above; filtering is enough here.
    item_counts = item_counts[item_counts['count'] >= core_k]

# Post-condition of the loop: nothing below the threshold remains
# (trivially true if everything was pruned away).
assert (user_counts['count'] >= core_k).all(), 'user below core_k after pruning'
assert (item_counts['count'] >= core_k).all(), 'item below core_k after pruning'

pruning users (keeping 62936 of 70912)
pruning items (keeping 9192 of 10977)


In [25]:
# Summary of the per-item counts left by the pruning loop; the loop only exits
# once the minimum is no longer below core_k.
item_counts.describe()

Unnamed: 0,count
count,9192.0
mean,551.94158
std,1924.6755
min,5.0
25%,21.0
50%,71.0
75%,300.0
max,45261.0


In [26]:
# Summary of the per-user counts left by the pruning loop; the loop only exits
# once the minimum is no longer below core_k.
user_counts.describe()

Unnamed: 0,count
count,62936.0
mean,80.612797
std,137.019783
min,5.0
25%,21.0
50%,48.0
75%,96.0
max,7334.0


How many records are left?

In [27]:
# Number of records that survived pruning.
steam_ui.shape[0]

5073447

In [28]:
# Write the pruned records to disk; index=False leaves the row index out of
# the CSV so only the data columns are written.
steam_ui.to_csv('data/steam-video-game/steam-pruned.csv', index=False)