## Load the artur steam dataset

This can be skipped if you want to expand the existing bloom dataset without the old userids.

In [1]:
from pandas import DataFrame
import pickle


with open("data/full_user_data.pkl", "rb") as dataPickle:
    data: DataFrame = pickle.load(dataPickle, fix_imports=True)

## Augmenting the dataset

This block makes it simple to collect additional data without the need for constant supervision. Re-run as many times as needed to collect a ridiculous amount of data.

First the augmented dataset is loaded if possible

In [2]:
from collections import deque

apiKey = "5C9F6A3AB40F423F4BF70D5C055749FA"

try:
    with open("data/bloomData.pkl", "rb") as dataPickle:
        data2: DataFrame = pickle.load(dataPickle, fix_imports=True)
except:
    data2 = DataFrame()


All users from Artur's dataset not yet in the augmented dataset is added to a processing queue. And each user's friend is added to the queue if it is not processed yet

In [3]:
processedIDs = set[str]([id for id in data2.index])

unprocessedFriends \
    = set([i for i in data.index if i not in processedIDs]) |\
    set[str]([f["steamid"] for list in data2["friendsList"] for f in list if f["steamid"] not in processedIDs])

processingQueue = deque[str](unprocessedFriends)

bloomsteamID = "76561198126594781" # Submit myself into the dataset because it might be interesting
if(bloomsteamID not in processingQueue):
    processingQueue.appendleft(bloomsteamID)

batchesDone = 0


We query the steam web api to collect the data and append them into the bloom dataset.

In [4]:
import pandas
from gather_user_data import (
    get_friend_list,
    get_player_bans,
    get_player_summaries,
    get_recent_playtime,
    get_user_group_list,
    get_owned_games
)

while len(processingQueue) > 0:
    processingBatches = list[set[str]]()
    while len(processingQueue) > 0:
        batch = set[str]()

        while len(batch) < 100 and len(processingQueue) > 0:
            nextItem = processingQueue.popleft()
            if nextItem in processedIDs or not isinstance(nextItem, str):
                continue

            batch.add(nextItem)
            processedIDs.add(nextItem)

        if len(batch) > 0:
            processingBatches.append(batch)

    for batch in processingBatches:
        batchIDStr = ",".join([ID for ID in batch])

        summaries = get_player_summaries(apiKey, batchIDStr)
        banInfo = get_player_bans(apiKey, batchIDStr)

        summaries = {s["steamid"]: s for s in summaries}

        # Combine ban info into same dict
        for b in banInfo:
            steamID = b["SteamId"]
            if(steamID not in summaries):
                print(f"ID {steamID} somehow not in fetched summaries... skipping")
                batch.remove(steamID)
                continue

            summaries[steamID] = summaries[steamID] | b

        for steamid in batch:
            friends = get_friend_list(apiKey, steamid, "all")
            ownedGames = get_owned_games(apiKey, steamid)
            userGroups = get_user_group_list(apiKey, steamid)
            recentlyPlayed = get_recent_playtime(apiKey, steamid)

            summaries[steamid]["friendsList"] = friends
            summaries[steamid]["ownedGames"] = ownedGames
            summaries[steamid]["groups"] = userGroups
            summaries[steamid]["recentlyPlayed"] = recentlyPlayed

            for f in [f for f in friends if f["steamid"] not in processedIDs]:
                processingQueue.append(f["steamid"])


        df = DataFrame.from_dict(summaries, orient="index")

        data2: DataFrame = pandas.concat([data2, df]) # type: ignore
        with open("data/bloomData.pkl", "wb") as blmdataPickle:
            pickle.dump(data2, blmdataPickle)
        
        batchesDone += 1

ID 76561197991670176 somehow not in fetched summaries... skipping
ID 76561198950503479 somehow not in fetched summaries... skipping


JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [None]:
with open("data/bloomData.pkl", "wb") as blmdataPickle:
    pickle.dump(data2, blmdataPickle)

In [None]:
"76561199016387141" in data2.index

True

In [None]:
data2.loc["76561199016387141"]

steamid                                                     76561199016387141
communityvisibilitystate                                                    3
personaname                                                           leandro
profileurl                  https://steamcommunity.com/profiles/7656119901...
avatar                      https://avatars.steamstatic.com/fef49e7fa7e199...
avatarmedium                https://avatars.steamstatic.com/fef49e7fa7e199...
avatarfull                  https://avatars.steamstatic.com/fef49e7fa7e199...
avatarhash                           fef49e7fa7e1997310d705b2a6158ff8dc1cdfeb
personastate                                                                0
primaryclanid                                              103582791429521408
timecreated                                                      1578246211.0
personastateflags                                                         0.0
SteamId                                                     7656