In [6]:
import pandas as pd
import re
from dateutil import parser
import numpy as np
import pprint
from sentence_transformers import SentenceTransformer
import json
from tqdm import tqdm
import opendatasets as od

# import ast
from dotenv import load_dotenv
import os

# Load variables from a .env file (python-dotenv) before reading them.
load_dotenv()

# Milvus connection settings and target names; each is None when the
# corresponding environment variable is not set.
milvus_uri = os.environ.get("MILVUS_URI")
token = os.environ.get("MILVUS_TOKEN")
COLLECTION_NAME = os.environ.get("COLLECTION_NAME")
PARTITION_NAME = os.environ.get("PARTITION_NAME")

In [5]:
import pandas

# Download the Steam games dataset from Kaggle; the call prompts for Kaggle
# credentials and saves the files under ./steam-games-dataset.
STEAM_DATASET_URL = "https://www.kaggle.com/datasets/fronkongames/steam-games-dataset"
od.download(STEAM_DATASET_URL)

Please provide your Kaggle credentials to download this dataset. Learn more: http://bit.ly/kaggle-creds
Your Kaggle username:Your Kaggle Key:Downloading steam-games-dataset.zip to ./steam-games-dataset


100%|██████████| 187M/187M [00:39<00:00, 4.89MB/s] 





In [15]:
# Read the downloaded CSV into a DataFrame and preview the first five rows.
file = "steam-games-dataset/games.csv"
df_loaded = pd.read_csv(filepath_or_buffer=file)
df_loaded.head(n=5)

Unnamed: 0,AppID,Name,Release date,Estimated owners,Peak CCU,Required age,Price,DLC count,About the game,Supported languages,...,Average playtime two weeks,Median playtime forever,Median playtime two weeks,Developers,Publishers,Categories,Genres,Tags,Screenshots,Movies
0,20200,Galactic Bowling,"Oct 21, 2008",0 - 20000,0,0,19.99,0,Galactic Bowling is an exaggerated and stylize...,['English'],...,0,0,0,Perpetual FX Creative,Perpetual FX Creative,"Single-player,Multi-player,Steam Achievements,...","Casual,Indie,Sports","Indie,Casual,Sports,Bowling",https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...
1,655370,Train Bandit,"Oct 12, 2017",0 - 20000,0,0,0.99,0,THE LAW!! Looks to be a showdown atop a train....,"['English', 'French', 'Italian', 'German', 'Sp...",...,0,0,0,Rusty Moyher,Wild Rooster,"Single-player,Steam Achievements,Full controll...","Action,Indie","Indie,Action,Pixel Graphics,2D,Retro,Arcade,Sc...",https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...
2,1732930,Jolt Project,"Nov 17, 2021",0 - 20000,0,0,4.99,0,Jolt Project: The army now has a new robotics ...,"['English', 'Portuguese - Brazil']",...,0,0,0,Campião Games,Campião Games,Single-player,"Action,Adventure,Indie,Strategy",,https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...
3,1355720,Henosis™,"Jul 23, 2020",0 - 20000,0,0,5.99,0,HENOSIS™ is a mysterious 2D Platform Puzzler w...,"['English', 'French', 'Italian', 'German', 'Sp...",...,0,0,0,Odd Critter Games,Odd Critter Games,"Single-player,Full controller support","Adventure,Casual,Indie","2D Platformer,Atmospheric,Surreal,Mystery,Puzz...",https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...
4,1139950,Two Weeks in Painland,"Feb 3, 2020",0 - 20000,0,0,0.0,0,ABOUT THE GAME Play as a hacker who has arrang...,"['English', 'Spanish - Spain']",...,0,0,0,Unusual Games,Unusual Games,"Single-player,Steam Achievements","Adventure,Indie","Indie,Adventure,Nudity,Violent,Sexual Content,...",https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...


In [16]:
# List every column of the raw dataset so the relevant ones can be picked.
df_loaded.columns

Index(['AppID', 'Name', 'Release date', 'Estimated owners', 'Peak CCU',
       'Required age', 'Price', 'DLC count', 'About the game',
       'Supported languages', 'Full audio languages', 'Reviews',
       'Header image', 'Website', 'Support url', 'Support email', 'Windows',
       'Mac', 'Linux', 'Metacritic score', 'Metacritic url', 'User score',
       'Positive', 'Negative', 'Score rank', 'Achievements', 'Recommendations',
       'Notes', 'Average playtime forever', 'Average playtime two weeks',
       'Median playtime forever', 'Median playtime two weeks', 'Developers',
       'Publishers', 'Categories', 'Genres', 'Tags', 'Screenshots', 'Movies'],
      dtype='object')

Let's simplify the process and limit our data usage by keeping only the columns we need.

In [30]:
# Columns used to describe a game and to filter the dataset
selected_columns = [
    "Name",
    "Price",
    "Release date",
    "Metacritic score",
    "Estimated owners",
    "Average playtime forever",
    "About the game",
    "Tags",
]

# Keep only those columns and drop every row with a missing value in any of them
df = df_loaded.loc[:, selected_columns].dropna(how="any")

df.head()

Unnamed: 0,Name,Price,Release date,Metacritic score,Estimated owners,Average playtime forever,About the game,Tags
0,Galactic Bowling,19.99,"Oct 21, 2008",0,0 - 20000,0,Galactic Bowling is an exaggerated and stylize...,"Indie,Casual,Sports,Bowling"
1,Train Bandit,0.99,"Oct 12, 2017",0,0 - 20000,0,THE LAW!! Looks to be a showdown atop a train....,"Indie,Action,Pixel Graphics,2D,Retro,Arcade,Sc..."
3,Henosis™,5.99,"Jul 23, 2020",0,0 - 20000,0,HENOSIS™ is a mysterious 2D Platform Puzzler w...,"2D Platformer,Atmospheric,Surreal,Mystery,Puzz..."
4,Two Weeks in Painland,0.0,"Feb 3, 2020",0,0 - 20000,0,ABOUT THE GAME Play as a hacker who has arrang...,"Indie,Adventure,Nudity,Violent,Sexual Content,..."
5,Wartune Reborn,0.0,"Feb 26, 2021",0,50000 - 100000,0,Feel tired of auto-fight? Feel tired of boring...,"Turn-Based Combat,Massively Multiplayer,Multip..."


In [31]:
# Cast the two columns used for filtering to plain integers in one call.
df = df.astype({"Metacritic score": int, "Average playtime forever": int})

In [32]:
# Display the (rows, columns) size of the selected data.
df.shape

(63923, 8)

Let's reduce our dataset by keeping only games that have a Metacritic score and a non-zero average playtime.

In [33]:
# Keep only games with both a positive Metacritic score and recorded playtime.
has_score = df["Metacritic score"] > 0
has_playtime = df["Average playtime forever"] > 0
df = df.loc[has_score & has_playtime]
df.shape

(3207, 8)

In [35]:
# Convert every non-null cell to its string form (NaN cells are skipped by
# na_action="ignore"), then replace the remaining missing values with "UNK".
# The previous lambda used len(str(x)), which overwrote each value with its
# character count instead of converting it to a string.
df = df.map(str, na_action="ignore").fillna("UNK")

In [45]:
from pymilvus import (
    connections,
    Collection,
    FieldSchema,
    CollectionSchema,
    DataType,
    utility,
)

# Open the "default" Milvus connection with the configured URI and token.
connections.connect(alias="default", uri=milvus_uri, token=token)
print("Connected!")

Connected!


In [40]:
# NOTE: this hard-coded name replaces any COLLECTION_NAME read from the environment.
COLLECTION_NAME = "games_vectors"

# Attach to the collection only when it already exists on the server:
# Collection(name=...) without a schema raises for an unknown collection,
# which would stop a fresh run before the collection is created.
collection = (
    Collection(name=COLLECTION_NAME)
    if utility.has_collection(COLLECTION_NAME)
    else None
)

In [42]:
# Show the columns currently held in df.
df.columns

Index(['Name', 'Price', 'Release date', 'Metacritic score', 'Estimated owners',
       'Average playtime forever', 'About the game', 'Tags'],
      dtype='object')

In [77]:
# Size of the stored vectors; it must match the output size of the model
# that produces the embeddings.
EMBEDDING_DIM = 384

# Define the fields. The names avoid `id`, which would shadow the builtin
# for every later cell of the notebook.
name_field = FieldSchema(
    name="name", dtype=DataType.VARCHAR, max_length=500, is_primary=True
)
embedding_field = FieldSchema(
    name="embedding", dtype=DataType.FLOAT_VECTOR, dim=EMBEDDING_DIM
)

# Keys that are not declared above are accepted as dynamic fields.
schema = CollectionSchema(
    fields=[name_field, embedding_field],
    description="steam games recommender: game vectors",
    enable_dynamic_field=True,
)

if utility.has_collection(COLLECTION_NAME):  # drop the same collection created before
    collection = Collection(COLLECTION_NAME)
    collection.drop()

collection = Collection(name=COLLECTION_NAME, schema=schema)
print("Collection created.")

# Index settings for the vector field; searches must use the same metric.
index_params = {
    "index_type": "IVF_FLAT",
    "metric_type": "L2",
    "params": {"nlist": 128},
}

collection.create_index(field_name="embedding", index_params=index_params)
collection.load()

print("Collection indexed!")

Collection created.
Collection indexed!


In [93]:
# NaN values are not accepted on insert, so swap them for a placeholder
# before exporting one dict per row.
df_without_nan = df.replace(np.nan, "Unknown")
games_dict = df_without_nan.to_dict(orient="records")

In [43]:
# Show the columns available to the embedding step.
df.columns

Index(['Name', 'Price', 'Release date', 'Metacritic score', 'Estimated owners',
       'Average playtime forever', 'About the game', 'Tags'],
      dtype='object')

In [44]:
# Sentence-embedding model shared by embed_game and embed_search.
transformer = SentenceTransformer("all-MiniLM-L6-v2")


def embed_game(data):
    """Embed a one-paragraph textual description of a game.

    Args:
        data: Mapping with the keys "Name", "Release date", "Tags", "Price"
            (the lowercase "price" is accepted as a fallback),
            "Metacritic score", "Average playtime forever" and
            "Estimated owners".

    Returns:
        The value returned by ``transformer.encode`` for the description.

    Raises:
        KeyError: If one of the required keys is missing from ``data``.
    """
    # The dataset column is "Price"; the previous lookup of "price" raised
    # KeyError for every record built from the DataFrame.
    price = data["Price"] if "Price" in data else data["price"]

    # This sentence is the model input, so typos in it end up in the vectors.
    description = (
        "{} Released on {}. Genres are {}. The price is {}. "
        "Its Metacritic score is {}. The average playtime is {}. "
        "Its estimated owners are {}."
    ).format(
        data["Name"],
        data["Release date"],
        data["Tags"],
        price,
        data["Metacritic score"],
        data["Average playtime forever"],
        data["Estimated owners"],
    )
    return transformer.encode(description)

  return self.fget.__get__(instance, owner)()


In [95]:
# Number of rows sent to Milvus per insert call
BATCH_SIZE = 5

j = 0  # number of records embedded so far
batch = []  # rows waiting to be inserted
insert_failed = False

for game_dict in tqdm(games_dict):
    try:
        # Work on a copy so the source records stay untouched and the cell
        # can be re-run.
        row = dict(game_dict)
        # The collection's primary key field is "name", while the DataFrame
        # column is "Name".
        if "name" not in row and "Name" in row:
            row["name"] = row.pop("Name")
        row["embedding"] = embed_game(game_dict)
        batch.append(row)
        j += 1
        if len(batch) >= BATCH_SIZE:
            print("Embedded {} records".format(j))
            collection.insert(batch)
            print("Batch insert completed")
            batch = []
    except Exception as e:
        print("Error inserting record {}".format(e))
        # `pprint` is imported as a module, so the function must be qualified.
        pprint.pprint(batch)
        insert_failed = True
        break  # or pass to continue

# Flush the rows left over from the last partial batch. The previous code
# inserted only the last record, and did so even after a failure.
if batch and not insert_failed:
    collection.insert(batch)
    print("Final batch completed")
print("Finished with {} embeddings".format(j))

Embedded 5 records
Batch insert completed
Embedded 10 records
Batch insert completed
Embedded 15 records
Batch insert completed
Embedded 20 records
Batch insert completed
Embedded 25 records
Batch insert completed
Embedded 30 records
Batch insert completed
Embedded 35 records
Batch insert completed
Embedded 40 records
Batch insert completed
Embedded 45 records
Batch insert completed
Embedded 50 records
Batch insert completed
Embedded 55 records
Batch insert completed
Embedded 60 records
Batch insert completed
Embedded 65 records
Batch insert completed
Embedded 70 records
Batch insert completed
Embedded 75 records
Batch insert completed
Embedded 80 records
Batch insert completed
Embedded 85 records
Batch insert completed
Embedded 90 records
Batch insert completed
Embedded 95 records
Batch insert completed
Embedded 100 records
Batch insert completed
Embedded 105 records
Batch insert completed
Embedded 110 records
Batch insert completed
Embedded 115 records
Batch insert completed
Embedded

RPC error: [insert_rows], <MilvusException: (code=1100, message=the length (124735) of dynamic field exceeds max length (65536): expected=valid length dynamic field, actual=length exceeds max length: invalid parameter)>, <Time:{'RPC start': '2023-12-04 23:21:06.726607', 'RPC error': '2023-12-04 23:21:07.470755'}>


Error inserting record <MilvusException: (code=1100, message=the length (124735) of dynamic field exceeds max length (65536): expected=valid length dynamic field, actual=length exceeds max length: invalid parameter)>
[{'Single-Player_Main Story_Average': 'Unknown',
  'date': '27 November 2019',
  'embedding': array([-3.50488164e-02,  1.47976782e-02, -3.32125686e-02, -8.18991587e-02,
       -4.64994386e-02,  2.04030983e-02,  2.24939119e-02,  3.45539302e-02,
        4.40812297e-03,  2.21107882e-02,  7.84198046e-02, -9.07179725e-04,
        8.21369216e-02, -2.01400481e-02, -1.08046103e-02,  4.61574607e-02,
       -1.20758517e-02,  6.39086291e-02, -4.98277135e-02, -3.16922888e-02,
       -2.49197129e-02, -8.29130486e-02, -3.15026287e-03,  1.55027285e-02,
       -4.44341898e-02, -1.78049002e-02, -3.60276736e-02,  1.28647208e-01,
       -8.69840570e-03, -5.72326705e-02,  1.98818836e-02,  6.23563640e-02,
       -6.29066378e-02,  4.39002551e-02,  2.70123724e-02,  1.76002495e-02,
        2.3167

In [85]:
from pymilvus import *

Connected!


In [59]:
# Search settings: how many hits to return and the index search parameters
topK = 5
SEARCH_PARAM = dict(metric_type="L2", params=dict(nprobe=20))

collection.load()  # load collection memory before search


def embed_search(search_string):
    """Return the result of encoding ``search_string`` with ``transformer``."""
    return transformer.encode(search_string)


def search_for_games(search_string, limit=None, output_fields=None):
    """Search the collection for the entries closest to a text query.

    Args:
        search_string: Text to embed with ``embed_search`` and search for.
        limit: Maximum number of hits to return; the module-level ``topK``
            is used when omitted.
        output_fields: Fields to return with each hit; ``["name",
            "full_desc"]`` is used when omitted.

    Returns:
        The value returned by ``collection.search`` for the single query vector.
    """
    user_vector = embed_search(search_string)

    # Resolve defaults at call time so later changes to topK are honoured.
    if limit is None:
        limit = topK
    if output_fields is None:
        output_fields = ["name", "full_desc"]

    return collection.search(
        [user_vector],
        "embedding",
        param=SEARCH_PARAM,
        limit=limit,
        expr=None,
        output_fields=output_fields,
    )


def convert_hits_json(hits_list):
    """Convert the first group of search hits into plain dictionaries.

    Args:
        hits_list: Sequence whose first element is an iterable of hits, each
            exposing ``to_dict()``. May be empty or None.

    Returns:
        A list with one dict per hit, rebuilt through a JSON round trip.
        When a hit has a string ``entity["full_desc"]``, it is stripped of
        surrounding whitespace. An empty list is returned when there is
        nothing to convert.
    """
    # Nothing was returned: avoid indexing into an empty result.
    if not hits_list:
        return []

    hits = hits_list[0]

    # Round-trip through JSON so the result contains only plain JSON types
    # and is detached from the hit objects.
    games_info = json.loads(json.dumps([hit.to_dict() for hit in hits]))

    for entry in games_info:
        entity = entry.get("entity") or {}
        full_desc = entity.get("full_desc")
        # Hits without a description are left untouched instead of raising.
        if isinstance(full_desc, str):
            entity["full_desc"] = full_desc.strip()

    return games_info

In [36]:
search_string = "A football sports game."
results = search_for_games(search_string)

# The search result holds one group of hits per query vector, so its items
# cannot be indexed by field name directly; convert_hits_json turns the first
# group into plain dictionaries.
for game_info in convert_hits_json(results):
    title = game_info["entity"]["name"]
    overview = game_info["entity"]["full_desc"]

    # Print or do whatever you need with the information
    print(f"Title: {title}")
    print(f"Overview: {overview}")
    print("-----")

Title: Draft Day Sports: Pro Football 2020
Overview: About This GameDraft Day Sports: Pro Football 2020 puts you in control of your favorite pro football franchise. You make the calls as you build your dynasty – build your roster through trades, the draft, and free agency. Analyze the impressive array of data to determine how to put together your own custom playbook and strategies to lead your team to victory. Even show off your coaching knowledge by creating your own plays! Watch the action unfold in dramatic 2D fashion where you can take control of the play calling and watch your calls play out in front of you. Play by yourself against a challenging AI or join an online multiplayer league and see if you have what it takes to outmanage your fellow gamers.
-----
Title: Behold the Kickmen
Overview: About This GameBEHOLD THE KICKMEN started as a silly Twitter joke, and evolved like a beautiful flower into the year's least-exciting and mostly-incorrect Football Simulation. It's football, 