In [2]:
import joblib
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel

# Step 1: Read the game catalogue into a DataFrame
metadata = pd.read_csv(r"steam.csv")  # Point this at your own copy of the dataset

# Step 2: Clean the text columns
# Missing text becomes an empty string so the concatenation below never yields NaN.
# 'developer' is cleaned as well, although it is not part of 'combined_features'.
for text_column in ('description', 'tags', 'developer'):
    metadata[text_column] = metadata[text_column].fillna('')

# Build the single text field fed to the vectorizer: "<description> <tags>"
metadata['combined_features'] = metadata['description'].add(' ').add(metadata['tags'])

In [3]:
# Step 3: Feature Engineering
# Turn each game's combined text into a TF-IDF vector, skipping English stop words
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(metadata['combined_features'])

# Pairwise similarity of every game against every other game.
# With Y omitted, linear_kernel compares the matrix with itself; TfidfVectorizer
# L2-normalises rows by default, so this dot product equals cosine similarity.
cosine_sim = linear_kernel(tfidf_matrix)

In [4]:
# Step 4: Build the Recommendation Function
# Create an index mapping for game names: game name -> row label in `metadata`.
# Series.drop_duplicates() compares *values*, and the values here are the row labels
# produced by read_csv (already unique), so it never removed anything. Duplicated
# names have to be dropped through the index instead; keeping the first occurrence
# guarantees that indices[name] is a single integer rather than a Series.
indices = pd.Series(metadata.index, index=metadata['name'])
indices = indices[~indices.index.duplicated(keep='first')]

def recommend_games(game_name, cosine_sim=cosine_sim, top_n=10):
    """
    Recommend games similar to the input game based on cosine similarity.

    Note: the default for ``cosine_sim`` is the matrix that existed when this cell
    was executed. If the matrix is recomputed later, pass it explicitly or re-run
    this cell, otherwise the old matrix is still used.

    :param game_name: Name of the input game; must be present in ``indices``
    :param cosine_sim: Square similarity matrix whose row ``i`` holds the scores of
        game ``i`` against every game; a dense array or a SciPy sparse matrix
    :param top_n: Maximum number of recommendations to return (default 10)
    :return: pandas Series of recommended game names, most similar first
    :raises KeyError: if ``game_name`` is not in the dataset
    """
    if game_name not in indices.index:
        raise KeyError(f"Game '{game_name}' not found in the dataset")

    # Get the index of the game that matches the title.
    # A name that occurs more than once yields a Series; use its first occurrence.
    idx = indices.loc[game_name]
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Get the pairwise similarity scores for this game with all other games.
    # A sparse matrix returns a 1 x n matrix for a row lookup; flatten it to 1-D.
    row = cosine_sim[idx]
    if hasattr(row, 'toarray'):
        row = row.toarray().ravel()

    # Drop the input game by position instead of assuming it sorts first:
    # on tied scores it is not guaranteed to be the first element.
    sim_scores = [(i, score) for i, score in enumerate(row) if i != idx]

    # Sort the games by similarity score (highest first); the sort is stable,
    # so tied games keep their original row order.
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    # Keep the positions of the top_n most similar games
    game_indices = [i for i, _ in sim_scores[:max(top_n, 0)]]

    # Return the names of the most similar games
    return metadata['name'].iloc[game_indices]

# Step 5: Try the recommender on a known title
game_name = "Dota 2"  # Any name that occurs in metadata['name'] can be used here
recommendations = recommend_games(game_name=game_name)
print(f"Games similar to '{game_name}':\n", recommendations)

Games similar to 'Dota 2':
 7085                                         Roads of Rome
967                                                  HOARD
14479                                           VectorWave
13108                         Greyhound Manager 2 Rebooted
1981                                         Anomaly Korea
14615    Phantasmat: The Endless Night Collector's Edition
24299                                   The Legend Of Vraz
25104                   The Qaedon Wars - The Story Begins
3919                                          Pyrite Heart
2346                                            Last Dream
Name: name, dtype: object


In [8]:
# Limit the dataset size so the dense similarity matrix built below stays small.
# .copy() makes the subset an independent frame instead of a slice of the full one,
# so later column assignments on it do not trigger SettingWithCopyWarning.
metadata = metadata.head(5000).copy()  # Adjust the size as needed

# NOTE: `indices` and the default `cosine_sim` argument of recommend_games() were
# built from the full dataset; re-run those cells before querying this subset.

# Compute TF-IDF matrix
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(metadata['combined_features'])

# Compute sparse cosine similarity:
# build the dense matrix, zero out weak links, then store only the non-zero entries
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
threshold = 0.2  # Scores below this are treated as "not similar"
cosine_sim[cosine_sim < threshold] = 0  # Apply threshold
sparse_cosine_sim = csr_matrix(cosine_sim)  # csr_matrix comes from scipy.sparse

# Save the sparse matrix (joblib must be imported in the notebook's import cell)
joblib.dump(sparse_cosine_sim, 'cosine_sim_sparse.joblib')
print("Sparse cosine similarity matrix saved!")


Sparse cosine similarity matrix saved!
