# Model Pipeline

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

def preprocess_data(df):
    # Create a copy to avoid modifying the original TODO use original later
    processed_df = df.copy()
    processed_df = processed_df.drop(['appid', 'average_forever', 'average_2weeks', 'median_forever', 'median_2weeks', 'userscore', 'score_rank', 'initialprice', 'languages'], axis=1)
    processed_df = pd.get_dummies(processed_df, columns=['owners', "publisher", "developer"])
    
    # Handle missing values in price
    processed_df['price'] = processed_df['price'].fillna(processed_df['price'].median())
    
    # Scale the price and positive features
    scaler = StandardScaler()
    processed_df['price_scaled'] = scaler.fit_transform(processed_df[['price']])
    processed_df['positive_scaled'] = scaler.fit_transform(processed_df[['positive']])
    
    return processed_df

def engineer_features(df):
    # Include both scaled price and positive in similarity features
    # similarity_features = ['price_scaled', 'positive_scaled']
    similarity_features = df.columns
    
    # Could play around with dropping some features here
    return df, similarity_features

def build_similarity_matrix(df, features=None, feature_weights=None):
    """
    - feature_weights: Dictionary mapping feature names to weights
                      If a feature is not in this dict, it will receive an equal share
                      of the remaining weight
    """
    # If no features specified, use all columns in the dataframe
    if features is None:
        features = df.columns.tolist()
    
    # Extract the feature matrix
    feature_matrix = df[features].values
    
    # Initialize weights array
    weights_array = np.ones((1, len(features)))
    
    # Apply weights if provided
    if feature_weights is not None:
        # Calculate total weight already assigned
        total_assigned_weight = sum(weight for feature, weight in feature_weights.items() 
                                   if feature in features)
        
        # Count features without specified weights
        unspecified_features = [f for f in features if f not in feature_weights]
        num_unspecified = len(unspecified_features)
        
        # Calculate weight for each unspecified feature
        remaining_weight = 1.0 - total_assigned_weight
        default_weight = remaining_weight / num_unspecified if num_unspecified > 0 else 0
        
        # Apply weights to each feature
        for i, feature in enumerate(features):
            if feature in feature_weights:
                weights_array[0, i] = feature_weights[feature]
            else:
                weights_array[0, i] = default_weight
    
    # Apply weights to feature matrix
    weighted_matrix = feature_matrix * weights_array
    
    # Calculate cosine similarity using the weighted matrix
    similarity_matrix = cosine_similarity(weighted_matrix)
    
    # Create a DataFrame for easier indexing
    similarity_df = pd.DataFrame(
        similarity_matrix,
        index=df.index,
        columns=df.index
    )
    
    return similarity_df



In [3]:
def get_recommendations(game_idx, similarity_df, df, n=5):
    print(f"Getting recommendations for index {game_idx} and name {df['name'].iloc[0]}")
    # Check if the game exists in our data
    if game_idx not in similarity_df.index:
        return f"Game with index {game_idx} not found in the database."
    # Get similarity scores for the game
    similarity_scores = similarity_df.loc[game_idx].sort_values(ascending=False)
    
    # Get top N similar games (excluding the game itself)
    similar_games = similarity_scores.iloc[1:n+1]
    
    # Get details of recommended games
    recommendations = df.loc[similar_games.index]
    
    # Add similarity score to recommendations
    recommendations = recommendations.copy()
    recommendations['similarity_score'] = similar_games.values
    
    # Sort by similarity score
    recommendations = recommendations.sort_values('similarity_score', ascending=False)
    print(recommendations.head())
    
    return recommendations[['name', 'price', 'similarity_score']]

def create_recommender_system(df):
    # TODO: add calls to preprocess_data, engineer_features, and build_similarity_matrix here when pipeline ready
    processed_df = preprocess_data(df)

    df, similarity_features = engineer_features(processed_df)

    print("Returning early")
    print(df.head(1))
    print(similarity_features)
    return -1,-1
    # Define weights as a dictionary mapping feature names to weights
    feature_weights = {
        'price_scaled': 0.1  # This assigns 0.1 weight to price_scaled
        # The remaining 0.9 weight will be distributed among other features
    }

    # Call the function with the dictionary of weights
    similarity_df = build_similarity_matrix(df, similarity_features, feature_weights)
    
    # Create a recommendation function with pre-loaded data
    def recommend(game_idx, n=5):
        return get_recommendations(game_idx, similarity_df, df, n)
    
    # Alternative function that accepts a game name
    def recommend_by_name(game_name, n=5):
        # Find the game index
        if game_name not in df['name'].values:
            return f"Game '{game_name}' not found in the database."
        
        game_idx = df[df['name'] == game_name].index[0]
        return get_recommendations(game_idx, similarity_df, df, n)
    
    return recommend, recommend_by_name


In [4]:
df = pd.read_csv("../Data\Top 1000 Steam Games 2023 export 2025-07-09 14-37-02.csv")
# Encode categorical variables
df = df[:20]

df.head(1)

  df = pd.read_csv("../Data\Top 1000 Steam Games 2023 export 2025-07-09 14-37-02.csv")


Unnamed: 0,appid,name,developer,publisher,score_rank,positive,negative,userscore,owners,average_forever,average_2weeks,median_forever,median_2weeks,price,initialprice,discount,languages,genre,ccu,tags
0,10,Counter-Strike,Valve,Valve,,216345,5530,0,"10,000,000 .. 20,000,000",0,0,0,0,999,999,0,"English, French, German, Italian, Spanish - Sp...",Action,10775,"{'Action': 5448, 'FPS': 4862, 'Multiplayer': 3..."


In [5]:
# def recommend(game_idx, n=5):
#     return get_recommendations(game_idx, similarity_df, df, n)

# # Alternative function that accepts a game name
# def recommend_by_name(game_name, n=5):
#     # Find the game index
#     if game_name not in df['name'].values:
#         return f"Game '{game_name}' not found in the database."
    
#     game_idx = df[df['name'] == game_name].index[0]
#     return get_recommendations(game_idx, similarity_df, df, n)

# # Create the recommender
# processed_df = preprocess_data(df)

# df, similarity_features = engineer_features(processed_df)

# # Define weights as a dictionary mapping feature names to weights
# feature_weights = {
#     'price_scaled': 0.1  # This assigns 0.1 weight to price_scaled
#     # The remaining 0.9 weight will be distributed among other features
# }

# # Call the function with the dictionary of weights
# similarity_df = build_similarity_matrix(df, similarity_features, feature_weights)
recommend_by_idx, recommend_by_name = create_recommender_system(df)

Returning early
             name  positive  negative  price  discount   genre    ccu  \
0  Counter-Strike    216345      5530    999         0  Action  10775   

                                                tags  \
0  {'Action': 5448, 'FPS': 4862, 'Multiplayer': 3...   

   owners_10,000,000 .. 20,000,000  owners_2,000,000 .. 5,000,000  \
0                             True                          False   

   owners_20,000,000 .. 50,000,000  owners_5,000,000 .. 10,000,000  \
0                            False                           False   

   owners_50,000,000 .. 100,000,000  publisher_Valve  \
0                             False             True   

   developer_Gearbox Software  developer_Valve  price_scaled  positive_scaled  
0                       False             True      0.912981         0.627441  
Index(['name', 'positive', 'negative', 'price', 'discount', 'genre', 'ccu',
       'tags', 'owners_10,000,000 .. 20,000,000',
       'owners_2,000,000 .. 5,000,000', 'owne

In [6]:
# Get recommendations by index
game_idx = 12
recommendations_by_idx = recommend_by_idx(game_idx, n=10)
print(f"Recommendations for game at index {game_idx} based on price:")
print(recommendations_by_idx)

# Get recommendations by name
game_name = 'Counter-Strike'
recommendations_by_name = recommend_by_name(game_name, n=3)
print(f"\nRecommendations for '{game_name}' based on price:")
print(recommendations_by_name)

# Evaluate the recommender
test_indices = [0, 2, 4]  # Indices of some games
evaluation = evaluate_recommender(df, test_indices)
print("\nEvaluation results:")
print(evaluation)

TypeError: 'int' object is not callable