# In [15]:  (Jupyter notebook cell marker)
import pandas as pd
import pickle
import numpy as np
import os
from sklearn.preprocessing import StandardScaler, LabelEncoder
from datetime import datetime

# Model input columns, in the order the classifiers expect them
# (the subset kept by SelectKBest during training).
selected_features = [
    # Review / Steam feature flags
    'metacritic',
    'steam_achievements',
    'steam_trading_cards',
    'workshop_support',
    # Content volume and release timing
    'achievements_total',
    'dlc_count',
    'hasDemo',
    'release_year',
    'game_age',
    # Platform support
    'isMac',
    'num_platforms',
    # Ratios and log-transformed values
    'price_per_dlc',
    'achievements_per_dlc',
    'log_copiesSold',
    'log_price',
    # One-hot publisher class
    'publisherClass_AA',
    'publisherClass_AAA',
    'publisherClass_Hobbyist',
    'publisherClass_Indie',
    # One-hot price category
    'price_category_Free',
    'price_category_Low',
    'price_category_High',
    'price_category_Very High',
    # Genre flags
    'Free To Play',
    'Simulation',
    # Engineered interaction features
    'price_to_copies_ratio',
    'game_complexity',
    'has_metacritic',
    'genre_popularity',
    'free_to_play_interaction',
]

# Columns handed to the fitted scaler. This list also contains columns that
# are not model inputs (genre_diversity, release_month, release_quarter).
numerical_cols = [
    'metacritic',
    'achievements_total',
    'dlc_count',
    'release_year',
    'game_age',
    'num_platforms',
    'price_per_dlc',
    'achievements_per_dlc',
    'log_copiesSold',
    'log_price',
    'genre_diversity',
    'release_month',
    'release_quarter',
    'price_to_copies_ratio',
    'game_complexity',
    'genre_popularity',
]

def preprocess_new_data(df_new, scaler, label_encoder):
    """Turn raw game records into the feature matrix used for prediction.

    The input frame is copied first, so the caller's DataFrame is left
    untouched.

    Args:
        df_new: DataFrame with the raw columns read below: 'metacritic',
            'achievements_total', 'dlc_count', 'hasDemo', 'release_date',
            'price', 'copiesSold', 'supported_platforms',
            'steam_achievements', 'steam_trading_cards', 'workshop_support'
            and 'publisherClass'. 'genres' and 'reviewScore' are optional.
        scaler: Fitted object whose ``transform`` accepts the
            ``numerical_cols`` columns.
        label_encoder: Fitted object whose ``transform`` encodes
            'reviewScore' values.

    Returns:
        A tuple ``(X_new, df_new, encoded_reviewScore)``:
        ``X_new`` holds only the ``selected_features`` columns, ``df_new`` is
        the fully engineered frame, and ``encoded_reviewScore`` is the encoded
        target, the raw 'reviewScore' column if encoding raised ``ValueError``,
        or ``None`` when the column is absent.
    """
    # Work on a copy so the caller's DataFrame is never modified in place.
    df_new = df_new.copy()

    def fill_with_mode(series, fallback):
        # Fill gaps with the most frequent value. When every value is missing
        # the mode is empty, so use `fallback` instead of failing on mode()[0].
        modes = series.mode()
        return series.fillna(modes.iloc[0] if not modes.empty else fallback)

    def cap_outliers(series, lower_quantile=0.01, upper_quantile=0.99):
        # Clip to quantiles computed on this batch.
        # NOTE(review): the bounds depend on the batch being scored, not on
        # training statistics -- confirm this matches the training pipeline.
        return series.clip(
            lower=series.quantile(lower_quantile),
            upper=series.quantile(upper_quantile),
        )

    # Handle missing values and data types
    for col in ('metacritic', 'achievements_total'):
        df_new[col] = pd.to_numeric(df_new[col], errors='coerce').fillna(0)
    for col in ('dlc_count', 'hasDemo'):
        df_new[col] = df_new[col].fillna(0)
    df_new['release_date'] = pd.to_datetime(df_new['release_date'], errors='coerce')
    release = df_new['release_date'].dt
    current_year = datetime.now().year
    # If no date could be parsed at all, fall back to the current year.
    df_new['release_year'] = fill_with_mode(release.year, current_year)
    for col in ('price', 'copiesSold'):
        df_new[col] = pd.to_numeric(df_new[col], errors='coerce').fillna(0)

    # Compute derived features
    df_new['game_age'] = current_year - df_new['release_year']

    # Release month / quarter (fallback: January / Q1 when no date is parseable)
    df_new['release_month'] = fill_with_mode(release.month, 1)
    df_new['release_quarter'] = fill_with_mode(release.quarter, 1)

    # genre_diversity: number of comma-separated genres, 1 when unknown
    if 'genres' in df_new.columns:
        df_new['genre_diversity'] = df_new['genres'].apply(
            lambda x: len(str(x).split(',')) if pd.notna(x) else 1
        )
    else:
        df_new['genre_diversity'] = 1

    # Price categories (prices below -1 fall outside every bin and become NaN)
    df_new['price_category'] = pd.cut(
        df_new['price'],
        bins=[-1, 0, 5, 15, 30, 60, float('inf')],
        labels=['Free', 'Low', 'Medium', 'High', 'Very High', 'Premium']
    )

    # Platform features: case-insensitive substring match on the platform text
    platforms = df_new['supported_platforms'].astype(str).str.lower()
    for flag, keyword in (('isWindows', 'windows'), ('isMac', 'mac'), ('isLinux', 'linux')):
        df_new[flag] = platforms.str.contains(keyword, regex=False, na=False).astype(int)
    df_new['num_platforms'] = df_new[['isWindows', 'isMac', 'isLinux']].sum(axis=1)

    # DLC and achievements features (+1 avoids division by zero)
    dlc_plus_one = df_new['dlc_count'] + 1
    df_new['price_per_dlc'] = df_new['price'] / dlc_plus_one
    df_new['achievements_per_dlc'] = df_new['achievements_total'] / dlc_plus_one

    # Log transformations. These intentionally use the values *before* outlier
    # capping, exactly as in the original statement order.
    df_new['log_copiesSold'] = np.log1p(df_new['copiesSold'].clip(lower=0))
    df_new['log_price'] = np.log1p(df_new['price'].clip(lower=0))

    # Cap outliers; the capped values feed the ratio/complexity features below
    for col in ('copiesSold', 'price', 'achievements_total', 'dlc_count'):
        df_new[col] = cap_outliers(df_new[col])

    # Encode categorical columns (boolean encoding)
    # NOTE(review): astype(bool) maps NaN and any non-empty string (even
    # "False") to True -- kept as-is; verify against the training pipeline.
    for col in ('steam_achievements', 'steam_trading_cards', 'workshop_support'):
        df_new[col] = df_new[col].astype(bool).astype(int)

    # Genre flags
    if 'genres' in df_new.columns:
        df_new['Free To Play'] = df_new['genres'].str.contains('Free To Play', case=False, na=False).astype(int)
        df_new['Simulation'] = df_new['genres'].str.contains('Simulation', case=False, na=False).astype(int)
    else:
        df_new['Free To Play'] = 0
        df_new['Simulation'] = 0

    # Publisher success score (unknown classes get the 'Indie' value 4.0)
    df_new['publisher_success_score'] = df_new['publisherClass'].map({
        'AA': 5.0, 'AAA': 6.0, 'Hobbyist': 3.0, 'Indie': 4.0
    }).fillna(4.0)

    # Other derived features
    df_new['price_to_copies_ratio'] = df_new['price'] / (df_new['log_copiesSold'] + 1e-6)
    # Each component is normalised by its maximum within this batch.
    df_new['game_complexity'] = sum(
        df_new[col] / (df_new[col].max() + 1e-6)
        for col in ('achievements_total', 'dlc_count', 'genre_diversity')
    )
    df_new['has_metacritic'] = (df_new['metacritic'] > 0).astype(int)
    is_free_price = (df_new['price_category'] == 'Free').astype(int)
    df_new['free_to_play_interaction'] = is_free_price * df_new['Free To Play']

    # One-hot encode publisherClass and price_category
    df_new = pd.get_dummies(df_new, columns=['publisherClass', 'price_category'], dtype=int)

    # Genre popularity: mean weight of the genre flags that are set (0 if none)
    genre_popularity = {'Free To Play': 50000, 'Simulation': 30000}
    genre_cols = list(genre_popularity)
    genre_weights = [genre_popularity[col] for col in genre_cols]
    flags_set = df_new[genre_cols].sum(axis=1).replace(0, 1)
    df_new['genre_popularity'] = df_new[genre_cols].dot(genre_weights) / flags_set

    # Ensure all selected features are present (e.g. dummy columns for
    # publisher classes that do not occur in this batch)
    for feature in selected_features:
        if feature not in df_new.columns:
            df_new[feature] = 0

    # Scale numerical features
    df_new[numerical_cols] = scaler.transform(df_new[numerical_cols])

    # Select only the required features
    X_new = df_new[selected_features]

    # Encode reviewScore if present (for comparison)
    encoded_reviewScore = None
    if 'reviewScore' in df_new.columns:
        try:
            encoded_reviewScore = label_encoder.transform(df_new['reviewScore'])
        except ValueError as e:
            print(f"Error encoding reviewScore: {e}. Ensure test reviewScore values match training.")
            encoded_reviewScore = df_new['reviewScore']

    return X_new, df_new, encoded_reviewScore

def predict_new_data(
    input_csv='G:\\Spring 2025\\Machine learning\\Project\\Milestone 2\\Classification_Dataset\\test.csv',
    output_csv='Classification_Predictions\\Classification predictions.csv',
):
    """Score a CSV of games with the saved classifiers and write the results.

    Loads the pickled scaler and label encoder, preprocesses ``input_csv``
    with ``preprocess_new_data``, runs every saved model that can be found,
    and writes one prediction column per model to ``output_csv`` together
    with 'id', 'name' and, if present, the actual 'reviewScore'.

    Args:
        input_csv: Path of the CSV file to score.
        output_csv: Path of the CSV file to write; its directory is created
            when it does not exist.

    Returns:
        None. If the scaler, label encoder or input file is missing, a
        message is printed and the function returns without writing anything.
        A missing model file only skips that model.
    """
    # SECURITY NOTE: pickle.load can execute arbitrary code. Only load the
    # scaler/encoder/model files from a trusted location.

    # Load scaler
    try:
        with open('Classification_Saved_Scaler\\Classification scaler.pkl', 'rb') as f:
            scaler = pickle.load(f)
    except FileNotFoundError as e:
        print(f"Scaler file not found: {e}")
        return

    # Load label encoder
    try:
        with open('Classification_Saved_Encoder\\Classification label_encoder.pkl', 'rb') as f:
            label_encoder = pickle.load(f)
    except FileNotFoundError:
        print("Label encoder file 'Classification label_encoder.pkl' not found.")
        return

    # Load new data
    try:
        df_new = pd.read_csv(input_csv)
    except FileNotFoundError:
        print(f"Input file {input_csv} not found.")
        return

    # Create output directory if it doesn't exist. A bare file name has an
    # empty dirname, which os.makedirs would reject.
    output_dir = os.path.dirname(output_csv)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Preprocess new data
    X_new, df_new, encoded_reviewScore = preprocess_new_data(df_new, scaler, label_encoder)

    # Saved models, resolved relative to the current working directory
    models = {
        'CatBoost': 'Classification_Saved_Models\\catboost_model.pkl',
        'XGBoost': 'Classification_Saved_Models\\xgboost_model.pkl',
        'LightGBM': 'Classification_Saved_Models\\lightgbm_model.pkl'
    }
    predictions = {'id': df_new['id'], 'name': df_new['name']}
    if 'reviewScore' in df_new.columns:
        predictions['actual_reviewScore'] = df_new['reviewScore']

    for model_name, model_file in models.items():
        try:
            with open(model_file, 'rb') as f:
                model = pickle.load(f)
        except FileNotFoundError:
            print(f"Model file {model_file} not found. Skipping {model_name}.")
            continue
        # Some estimators return predictions as an (n, 1) column vector;
        # flatten so the label encoder gets a 1-D array (avoids scikit-learn's
        # column-vector warning) and the result fits a single CSV column.
        y_pred = np.asarray(model.predict(X_new)).ravel()
        predictions[f'predicted_reviewScore_{model_name.lower()}'] = label_encoder.inverse_transform(y_pred)

    # Save predictions
    pd.DataFrame(predictions).to_csv(output_csv, index=False)
    print(f"Predictions saved to {output_csv}")

# Run the prediction pipeline with its default input/output paths, but only
# when this file is executed as a script (not when it is imported).
if __name__ == "__main__":
    predict_new_data()

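# --- Notebook cell output (pasted; not part of the program) ---
# Truncated scikit-learn warning context line:
#   y = column_or_1d(y, warn=True)
#
# Predictions saved to Classification_Predictions\Classification predictions.csv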
