# In [None]:
import pandas as pd
import joblib
from sklearn.preprocessing import StandardScaler
import numpy as np

# Define features used in training.
# This list is used by preprocess_new_data both to back-fill missing columns
# and to select the model input columns (df_new[features]), so it must remain
# a list of column-name strings.
# NOTE(review): presumably the names and their order must match what the saved
# models were trained on — confirm against the training script.
features = [
    'metacritic',
    'steam_achievements',
    'steam_trading_cards',
    'workshop_support',
    'achievements_total',
    'price',
    'dlc_count',
    'copiesSold',  # passed through the loaded scaler during preprocessing
    'hasDemo',
    'release_year',
    'publisherClass_AA',
    'publisherClass_AAA',
    'publisherClass_Hobbyist',
    'publisherClass_Indie',
    'isWindows',
    'isMac',
    'isLinux'
]

# Function to preprocess new data
def preprocess_new_data(df_new, scaler):
    """Prepare a raw games DataFrame for prediction.

    The input frame is copied first, so the caller's DataFrame is left
    untouched; the processed copy is returned as the second tuple element.

    Processing steps:
      * every column named in ``features`` that is absent is added, filled
        with 0;
      * ``metacritic``, ``achievements_total`` and ``release_year`` are
        coerced to numbers, with unparseable/missing values replaced by 0;
      * ``steam_achievements``, ``steam_trading_cards`` and
        ``workshop_support`` are coerced to integers (missing/unparseable
        values become 0 instead of raising);
      * missing ``dlc_count`` / ``hasDemo`` values are replaced by 0;
      * ``copiesSold`` is coerced to numeric, missing values replaced by 0,
        and the column is transformed with ``scaler``.

    Args:
        df_new: pandas DataFrame holding the new rows to score.
        scaler: object exposing ``transform`` for a single-column frame
            (presumably the StandardScaler fitted on ``copiesSold`` during
            training — confirm against the training script).

    Returns:
        Tuple ``(X_new, df_new)``: ``X_new`` contains only the ``features``
        columns, in that order; ``df_new`` is the full processed copy.
    """
    # Work on a copy so preprocessing never leaks into the caller's frame.
    df_new = df_new.copy()

    # Ensure all required features are present
    for feature in features:
        if feature not in df_new.columns:
            df_new[feature] = 0  # Fill missing features with 0 (reasonable default)

    # Numeric columns: anything unparseable or missing becomes 0.
    for column in ('metacritic', 'achievements_total', 'release_year'):
        df_new[column] = pd.to_numeric(df_new[column], errors='coerce').fillna(0)

    # Flag columns: a bare astype(int) raises on NaN, so coerce and fill first.
    for column in ('steam_achievements', 'steam_trading_cards', 'workshop_support'):
        df_new[column] = (
            pd.to_numeric(df_new[column], errors='coerce').fillna(0).astype(int)
        )

    df_new['dlc_count'] = df_new['dlc_count'].fillna(0)
    df_new['hasDemo'] = df_new['hasDemo'].fillna(0)

    # Scale copiesSold. The scaler receives a one-column frame named
    # 'copiesSold'; its 2-D output is flattened before being stored back as a
    # single column.
    copies_sold = pd.to_numeric(df_new['copiesSold'], errors='coerce').fillna(0)
    scaled = scaler.transform(copies_sold.to_frame('copiesSold'))
    df_new['copiesSold'] = np.asarray(scaled).reshape(-1)

    # Select only the required features
    X_new = df_new[features]
    return X_new, df_new

# Main function to load models and predict
def predict_new_data(
    input_csv='new_games_data.csv',
    output_csv='predictions.csv',
    *,
    scaler_path='Regression scaler.pkl',
    model_files=None,
):
    """Score the rows of ``input_csv`` with the saved models and write a CSV.

    The output contains the ``id`` and ``name`` columns when the input has
    them, ``actual_reviewScore`` when the input has a ``reviewScore`` column,
    and one ``predicted_reviewScore_<model>`` column per model that could be
    loaded.

    Args:
        input_csv: path of the CSV file holding the new rows.
        output_csv: path the predictions CSV is written to.
        scaler_path: path of the joblib-pickled scaler used in preprocessing.
        model_files: mapping of display name -> joblib model file path.
            Defaults to the XGBoost and LightGBM regressor files.

    Returns:
        None. On a missing scaler file, a missing input file, or when no
        model file can be loaded, a message is printed and nothing is
        written.
    """
    if model_files is None:
        model_files = {
            'XGBoost Regressor': 'xgboost_regressor.pkl',
            'LightGBM Regressor': 'lightgbm_regressor.pkl',
        }

    # NOTE: joblib.load unpickles arbitrary objects; only point these paths at
    # trusted files.
    # Load scaler
    try:
        scaler = joblib.load(scaler_path)
    except FileNotFoundError:
        print(f"Scaler file '{scaler_path}' not found. Please ensure it exists.")
        return

    # Load new data
    try:
        df_new = pd.read_csv(input_csv)
    except FileNotFoundError:
        print(f"Input file {input_csv} not found.")
        return

    # Preprocess new data
    X_new, df_new = preprocess_new_data(df_new, scaler)

    # Carry identifier columns through only when the input provides them,
    # rather than failing with KeyError on inputs that lack them.
    predictions = {
        column: df_new[column]
        for column in ('id', 'name')
        if column in df_new.columns
    }
    if 'reviewScore' in df_new.columns:
        predictions['actual_reviewScore'] = df_new['reviewScore']

    predicted_any = False
    for model_name, model_file in model_files.items():
        # Only the load is guarded: a missing file skips this model, while
        # errors raised by predict() propagate.
        try:
            model = joblib.load(model_file)
        except FileNotFoundError:
            print(f"Model file {model_file} not found. Skipping {model_name}.")
            continue
        column = f'predicted_reviewScore_{model_name.lower().replace(" ", "_")}'
        predictions[column] = model.predict(X_new)
        predicted_any = True

    if not predicted_any:
        # Avoid writing a "predictions" file that holds no predictions.
        print("No model files could be loaded; no predictions were written.")
        return

    # Save predictions
    pd.DataFrame(predictions).to_csv(output_csv, index=False)
    print(f"Predictions saved to {output_csv}")

if __name__ == "__main__":
    # Script entry point: run predictions on new data using the default
    # arguments of predict_new_data (reads 'new_games_data.csv', writes
    # 'predictions.csv').
    predict_new_data()