In [3]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
import pandas as pd
import re
from sklearn.model_selection import train_test_split

# Load and clean the data
file_path = '/kaggle/input/80000-steam-games-dataset/steam_data.csv'
steam_data = pd.read_csv(file_path)

# Define functions to parse the dataset
def parse_price_strict(price):
    if 'free' in str(price).lower():
        return 0.0
    match = re.search(r"\d+(\.\d+)?", str(price))
    return float(match.group()) if match else None

def parse_score_percentage(user_reviews):
    match = re.search(r"(\d{1,2})%", str(user_reviews))
    return int(match.group(1)) if match else None

def parse_review_sentiment(user_reviews):
    sentiment_match = re.search(r"^(.*?)\(", str(user_reviews))
    return sentiment_match.group(1).strip() if sentiment_match else None

# Apply parsing functions
steam_data['price_parsed'] = steam_data['price'].apply(parse_price_strict)
steam_data['score_percentage'] = steam_data['user_reviews'].apply(parse_score_percentage)
steam_data['review_sentiment'] = steam_data['user_reviews'].apply(parse_review_sentiment)

# Drop rows with missing values in relevant columns
steam_data_cleaned = steam_data.dropna(subset=['price_parsed', 'score_percentage'])

# One-hot encode the categories column
# Split each entry in the 'categories' column by commas and create binary columns for each unique category
categories_dummies = steam_data_cleaned['categories'].str.get_dummies(sep=',')

# Concatenate the new binary columns to the main dataset
steam_data_cleaned = pd.concat([steam_data_cleaned, categories_dummies], axis=1)

# Prepare the data for modeling
# Extract relevant features for analysis
X = steam_data_cleaned[['price_parsed', 'review_sentiment'] + list(categories_dummies.columns)]  # Include category columns
y = steam_data_cleaned['score_percentage']

# Convert categorical columns (like sentiment) to numeric using one-hot encoding
X = pd.get_dummies(X, columns=['review_sentiment'], drop_first=True)

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Define the model (change to DecisionTreeRegressor if needed)
model = RandomForestRegressor(max_depth=10, min_samples_leaf=2, min_samples_split=10, n_estimators=200, random_state=0)

# Use the best model to make predictions
best_model = model.fit(X_train,y_train)
y_pred = best_model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print("Test Mean Squared Error with Best Model:", mse)

Test Mean Squared Error with Best Model: 422.509796150918


In [6]:
import pickle
# Save model and clean data for reuse if necessary
with open('steam_ensemble_model.pkl', 'wb') as file:
    pickle.dump(model, file)
steam_data_cleaned.to_csv('cleaned_steam_data.csv', index=False)