In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost import XGBRegressor

# Read the raw Steam catalogue CSV from the Kaggle input directory into a DataFrame
file_path = '/kaggle/input/80000-steam-games-dataset/steam_data.csv'
steam_data = pd.read_csv(filepath_or_buffer=file_path)

In [2]:
# Parse price column
def parse_price_strict(price):
    """Convert a raw price label into a float.

    Parameters
    ----------
    price : object
        Raw cell value; it is stringified before parsing, so NaN/None are accepted.

    Returns
    -------
    float or None
        0.0 when the label contains "free" (case-insensitive), otherwise the first
        number found in the label, or None when the label contains no number.
    """
    text = str(price)
    if 'free' in text.lower():
        return 0.0
    # First alternative accepts well-formed thousands groups ("1,299.99") so large prices
    # are not truncated at the comma; anything else falls back to a plain number.
    match = re.search(r"\d{1,3}(?:,\d{3})+(?:\.\d+)?|\d+(?:\.\d+)?", text)
    if match is None:
        return None
    return float(match.group().replace(',', ''))

# Numeric price per row (NaN where the label holds no number)
steam_data['price_parsed'] = steam_data['price'].map(parse_price_strict)

# Parse score percentage
def parse_score_percentage(user_reviews):
    """Extract the first percentage figure from a review-summary string.

    Parameters
    ----------
    user_reviews : object
        Raw cell value; it is stringified before parsing, so NaN/None are accepted.

    Returns
    -------
    int or None
        The integer in front of the first '%' sign, or None when there is none.
    """
    # Up to three digits so "100%" is read as 100 (a two-digit pattern matches the
    # trailing "00%" and yields 0). The look-behind stops a match from starting in the
    # middle of a longer run of digits.
    match = re.search(r"(?<!\d)(\d{1,3})%", str(user_reviews))
    return int(match.group(1)) if match else None

# Review score per row as a percentage (NaN where the summary has no '%' figure)
steam_data['score_percentage'] = steam_data['user_reviews'].map(parse_score_percentage)

# Parse categories
# Multi-word / punctuated labels that must survive as single tokens.
exception_categories = [
    "Co-op", "Single-player", "LAN", "Controller Support", "Online Multiplayer", "Remote Play",
    "Steam Workshop", "Downloadable Content", "Steam Cards", "Trading Cards", "VR", "MMORPG", "VR Support",
    "PvE", "PvP", "PVP", "PVE", "Split Screen", "Shared Screen", "Split/Shared Screen", "Cross Platform Play"
]

# Tokenizer compiled once from exception_categories (rebuild it if that list is edited).
# Regex alternation takes the first alternative that matches, so labels are tried
# longest-first: otherwise "VR" would win over "VR Support" and split it in two.
# Labels are escaped so they are always matched literally.
_CATEGORY_TOKEN_RE = re.compile(
    '|'.join(re.escape(label) for label in sorted(exception_categories, key=len, reverse=True))
    + r'|[A-Z][a-z]*'
)

def split_by_capitalized_letters(categories):
    """Split a concatenated category string into individual category tokens.

    Parameters
    ----------
    categories : object
        Raw cell value. Anything that is not a str (NaN, None, ...) yields [].

    Returns
    -------
    list of str
        Tokens in order of appearance: a label from exception_categories when one
        matches at that position, otherwise a capital letter followed by lowercase
        letters.
    """
    # The type check also covers NaN/None, which are never str instances.
    if not isinstance(categories, str):
        return []
    return _CATEGORY_TOKEN_RE.findall(categories)

# Tokenise every row's category string into a list of category labels
steam_data['parsed_categories'] = steam_data['categories'].map(split_by_capitalized_letters)

# Generate one-hot encoded categories: one 0/1 column per label seen anywhere in the data
unique_categories = {cat for cats in steam_data['parsed_categories'] for cat in cats}
category_dummies = pd.DataFrame({
    f'category_{category}': steam_data['parsed_categories'].map(
        lambda cats, wanted=category: int(wanted in cats)
    )
    for category in unique_categories
})
steam_data = pd.concat([steam_data, category_dummies], axis=1)

# Group developers and publishers by how many rows carry their name
developer_counts = steam_data['developer'].value_counts()
publisher_counts = steam_data['publisher'].value_counts()

# Look up each row's count; a missing name has no count (NaN) and so compares as False.
developer_is_known = steam_data['developer'].map(developer_counts).ge(10)
publisher_is_known = steam_data['publisher'].map(publisher_counts).ge(20)

group_labels = {True: 'Known', False: 'Unknown'}
steam_data['developer_grouped'] = developer_is_known.map(group_labels)
steam_data['publisher_grouped'] = publisher_is_known.map(group_labels)

# One-hot encode developer and publisher groups
developer_dummies = pd.get_dummies(steam_data['developer_grouped'], prefix='developer', drop_first=True)
publisher_dummies = pd.get_dummies(steam_data['publisher_grouped'], prefix='publisher', drop_first=True)

# Drop rows with missing values in key columns
steam_data_cleaned = steam_data.dropna(subset=['price_parsed', 'score_percentage', 'categories', 'developer', 'publisher'])

# Restrict the dummies to the surviving rows BEFORE concatenating. pd.concat(axis=1)
# outer-joins on the index, so concatenating the full-length dummies would bring every
# dropped row back as an all-NaN row. Rebinding the names keeps developer_dummies /
# publisher_dummies row-aligned with steam_data_cleaned for later cells.
developer_dummies = developer_dummies.loc[steam_data_cleaned.index]
publisher_dummies = publisher_dummies.loc[steam_data_cleaned.index]
steam_data_cleaned = pd.concat([steam_data_cleaned, developer_dummies, publisher_dummies], axis=1)

# Dynamically validate and drop columns
# NOTE(review): these look like fragments of multi-word category labels or stray title
# words picked up by the tokenizer -- confirm against the raw 'categories' values.
columns_to_drop = ['category_Cloud', 'category_Tablet', 'category_Achievements',
                   'category_Full', 'category_Includes', 'category_Profile', 'category_Limited',
                   'category_Features', 'category_Phone', 'category_Quality', 'category_Stats',
                   'category_Leaderboards', 'category_Content', 'category_Captions',
                   'category_Together', 'category_High', 'category_Additional', 'category_Audio',
                   'category_Partial', 'category_Tycoon', 'category_Requires', 'category_Shared',
                   'category_Rocking', 'category_Frog', 'category_Riders', 'category_Grow',
                   'category_Europa', 'category_Man', 'category_Art', 'category_Valve',
                   'category_Clickteam', 'category_Beyond', 'category_Wrath', 'category_Anti',
                   'category_Cheat', 'category_Special', 'category_In', 'category_Dungeon',
                   'category_Roller', 'category_Lethal', 'category_Fisherman']

# Filter only existing columns so the drop never raises on a label absent from this data
columns_to_drop = [col for col in columns_to_drop if col in steam_data_cleaned.columns]
steam_data_cleaned = steam_data_cleaned.drop(columns=columns_to_drop)

# Include all remaining categories in the feature set.
# Iterate in sorted order: unique_categories is a set of strings, whose iteration order
# changes between interpreter sessions, and the feature-column order feeds straight into
# the seeded models -- without a fixed order, random_state alone does not make results
# reproducible across kernel restarts.
category_columns = [
    column
    for column in (f'category_{category}' for category in sorted(unique_categories))
    if column in steam_data_cleaned.columns
]

In [3]:
# Prepare features and labels
# Every feature column is read from steam_data_cleaned itself, so each row of X comes from
# the same frame -- and the same row order -- as y. (Stacking the separately held dummy
# frames only lines up if their row order happens to match steam_data_cleaned's.)
# The column order matches the feature_names list built in the importance cells.
feature_columns = (
    ['price_parsed']                        # Include price
    + category_columns                      # Include all category columns
    + list(developer_dummies.columns)       # Include developer dummies
    + list(publisher_dummies.columns)       # Include publisher dummies
)
X = steam_data_cleaned[feature_columns].to_numpy(dtype=float)
y = steam_data_cleaned['score_percentage'].to_numpy(dtype=float)

# Remove rows where y is NaN
valid_indices = ~np.isnan(y)
X = X[valid_indices]
y = y[valid_indices]

# Split dataset BEFORE fitting any preprocessing, so the imputer means and scaler
# statistics are learned from training rows only and nothing about the test rows leaks
# into the features the models are trained on.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Impute missing values and standardize features (fit on train, apply to both splits)
imputer = SimpleImputer(strategy='mean')
scaler = StandardScaler()
X_train = scaler.fit_transform(imputer.fit_transform(X_train_raw))
X_test = scaler.transform(imputer.transform(X_test_raw))

# Full-matrix versions kept under their original names for any later cell that reads
# them; they are produced with the train-fitted transformers.
X_imputed = imputer.transform(X)
X_scaled = scaler.transform(X_imputed)

In [None]:
# Hyperparameter tuning for Random Forest: 3-fold grid search scored by R²
rf = RandomForestRegressor(random_state=0)
rf_params = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
)
rf_grid = GridSearchCV(estimator=rf, param_grid=rf_params, scoring='r2', cv=3, n_jobs=-1)
rf_grid.fit(X_train, y_train)

# Evaluate the best estimator found by the search on the held-out split
best_rf = rf_grid.best_estimator_
y_pred_rf = best_rf.predict(X_test)
mse_rf, r2_rf = mean_squared_error(y_test, y_pred_rf), r2_score(y_test, y_pred_rf)
print(f"Random Forest - Best Params: {rf_grid.best_params_}")
print(f"MSE: {mse_rf:.2f}, R²: {r2_rf:.2f}")

In [None]:
# Hyperparameter tuning for XGBoost: 3-fold grid search scored by R²
xgb = XGBRegressor(random_state=0)
xgb_params = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    learning_rate=[0.01, 0.1, 0.2],
    subsample=[0.6, 0.8, 1.0],
)
xgb_grid = GridSearchCV(estimator=xgb, param_grid=xgb_params, scoring='r2', cv=3, n_jobs=-1)
xgb_grid.fit(X_train, y_train)

# Evaluate the best estimator found by the search on the held-out split
best_xgb = xgb_grid.best_estimator_
y_pred_xgb = best_xgb.predict(X_test)
mse_xgb, r2_xgb = mean_squared_error(y_test, y_pred_xgb), r2_score(y_test, y_pred_xgb)
print(f"XGBoost - Best Params: {xgb_grid.best_params_}")
print(f"MSE: {mse_xgb:.2f}, R²: {r2_xgb:.2f}")

In [None]:
# Baseline: ordinary least-squares Linear Regression (no hyperparameters to tune)
lr = LinearRegression().fit(X_train, y_train)

# Score the baseline on the held-out split
y_pred_lr = lr.predict(X_test)
mse_lr, r2_lr = mean_squared_error(y_test, y_pred_lr), r2_score(y_test, y_pred_lr)
print(f"Linear Regression - MSE: {mse_lr:.2f}, R²: {r2_lr:.2f}")

In [None]:
# Feature importance for Random Forest
# Names are listed in the same order in which the feature matrix columns were assembled.
feature_names = [
    'price_parsed',
    *category_columns,
    *developer_dummies.columns,
    *publisher_dummies.columns,
]
feature_importances_rf = (
    pd.DataFrame({'Feature': feature_names, 'Importance': best_rf.feature_importances_})
    .sort_values(by='Importance', ascending=False)
)

# Plot the 25 highest-ranked features as a horizontal bar chart
top_rf_features = feature_importances_rf.head(25)
plt.figure(figsize=(25, 8))
sns.barplot(x='Importance', y='Feature', data=top_rf_features)
plt.title('Top 25 Most Important Features - Random Forest')
plt.xlabel('Feature Importance')
plt.ylabel('Feature')
plt.show()

In [None]:
# Feature importance for XGBoost
# Names are listed in the same order in which the feature matrix columns were assembled.
feature_names = ['price_parsed'] + category_columns + list(developer_dummies.columns) + list(publisher_dummies.columns)
feature_importances_xgb = pd.DataFrame({
    'Feature': feature_names,
    'Importance': best_xgb.feature_importances_
}).sort_values(by='Importance', ascending=False)

# Plot feature importance (top 25). The title names XGBoost: this chart shows
# best_xgb's importances, not the Random Forest's.
plt.figure(figsize=(25, 8))
sns.barplot(data=feature_importances_xgb.head(25), x='Importance', y='Feature')
plt.title('Top 25 Most Important Features - XGBoost')
plt.xlabel('Feature Importance')
plt.ylabel('Feature')
plt.show()