In [15]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
import pickle

# Load the dataset
file_path = '/kaggle/input/80000-steam-games-dataset/steam_data.csv'
steam_data = pd.read_csv(file_path)

In [16]:
# Define function to parse price column
def parse_price_strict(price):
    if 'free' in str(price).lower():  # Handle "Free" explicitly
        return 0.0
    match = re.search(r"\d+(\.\d+)?", str(price))  # Match numeric part of price (decimal allowed)
    return float(match.group()) if match else None  # Return as float, or None if no match

# Define function to extract score percentage from user_reviews
def parse_score_percentage(user_reviews):
    match = re.search(r"(\d{1,2})%", str(user_reviews))  # Match 1-2 digit number preceding '%'
    return int(match.group(1)) if match else None  # Return as integer, or None if no match

# Define function to extract sentiment from user_reviews
def parse_review_sentiment(user_reviews):
    sentiment_match = re.search(r"^(.*?)\(", str(user_reviews))  # Extract sentiment before '('
    return sentiment_match.group(1).strip() if sentiment_match else None  # Return sentiment or None

# Apply parsing functions to relevant columns
steam_data['price_parsed'] = steam_data['price'].apply(parse_price_strict)
steam_data['score_percentage'] = steam_data['user_reviews'].apply(parse_score_percentage)
steam_data['review_sentiment'] = steam_data['user_reviews'].apply(parse_review_sentiment)

# Drop rows with missing values in key columns
steam_data_cleaned = steam_data.dropna(subset=['price_parsed', 'score_percentage'])

# Display cleaned data summary (optional)
print(steam_data_cleaned[['price', 'price_parsed', 'user_reviews', 'score_percentage', 'review_sentiment']].head())

                                               price  price_parsed  \
0                       Buy Among Us$4.99Add to Cart          4.99   
1  Play Counter-Strike: Global OffensiveFree to P...          0.00   
2                     Buy Fall Guys$19.99Add to Cart         19.99   
3            Buy Crusader Kings III$49.99Add to Cart         49.99   
4                Play Destiny 2Free To PlayPlay Game          0.00   

                                        user_reviews  score_percentage  \
0  Overwhelmingly Positive(151,281)- 96% of the 1...              96.0   
1  Very Positive(90,780)- 88% of the 90,780 user ...              88.0   
2  Very Positive(32,436)- 84% of the 32,436 user ...              84.0   
3  Very Positive(5,359)- 91% of the 5,359 user re...              91.0   
4  Very Positive(9,147)- 87% of the 9,147 user re...              87.0   

          review_sentiment  
0  Overwhelmingly Positive  
1            Very Positive  
2            Very Positive  
3            Very 

In [17]:
# Prepare the data for modeling
# Extract relevant columns for analysis
X = steam_data_cleaned[['price_parsed', 'review_sentiment']]  # Include other features as needed
y = steam_data_cleaned['score_percentage']

# Convert categorical columns (like sentiment) to numeric using one-hot encoding
X = pd.get_dummies(X, columns=['review_sentiment'], drop_first=True)

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Train the ensemble model (Gradient Boosting Regressor)
gb_model = GradientBoostingRegressor(learning_rate=0.1, max_depth=4, min_samples_leaf=2, min_samples_split=2, n_estimators=100, subsample=0.8, random_state=0)
gb_model.fit(X_train, y_train)

# Evaluate the ensemble model
y_pred = gb_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error (Gradient Boosting):", mse)

# Save model and clean data for reuse if necessary
with open('steam_gb_model.pkl', 'wb') as file:
    pickle.dump(gb_model, file)
steam_data_cleaned.to_csv('cleaned_steam_data.csv', index=False)

Mean Squared Error (Gradient Boosting): 450.49368958775455


In [18]:
# Save model and clean data for reuse if necessary
with open('steam_gb_model.pkl', 'wb') as file:
    pickle.dump(gb_model, file)
steam_data_cleaned.to_csv('cleaned_steam_data.csv', index=False)