In [29]:
import pandas as pd
import re

# Load the dataset
file_path = '/kaggle/input/80000-steam-games-dataset/steam_data.csv'  # Replace with the correct file path
steam_data = pd.read_csv(file_path)

steam_data.head()

Unnamed: 0,url,name,categories,img_url,user_reviews,all_reviews,date,developer,publisher,price,pegi,pegi_url
0,https://store.steampowered.com/app/945360/Amon...,Among Us,Online PvPLAN PvPOnline Co-opLAN Co-opCross-Pl...,https://steamcdn-a.akamaihd.net/steam/apps/945...,"Overwhelmingly Positive(151,281)- 96% of the 1...","Overwhelmingly Positive(224,878)- 95% of the 2...","Nov 16, 2018",Innersloth,Innersloth,Buy Among Us$4.99Add to Cart,-,-
1,https://store.steampowered.com/app/730/Counter...,Counter-Strike: Global Offensive,Steam AchievementsFull controller supportSteam...,https://steamcdn-a.akamaihd.net/steam/apps/730...,"Very Positive(90,780)- 88% of the 90,780 user ...","Very Positive(4,843,904)- 87% of the 4,843,904...","Aug 21, 2012","Valve, Hidden Path Entertainment",Valve,Play Counter-Strike: Global OffensiveFree to P...,-,-
2,https://store.steampowered.com/app/1097150/Fal...,Fall Guys: Ultimate Knockout,MMOOnline PvPOnline Co-opSteam AchievementsFul...,https://steamcdn-a.akamaihd.net/steam/apps/109...,"Very Positive(32,436)- 84% of the 32,436 user ...","Very Positive(223,706)- 80% of the 223,706 use...","Aug 3, 2020",Mediatonic,Devolver Digital,Buy Fall Guys$19.99Add to Cart,-,-
3,https://store.steampowered.com/app/1158310/Cru...,Crusader Kings III,Single-playerOnline PvPSteam AchievementsSteam...,https://steamcdn-a.akamaihd.net/steam/apps/115...,"Very Positive(5,359)- 91% of the 5,359 user re...","Very Positive(18,951)- 92% of the 18,951 user ...","Sep 1, 2020",Paradox Development Studio,Paradox Interactive,Buy Crusader Kings III$49.99Add to Cart,BloodLanguagePartial NuditySexual ThemesUse of...,https://steamstore-a.akamaihd.net/public/share...
4,https://store.steampowered.com/app/1085660/Des...,Destiny 2,Single-playerOnline PvPOnline Co-opSteam Achie...,https://steamcdn-a.akamaihd.net/steam/apps/108...,"Very Positive(9,147)- 87% of the 9,147 user re...","Very Positive(284,689)- 86% of the 284,689 use...","Oct 1, 2019",Bungie,Bungie,Play Destiny 2Free To PlayPlay Game,Blood Language Violence,https://steamstore-a.akamaihd.net/public/share...


In [30]:
# Define function to parse price column
def parse_price_strict(price):
    if 'free' in price.lower():  # Handle "Free" explicitly
        return 0.0
    match = re.search(r"\d+(\.\d+)?", str(price))  # Match numeric part of price (decimal allowed)
    return float(match.group()) if match else None  # Return as float, or None if no match

# Define function to extract score percentage from user_reviews
def parse_score_percentage(user_reviews):
    match = re.search(r"(\d{1,2})%", str(user_reviews))  # Match 1-2 digit number preceding '%'
    return int(match.group(1)) if match else None  # Return as integer, or None if no match

# Define function to extract sentiment from user_reviews
def parse_review_sentiment(user_reviews):
    sentiment_match = re.search(r"^(.*?)\(", str(user_reviews))  # Extract sentiment before '('
    return sentiment_match.group(1).strip() if sentiment_match else None  # Return sentiment or None

# Apply parsing functions to relevant columns
steam_data['price_parsed'] = steam_data['price'].apply(parse_price_strict)
steam_data['score_percentage'] = steam_data['user_reviews'].apply(parse_score_percentage)
steam_data['review_sentiment'] = steam_data['user_reviews'].apply(parse_review_sentiment)

# Drop the original 'genres' column if it is no longer needed
steam_data_cleaned = steam_data_cleaned.dropna(subset=['price_parsed', 'score_percentage'])

# Display cleaned data summary (optional)
print(steam_data_cleaned[['price', 'price_parsed', 'user_reviews', 'score_percentage', 'review_sentiment']].head())

                                               price  price_parsed  \
0                       Buy Among Us$4.99Add to Cart          4.99   
1  Play Counter-Strike: Global OffensiveFree to P...          0.00   
2                     Buy Fall Guys$19.99Add to Cart         19.99   
3            Buy Crusader Kings III$49.99Add to Cart         49.99   
4                Play Destiny 2Free To PlayPlay Game          0.00   

                                        user_reviews  score_percentage  \
0  Overwhelmingly Positive(151,281)- 96% of the 1...              96.0   
1  Very Positive(90,780)- 88% of the 90,780 user ...              88.0   
2  Very Positive(32,436)- 84% of the 32,436 user ...              84.0   
3  Very Positive(5,359)- 91% of the 5,359 user re...              91.0   
4  Very Positive(9,147)- 87% of the 9,147 user re...              87.0   

          review_sentiment  
0  Overwhelmingly Positive  
1            Very Positive  
2            Very Positive  
3            Very 

In [31]:
# Prepare the data for modeling
# Extract relevant columns for analysis
X = steam_data_cleaned[['price_parsed', 'review_sentiment']]  # Include other features as needed
y = steam_data_cleaned['score_percentage']

# Convert categorical columns (like sentiment) to numeric using one-hot encoding or similar
X = pd.get_dummies(X, columns=['review_sentiment'], drop_first=True)

In [32]:
# Example: Simple linear regression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model = LinearRegression()
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

Mean Squared Error: 450.6185995945769


In [None]:
# Save model and clean data for reuse if necessary
import pickle
with open('steam_model.pkl', 'wb') as file:
    pickle.dump(model, file)
steam_data_cleaned.to_csv('cleaned_steam_data.csv', index=False)