In [49]:
# 1. Import Required Libraries

import pandas as pd
import numpy as np

# Model & preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# Evaluation metrics
from sklearn.metrics import mean_squared_error, r2_score


In [50]:
# 2. Load Dataset
# Read the video game sales CSV; the relative path means the file must sit in
# the notebook's working directory.
df = pd.read_csv("vgsales.csv")

# Report the (rows, columns) shape, then preview the first five records.
print("Dataset Shape:", df.shape)
df.head(n=5)

Dataset Shape: (16598, 11)


Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.0
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37


In [51]:
# 3. Data Cleaning
# Year: coerce to numeric (unparseable entries become NaN), then impute the
# gaps with the median of the parsed values.
# NOTE: the median is computed over the whole dataset, i.e. before the
# train/test split performed later in the notebook.
df["Year"] = pd.to_numeric(df["Year"], errors="coerce").pipe(
    lambda years: years.fillna(years.median())
)

# Publisher: replace missing names with an explicit placeholder category.
df["Publisher"] = df["Publisher"].fillna("Unknown")

# Sanity check: per-column count of remaining missing values.
df.isna().sum()

Unnamed: 0,0
Rank,0
Name,0
Platform,0
Year,0
Genre,0
Publisher,0
NA_Sales,0
EU_Sales,0
JP_Sales,0
Other_Sales,0


In [52]:
# 4. Feature Selection
# Predictors: release year plus three categorical descriptors. The regional
# *_Sales columns are left out on purpose so the target does not leak into
# the features.
X = df[
    [
        "Year",
        "Platform",
        "Genre",
        "Publisher",
    ]
]

# Target: Global_Sales (continuous, so this is a regression problem).
y = df["Global_Sales"]

X.head()

Unnamed: 0,Year,Platform,Genre,Publisher
0,2006.0,Wii,Sports,Nintendo
1,1985.0,NES,Platform,Nintendo
2,2008.0,Wii,Racing,Nintendo
3,2009.0,Wii,Sports,Nintendo
4,1996.0,GB,Role-Playing,Nintendo


In [53]:
# 5. One-Hot Encoding
# Encode straight from `df` rather than from the previous value of `X`.
# The earlier form (`X = pd.get_dummies(X, columns=[...])`) could only run once:
# on a second run the categorical columns were already consumed, so the cell
# raised a KeyError unless the feature-selection cell was re-run first.
# Selecting the same four columns from `df` yields the identical matrix and
# keeps the cell idempotent.
X = pd.get_dummies(
    df[["Year", "Platform", "Genre", "Publisher"]],
    columns=["Platform", "Genre", "Publisher"],
    drop_first=True,  # k categories -> k-1 indicator columns
)

# Cast every column to int: indicator columns become 0/1 and Year becomes a
# whole number (e.g. 2006.0 -> 2006).
X = X.astype(int)

print("Feature shape after encoding:", X.shape)
X.head()

Feature shape after encoding: (16598, 619)


Unnamed: 0,Year,Platform_3DO,Platform_3DS,Platform_DC,Platform_DS,Platform_GB,Platform_GBA,Platform_GC,Platform_GEN,Platform_GG,...,Publisher_Zushi Games,Publisher_bitComposer Games,Publisher_dramatic create,Publisher_fonfun,Publisher_iWin,Publisher_id Software,Publisher_imageepoch Inc.,Publisher_inXile Entertainment,"Publisher_mixi, Inc",Publisher_responDESIGN
0,2006,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1985,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2008,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2009,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1996,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [54]:
# 6. Train-Test Split
# Hold out 20% of the rows for evaluation; the fixed seed makes the partition
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# Confirm the (rows, columns) shape of each feature partition.
print("Training size:", X_train.shape)
print("Testing size:", X_test.shape)

Training size: (13278, 619)
Testing size: (3320, 619)


In [55]:
# 7. Linear Regression Model

# fit() returns the estimator itself, so construction and training are chained.
lr = LinearRegression().fit(X_train, y_train)

# Score on the held-out split only.
y_pred_lr = lr.predict(X_test)
r2_lr = r2_score(y_test, y_pred_lr)
rmse_lr = np.sqrt(mean_squared_error(y_test, y_pred_lr))  # RMSE = sqrt(MSE)

print("Linear Regression Results")
print("RMSE:", rmse_lr)
print("R2:", r2_lr)


Linear Regression Results
RMSE: 1.9562145943126836
R2: 0.08915876832997405


In [56]:
# 8. Decision Tree Regressor

# Default-hyperparameter tree; the seed fixes the estimator's random state.
dt = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)

# Score on the held-out split only.
y_pred_dt = dt.predict(X_test)
r2_dt = r2_score(y_test, y_pred_dt)
rmse_dt = np.sqrt(mean_squared_error(y_test, y_pred_dt))  # RMSE = sqrt(MSE)

print("Decision Tree Results")
print("RMSE:", rmse_dt)
print("R2:", r2_dt)

Decision Tree Results
RMSE: 2.1951822517985904
R2: -0.1469668292547226


In [57]:
# 9. Random Forest Regressor

# Default-hyperparameter forest; the seed fixes the estimator's random state.
rf = RandomForestRegressor(random_state=42).fit(X_train, y_train)

# Score on the held-out split only.
y_pred_rf = rf.predict(X_test)
r2_rf = r2_score(y_test, y_pred_rf)
rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf))  # RMSE = sqrt(MSE)

print("Random Forest Results")
print("RMSE:", rmse_rf)
print("R2:", r2_rf)

Random Forest Results
RMSE: 2.0404460783835034
R2: 0.009031299804074577


In [58]:
# 10. Model Comparison

# One row per model: (name, test RMSE, test R2).
results = pd.DataFrame(
    [
        ("Linear Regression", rmse_lr, r2_lr),
        ("Decision Tree", rmse_dt, r2_dt),
        ("Random Forest", rmse_rf, r2_rf),
    ],
    columns=["Model", "RMSE", "R2 Score"],
)

# Lowest error first.
results.sort_values(by="RMSE")

Unnamed: 0,Model,RMSE,R2 Score
0,Linear Regression,1.956215,0.089159
2,Random Forest,2.040446,0.009031
1,Decision Tree,2.195182,-0.146967


Linear Regression performs slightly better than the tree-based models on the test set (RMSE ≈ 1.96, versus ≈ 2.04 for Random Forest and ≈ 2.20 for Decision Tree).

R² scores are low for every model (at most ≈ 0.09, and negative for the Decision Tree), indicating weak predictive power.

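The features used here (Year, Platform, Genre, Publisher) are not strong predictors of Global Sales; the regional sales columns were deliberately excluded to avoid target leakage.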

More feature engineering is required for improvement.