In [29]:
!pip install -q pandas numpy scikit-learn scipy matplotlib seaborn gradio
!git --version


git version 2.34.1


In [20]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score, f1_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import joblib
import gradio as gr

In [3]:
# Read the video-game sales table from the CSV file in the working directory
df = pd.read_csv(filepath_or_buffer="vgsales.csv")
print("Shape:", df.shape)
# Preview the first five records as the cell's rich output
df.head(n=5)

Shape: (16598, 11)


Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.0
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37


In [4]:
# Inspect and basic cleaning
# DataFrame.info() prints its own report and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
df.info()
print("\nMissing values per column:\n", df.isna().sum())

# Drop rows that have no target value
df = df.dropna(subset=['Global_Sales']).copy()

# Coerce Year to a numeric dtype; entries that cannot be parsed become NaN.
# No imputation happens here: missing years are filled later by the
# median imputer inside the modelling pipeline.
df['Year'] = pd.to_numeric(df['Year'], errors='coerce')

# Treat implausible years (before 1970 or after 2050) as missing
df.loc[(df['Year'] < 1970) | (df['Year'] > 2050), 'Year'] = np.nan

# Fill unknown publisher
df['Publisher'] = df['Publisher'].fillna('Unknown')

# Sum of the four regional sales columns
df['Total_Regional_Sales'] = df[['NA_Sales','EU_Sales','JP_Sales','Other_Sales']].sum(axis=1)

# Regional share features.
# The denominator is the regional total, NOT Global_Sales: Global_Sales is the
# prediction target, so dividing by it would leak the target into the features,
# and predict_sales() computes these shares from the regional total as well.
# The small epsilon avoids division by zero for rows with no regional sales.
df['NA_ratio'] = df['NA_Sales'] / (df['Total_Regional_Sales'] + 1e-6)
df['EU_ratio'] = df['EU_Sales'] / (df['Total_Regional_Sales'] + 1e-6)
df['JP_ratio'] = df['JP_Sales'] / (df['Total_Regional_Sales'] + 1e-6)
df['Other_ratio'] = df['Other_Sales'] / (df['Total_Regional_Sales'] + 1e-6)

# Outlier clipping: limit each sales column to its own 1st-99th percentile range.
# The ratio features above were computed from the unclipped values.
for col in ['NA_Sales','EU_Sales','JP_Sales','Other_Sales','Global_Sales','Total_Regional_Sales']:
    lower = df[col].quantile(0.01)
    upper = df[col].quantile(0.99)
    df[col] = df[col].clip(lower, upper)

print("After preprocessing:")
print(df.describe(include='all').transpose().head(20))


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16598 entries, 0 to 16597
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Rank          16598 non-null  int64  
 1   Name          16598 non-null  object 
 2   Platform      16598 non-null  object 
 3   Year          16327 non-null  float64
 4   Genre         16598 non-null  object 
 5   Publisher     16540 non-null  object 
 6   NA_Sales      16598 non-null  float64
 7   EU_Sales      16598 non-null  float64
 8   JP_Sales      16598 non-null  float64
 9   Other_Sales   16598 non-null  float64
 10  Global_Sales  16598 non-null  float64
dtypes: float64(6), int64(1), object(4)
memory usage: 1.4+ MB
None

Missing values per column:
 Rank              0
Name              0
Platform          0
Year            271
Genre             0
Publisher        58
NA_Sales          0
EU_Sales          0
JP_Sales          0
Other_Sales       0
Global_Sales      0
dtype: int64
Aft

In [5]:
# Predictor columns: game descriptors, raw regional sales, and the
# engineered total / share features
feature_cols = (
    ['Platform', 'Year', 'Genre', 'Publisher']
    + ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales']
    + ['Total_Regional_Sales', 'NA_ratio', 'EU_ratio', 'JP_ratio', 'Other_ratio']
)

# Feature matrix and regression target, copied so later edits do not touch df
X = df.loc[:, feature_cols].copy()
y = df.loc[:, 'Global_Sales'].copy()

# Partition the feature names by dtype: numeric vs. everything else
numeric_features = list(X.select_dtypes(include=[np.number]).columns)
categorical_features = list(X.select_dtypes(exclude=[np.number]).columns)


In [7]:
# Build the preprocessing + model pipeline

# Numeric columns: fill missing values with the median, then standardise
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill missing values with the mode, then one-hot encode;
# categories unseen during fitting are ignored at transform time
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group to its transformer
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Random-forest regressor with a fixed seed
model = RandomForestRegressor(random_state=42, n_estimators=300)

# Full pipeline; the estimator step is named 'clf'
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('clf', model),
])


In [9]:
# Splitting and training

# Hold out 20% of the rows as a test set (fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# Fit the baseline pipeline on the training portion only
pipeline.fit(X_train, y_train)


In [22]:
# Cross-validation and hyperparameter tuning using RandomizedSearchCV

# Candidate values for the random forest, addressed through the 'clf' step
param_dist = {
    'clf__n_estimators': [100, 300, 500, 800],
    'clf__max_depth': [None, 10, 20, 30],
    'clf__min_samples_split': [2, 5, 10],
    'clf__min_samples_leaf': [1, 2, 4],
    'clf__max_features': ['sqrt', 'log2', None],
}

# Sample 5 parameter combinations, score each with 3-fold CV on (negated) RMSE,
# and use all available cores
random_search_cv = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_dist,
    scoring='neg_root_mean_squared_error',
    n_iter=5,
    cv=3,
    n_jobs=-1,
    random_state=42,
)

random_search_cv.fit(X_train, y_train)

# Keep the best pipeline found by the search
best_model = random_search_cv.best_estimator_

print("Best parameters:", random_search_cv.best_params_)
# The score is a negated RMSE, so flip the sign for reporting
print(f"Best CV RMSE: {-random_search_cv.best_score_:.4f}")


Best parameters: {'clf__n_estimators': 100, 'clf__min_samples_split': 5, 'clf__min_samples_leaf': 4, 'clf__max_features': None, 'clf__max_depth': 30}
Best CV RMSE: 0.0050


In [23]:
# Task 8: persist the tuned pipeline (best_model) to best_model.pkl
joblib.dump(value=best_model, filename="best_model.pkl")

print("Best model saved to best_model.pkl")

Best model saved to best_model.pkl


In [26]:
# Evaluate the tuned model on the held-out test set
#
# NOTE: the features include the four regional sales figures and their sum,
# and Global_Sales is (up to rounding and clipping) that same sum, so
# near-perfect scores are expected here and say little about how well sales
# could be predicted from game descriptors alone.

y_pred = best_model.predict(X_test)

# RMSE is the square root of the MSE and is in the same units as the target
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Test RMSE: {rmse:.4f}")
print(f"Test MAE: {mae:.4f}")
print(f"Test R2: {r2:.4f}")

Test RMSE: 0.0042
Test MAE: 0.0022
Test R2: 1.0000


In [28]:
def predict_sales(Platform, Year, Genre, Publisher,
                  NA_Sales, EU_Sales, JP_Sales, Other_Sales):
    """Predict global sales for a single game with the tuned pipeline.

    Builds a one-row DataFrame holding the same feature columns the model was
    trained on (descriptors, regional sales, their total and regional shares)
    and passes it to the module-level ``best_model``.

    Parameters
    ----------
    Platform, Genre, Publisher : str
        Categorical descriptors of the game.
    Year : float or None
        Release year. ``None`` or a value outside 1970-2050 is passed to the
        model as NaN, mirroring the training-time cleaning of the Year column.
    NA_Sales, EU_Sales, JP_Sales, Other_Sales : float or None
        Regional sales. ``None`` (e.g. a numeric input left blank) is treated
        as 0.0 instead of raising a TypeError.

    Returns
    -------
    float
        The model's prediction, rounded to 4 decimal places.
    """
    # A blank numeric field may arrive as None; treat it as "no sales".
    NA_Sales, EU_Sales, JP_Sales, Other_Sales = (
        0.0 if value is None else float(value)
        for value in (NA_Sales, EU_Sales, JP_Sales, Other_Sales)
    )

    # Same rule as the training data: implausible or missing years become NaN.
    if Year is None or not (1970 <= Year <= 2050):
        Year = np.nan

    total_sales = NA_Sales + EU_Sales + JP_Sales + Other_Sales

    # Epsilon keeps the share features finite when every regional value is 0.
    denominator = total_sales + 1e-6

    X_user = pd.DataFrame([{
        "Platform": Platform,
        "Year": Year,
        "Genre": Genre,
        "Publisher": Publisher,
        "NA_Sales": NA_Sales,
        "EU_Sales": EU_Sales,
        "JP_Sales": JP_Sales,
        "Other_Sales": Other_Sales,
        "Total_Regional_Sales": total_sales,
        "NA_ratio": NA_Sales / denominator,
        "EU_ratio": EU_Sales / denominator,
        "JP_ratio": JP_Sales / denominator,
        "Other_ratio": Other_Sales / denominator
    }])

    # Prediction
    pred = float(best_model.predict(X_user)[0])
    return round(pred, 4)

# Gradio front-end: three text fields and five numeric fields, passed to
# predict_sales in the order of its parameters; the result is shown as a number
demo = gr.Interface(
    title="Video Game Global Sales Prediction",
    description="Enter game details to predict Global Sales (millions).",
    fn=predict_sales,
    inputs=[
        gr.Textbox(label="Platform"),
        gr.Number(label="Year"),
        *(gr.Textbox(label=text_field) for text_field in ("Genre", "Publisher")),
        *(gr.Number(label=sales_field)
          for sales_field in ("NA_Sales", "EU_Sales", "JP_Sales", "Other_Sales")),
    ],
    outputs="number",
)

# Start the app
demo.launch()


It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://45ccedd80e24fdc6e3.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [30]:
# Ask apt to install the git client in this runtime via a shell command.
!apt-get install git

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git is already the newest version (1:2.34.1-1ubuntu1.15).
0 upgraded, 0 newly installed, 0 to remove and 1 not upgraded.


In [31]:
# Print the version of the installed git client.
!git --version


git version 2.34.1


In [None]:
# Set the global git identity (user name and e-mail) for this runtime.
# NOTE(review): a personal e-mail address is hardcoded here — confirm this is
# acceptable before sharing or publishing the notebook.
!git config --global user.name "AZIDAHAKA11"
!git config --global user.email "imafnanzarif@gmail.com"


In [32]:
# Change the working directory to the project folder.
# NOTE(review): 'path/to/your/project' looks like an unreplaced placeholder; the
# recorded output shows the command failing with "No such file or directory".
# TODO: substitute the real project path.
cd path/to/your/project

[Errno 2] No such file or directory: 'path/to/your/project'
/content
