In [1]:
import pandas as pd
import numpy as np
import re
from datetime import datetime
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OrdinalEncoder
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import VotingRegressor

In [2]:
# Load Data (Replace with your actual file paths)
train = pd.read_csv('/kaggle/input/predicting-mobile-game-success/predicting_mobile_game_success_train_set.csv')
test = pd.read_csv('/kaggle/input/test-set/samples_mobile_game_success_test_set.csv')

In [3]:
# --- 1. PREPROCESSING & FEATURE ENGINEERING FUNCTION ---
def process_data(df):
    df = df.copy()
    
    # --- A. Date Management ---
    # Convert to datetime
    df['Original Release Date'] = pd.to_datetime(df['Original Release Date'], format='%d/%m/%Y', errors='coerce')
    df['Current Version Release Date'] = pd.to_datetime(df['Current Version Release Date'], format='%d/%m/%Y', errors='coerce')
    
    # Create "Age" features (Reference date: use max date in dataset or today)
    ref_date = datetime.now()
    df['App_Age_Days'] = (ref_date - df['Original Release Date']).dt.days
    df['Days_Since_Update'] = (ref_date - df['Current Version Release Date']).dt.days
    df['Update_Lag'] = (df['Current Version Release Date'] - df['Original Release Date']).dt.days
    
    # --- B. List & String Parsing ---
    
    # Languages: Count number of languages supported
    df['Lang_Count'] = df['Languages'].apply(lambda x: len(str(x).split(',')) if pd.notnull(x) else 0)
    
    # Genres: Count number of genres
    df['Genre_Count'] = df['Genres'].apply(lambda x: len(str(x).split(',')) if pd.notnull(x) else 0)
    
    # In-app Purchases: Extract Min, Max, and Mean prices
    def extract_iap_stats(iap_str):
        if pd.isnull(iap_str) or iap_str == []:
            return 0, 0, 0, 0
        try:
            # Assuming format is like "1.99, 2.99, 0.99"
            prices = [float(p.strip()) for p in str(iap_str).split(',')]
            return len(prices), np.mean(prices), np.max(prices), np.min(prices)
        except:
            return 0, 0, 0, 0

    iap_stats = df['In-app Purchases'].apply(extract_iap_stats)
    df['IAP_Count'] = iap_stats.apply(lambda x: x[0])
    df['IAP_Mean'] = iap_stats.apply(lambda x: x[1])
    df['IAP_Max'] = iap_stats.apply(lambda x: x[2])
    
    # --- C. Text Meta-Features ---
    # Length of Description, Name, Subtitle (Longer descriptions often mean better SEO)
    df['Desc_Len'] = df['Description'].fillna('').apply(len)
    df['Name_Len'] = df['Name'].fillna('').apply(len)
    df['Subtitle_Len'] = df['Subtitle'].fillna('').apply(len)
    
    # --- D. Numerical Transforms ---
    # Size is usually huge (bytes), log transform normalizes it
    df['Size_Log'] = np.log1p(df['Size'])
    
    # --- E. Categorical Encoding ---
    # Simplified Age Rating (extract number)
    df['Age_Rating_Num'] = df['Age Rating'].str.extract('(\d+)').astype(float)
    
    # Factorize Primary Genre (Turn string into ID)
    df['Primary_Genre_Code'] = pd.factorize(df['Primary Genre'])[0]
    
    # Drop unused or raw columns
    drop_cols = ['ID', 'Name', 'Subtitle', 'In-app Purchases', 'Description', 
                 'Developer', 'Languages', 'Size', 'Genres', 'Original Release Date', 
                 'Current Version Release Date', 'Age Rating', 'Primary Genre', 'URL', 'Icon URL','Unnamed: 18']
    
    # Keep only columns that exist (to avoid errors if cols are missing)
    cols_to_drop = [c for c in drop_cols if c in df.columns]
    df = df.drop(columns=cols_to_drop)

    

    
    return df






  df['Age_Rating_Num'] = df['Age Rating'].str.extract('(\d+)').astype(float)


In [6]:
train.columns

Index(['URL', 'ID', 'Name', 'Subtitle', 'Icon URL', 'Average User Rating',
       'User Rating Count', 'Price', 'In-app Purchases', 'Description',
       'Developer', 'Age Rating', 'Languages', 'Size', 'Primary Genre',
       'Genres', 'Original Release Date', 'Current Version Release Date',
       'Unnamed: 18'],
      dtype='object')

In [4]:
# --- 2. EXECUTION ---
train= train.drop_duplicates()
df_train = train.copy()
df_train = df_train[df_train['Average User Rating'].notna()]
df_test = test.copy()

# Separate Target
target = 'Average User Rating'
y = df_train[target]
X = df_train.drop(columns=[target])

# Process Data
X_processed = process_data(X)
test_processed = process_data(df_test)


In [9]:
X_processed.shape, y.shape

((6453, 16), (6453,))

In [10]:
X_processed.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6453 entries, 0 to 13597
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   User Rating Count   6453 non-null   float64
 1   Price               6453 non-null   float64
 2   App_Age_Days        6453 non-null   int64  
 3   Days_Since_Update   6453 non-null   int64  
 4   Update_Lag          6453 non-null   int64  
 5   Lang_Count          6453 non-null   int64  
 6   Genre_Count         6453 non-null   int64  
 7   IAP_Count           6453 non-null   int64  
 8   IAP_Mean            6453 non-null   float64
 9   IAP_Max             6453 non-null   float64
 10  Desc_Len            6453 non-null   int64  
 11  Name_Len            6453 non-null   int64  
 12  Subtitle_Len        6453 non-null   int64  
 13  Size_Log            6453 non-null   float64
 14  Age_Rating_Num      6453 non-null   float64
 15  Primary_Genre_Code  6453 non-null   int64  
dtypes: float64

In [16]:
# fine tuning parameters for better performance

import optuna
import re
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score



# Basic Preprocessing
def simple_process(df):
    df['Size_Log'] = np.log1p(df['Size'])
    df['Desc_Len'] = df['Description'].fillna('').apply(len)
    df['Lang_Count'] = df['Languages'].apply(lambda x: len(str(x).split(',')) if pd.notnull(x) else 0)
    
    # Date Engineering
    df['Original Release Date'] = pd.to_datetime(df['Original Release Date'], format='%d/%m/%Y', errors='coerce')
    df['Current Version Release Date'] = pd.to_datetime(df['Current Version Release Date'], format='%d/%m/%Y', errors='coerce')
    ref_date = pd.to_datetime('today')
    df['Days_Since_Update'] = (ref_date - df['Current Version Release Date']).dt.days
    df['App_Age_Days'] = (ref_date - df['Original Release Date']).dt.days
    
    # Select only numeric
    df = df.select_dtypes(include=['number', 'bool'])
    df = df.rename(columns = lambda x: re.sub('[^A-Za-z0-9_]+', '', x)) # Clean names
    return df.fillna(0)

X = simple_process(df_train.drop(columns=['Average User Rating']))
y = df_train['Average User Rating']

# --- 2. OPTUNA OPTIMIZATION ---
def objective(trial):
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 500, 2000),
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.1),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
        'n_jobs': -1,
        'random_state': 42
    }
    
    model = XGBRegressor(**param)
    scores = cross_val_score(model, X, y, cv=3, scoring='neg_root_mean_squared_error')
    return -scores.mean()

print("Starting Optimization... (This might take 5-10 mins)")
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=30)

print("\n------------------------------------------------")
print("ðŸŽ‰ BEST RMSE FOUND:", study.best_value)
print("ðŸ‘‰ COPY THESE PARAMS FOR SCRIPT 3:")
print(study.best_params)
print("------------------------------------------------")

[I 2026-02-12 22:31:46,592] A new study created in memory with name: no-name-e5dd647b-d345-41b2-85b3-9a86c1cd70ec


Starting Optimization... (This might take 5-10 mins)


[I 2026-02-12 22:31:49,465] Trial 0 finished with value: 0.825884544727197 and parameters: {'n_estimators': 1327, 'learning_rate': 0.07146672552192151, 'max_depth': 4, 'subsample': 0.856941526529625, 'colsample_bytree': 0.8272619625768358, 'reg_alpha': 0.028247548069526042, 'reg_lambda': 3.293614017870211e-08}. Best is trial 0 with value: 0.825884544727197.
[I 2026-02-12 22:32:01,393] Trial 1 finished with value: 0.7698001650142107 and parameters: {'n_estimators': 918, 'learning_rate': 0.09201683503152455, 'max_depth': 10, 'subsample': 0.7175874000277433, 'colsample_bytree': 0.7704577830123465, 'reg_alpha': 2.8662580037508315e-08, 'reg_lambda': 1.505688331889684}. Best is trial 1 with value: 0.7698001650142107.
[I 2026-02-12 22:32:10,189] Trial 2 finished with value: 0.7832749955374924 and parameters: {'n_estimators': 691, 'learning_rate': 0.08552195261158697, 'max_depth': 9, 'subsample': 0.8261557445004611, 'colsample_bytree': 0.8118844754695834, 'reg_alpha': 0.021406661641401727, 're


------------------------------------------------
ðŸŽ‰ BEST RMSE FOUND: 0.7339744629937727
ðŸ‘‰ COPY THESE PARAMS FOR SCRIPT 3:
{'n_estimators': 584, 'learning_rate': 0.006671349652208579, 'max_depth': 7, 'subsample': 0.9748445200398398, 'colsample_bytree': 0.9237384669421154, 'reg_alpha': 6.360719721858602, 'reg_lambda': 1.836328077600074e-08}
------------------------------------------------


In [5]:
# --- 3. MODELING (Ensemble) ---

# Initialize Models
xgb = XGBRegressor(
    n_estimators=1000, 
    learning_rate=0.01, 
    max_depth=3, 
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)

lgbm = LGBMRegressor(
    n_estimators=1000,
    learning_rate=0.01,
    num_leaves=31,
    random_state=42,
    n_jobs=-1,
    verbose=-1
)

# Ensemble Voting Regressor (Averages the predictions of both)
ensemble = VotingRegressor([('xgb', xgb), ('lgbm', lgbm)])

In [6]:
# --- 4. VALIDATION ---
# Use Cross-Validation to get a realistic RMSE
scores = cross_val_score(ensemble, X_processed, y, scoring='neg_root_mean_squared_error', cv=5)
print(f"Average RMSE: {-scores.mean():.4f}")

# --- 5. FINAL TRAINING & SUBMISSION ---
ensemble.fit(X_processed, y)
preds = ensemble.predict(test_processed)

# Post-processing: Clip predictions to be between 0 and 5
preds = np.clip(preds, 0.0, 5.0)

# Create submission
submission = pd.DataFrame({'ID': df_test['ID'], 'Average User Rating': preds})
submission.to_csv('submission.csv', index=False)

Average RMSE: 0.7165


In [7]:
# random forest
from sklearn.ensemble import RandomForestRegressor


rf_model = RandomForestRegressor(
    n_estimators=1000,
    max_depth=10,
    random_state=42,
    n_jobs=-1
)

scores_rf = cross_val_score(rf_model, X_processed, y, scoring='neg_root_mean_squared_error', cv=5)
print(f"Random Forest Average RMSE: {-scores_rf.mean():.4f}")

rf_model.fit(X_processed, y)
rf_preds = rf_model.predict(test_processed)
rf_preds = np.clip(rf_preds, 0.0, 5.0)
submission_rf = pd.DataFrame({'ID': df_test['ID'], 'Average User Rating': rf_preds})
submission_rf.to_csv('submission_rf.csv', index=False)


Random Forest Average RMSE: 0.7146


In [8]:
# random forest
from sklearn.ensemble import RandomForestRegressor


rf_model = RandomForestRegressor(
    n_estimators=3000,
    max_depth=20,
    random_state=42,
    n_jobs=-1
)

scores_rf = cross_val_score(rf_model, X_processed, y, scoring='neg_root_mean_squared_error', cv=5)
print(f"Random Forest Average RMSE: {-scores_rf.mean():.4f}")

rf_model.fit(X_processed, y)
rf_preds = rf_model.predict(test_processed)
rf_preds = np.clip(rf_preds, 0.0, 5.0)
submission_rf = pd.DataFrame({'ID': df_test['ID'], 'Average User Rating': rf_preds})
submission_rf.to_csv('submission_rf.csv', index=False)


Random Forest Average RMSE: 0.7150


In [10]:
# decisin tree
from sklearn.tree import DecisionTreeRegressor
dt_model = DecisionTreeRegressor(
    max_depth=10,
    random_state=42
)
scores_dt = cross_val_score(dt_model, X_processed, y, scoring='neg_root_mean_squared_error', cv=5)
print(f"Decision Tree Average RMSE: {-scores_dt.mean():.4f}")

dt_model.fit(X_processed, y)
dt_preds = dt_model.predict(test_processed)
dt_preds = np.clip(dt_preds, 0.0, 5.0)
submission_dt = pd.DataFrame({'ID': df_test['ID'], 'Average User Rating': dt_preds})
submission_dt.to_csv('submission_dt.csv', index=False)

Decision Tree Average RMSE: 0.8329


In [12]:
# fix Nans in test set
test_processed = test_processed.fillna(0)

In [13]:
# SVR
from sklearn.svm import SVR
svr_model = SVR(kernel='rbf', C=1.0, epsilon=0.1)
scores_svr = cross_val_score(svr_model, X_processed, y, scoring='neg_root_mean_squared_error', cv=5)
print(f"SVR Average RMSE: {-scores_svr.mean():.4f}")
svr_model.fit(X_processed, y)
svr_preds = svr_model.predict(test_processed)
svr_preds = np.clip(svr_preds, 0.0, 5.0)
submission_svr = pd.DataFrame({'ID': df_test['ID'], 'Average User Rating': svr_preds})
submission_svr.to_csv('submission_svr.csv', index=False)

SVR Average RMSE: 0.7485


In [14]:
# linear regression
from sklearn.linear_model import LinearRegression
lr_model = LinearRegression()
scores_lr = cross_val_score(lr_model, X_processed, y, scoring='neg_root_mean_squared_error', cv=5)
print(f"Linear Regression Average RMSE: {-scores_lr.mean():.4f}")
lr_model.fit(X_processed, y)
lr_preds = lr_model.predict(test_processed)
lr_preds = np.clip(lr_preds, 0.0, 5.0)
submission_lr = pd.DataFrame({'ID': df_test['ID'], 'Average User Rating': lr_preds})
submission_lr.to_csv('submission_lr.csv', index=False)


Linear Regression Average RMSE: 0.7285


In [15]:
# polynomial regression
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=3, interaction_only=True, include_bias=False)
X_poly = poly.fit_transform(X_processed)
from sklearn.linear_model import LinearRegression
poly_model = LinearRegression()
scores_poly = cross_val_score(poly_model, X_poly, y, scoring='neg_root_mean_squared_error', cv=5)
print(f"Polynomial Regression Average RMSE: {-scores_poly.mean():.4f}")
poly_model.fit(X_poly, y)
test_poly = poly.transform(test_processed)
poly_preds = poly_model.predict(test_poly)
poly_preds = np.clip(poly_preds, 0.0, 5.0)
submission_poly = pd.DataFrame({'ID': df_test['ID'], 'Average User Rating': poly_preds})
submission_poly.to_csv('submission_poly.csv', index=False)

Polynomial Regression Average RMSE: 5.5078


In [16]:
# save modles in pickle format
import pickle
with open('xgb_model.pkl', 'wb') as f:
    pickle.dump(xgb, f)
with open('lgbm_model.pkl', 'wb') as f:
    pickle.dump(lgbm, f)
with open('rf_model.pkl', 'wb') as f:
    pickle.dump(rf_model, f)
with open('dt_model.pkl', 'wb') as f:
    pickle.dump(dt_model, f)
with open('svr_model.pkl', 'wb') as f:
    pickle.dump(svr_model, f)
with open('lr_model.pkl', 'wb') as f:
    pickle.dump(lr_model, f)
with open('poly_model.pkl', 'wb') as f:
    pickle.dump(poly_model, f)
