In [None]:
# Mount Google Drive at /content/drive; the data directory used below lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install catboost
!pip install scikit-learn
!pip install dask_ml

Collecting catboost
  Downloading catboost-1.2.3-cp310-cp310-manylinux2014_x86_64.whl (98.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.5/98.5 MB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.3
Collecting dask_ml
  Downloading dask_ml-2024.3.20-py3-none-any.whl (148 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m148.9/148.9 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
Collecting dask-glm>=0.2.0 (from dask_ml)
  Downloading dask_glm-0.3.2-py2.py3-none-any.whl (13 kB)
Collecting sparse>=0.7.0 (from dask-glm>=0.2.0->dask_ml)
  Downloading sparse-0.15.1-py2.py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sparse, dask-glm, dask_ml
Successfully installed dask-glm-0.3.2 dask_ml-2024.3.20 sparse-0.15.1


### **1. IMPORT LIBRARY**

In [None]:
# Third-party library imports
import pandas as pd
import numpy as np
import dask.dataframe as dd
from dask_ml.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from statistics import mean
from sklearn.metrics import r2_score

# Third-party library models imports
from catboost import CatBoostRegressor
import xgboost as xgb
import lightgbm as lgbm
from sklearn.linear_model import Ridge

### **2. ACQUIRE DATA**
- **Note**: Change `file_path` below (originally `D:\Team EcoByte Data-Driven\`) to the directory that contains your data.

In [None]:
# Base directory for the input CSV and every output file of this notebook.
# Paths are built by plain string concatenation, so the trailing slash is required.
file_path = r"/content/drive/MyDrive/Data/"

In [None]:
# Open the preprocessed dataset as a dask DataFrame; rows are only materialised by later .compute() calls.
data = dd.read_csv(file_path + 'ProcessedData.csv')

In [None]:
# Accuracy log, opened in append mode so results of earlier runs are kept.
f = open(file_path + 'Accuracy.txt', "a")

In [None]:
# Label of this run: written as the heading of the accuracy log entry and used as the
# file name of the feature-importance CSV.
test_case = "Test_run_101"

### **3. CLASS**

In [None]:
class EarlyStoppingRegressor(BaseEstimator, RegressorMixin):
    """Fit fresh clones of a base regressor on successive row batches and keep the best one.

    The training rows are walked in windows of ``sample_size`` rows. On every
    window up to ``_FITS_PER_BATCH`` independent clones of ``estimator`` are
    fitted from scratch and scored on the validation set with the clone's own
    ``score`` method (assumed to be "higher is better", e.g. R² for
    scikit-learn style regressors). Training stops as soon as
    ``early_stopping_rounds`` consecutive fits fail to improve on the best
    validation score seen so far.

    Parameters
    ----------
    estimator : object
        Unfitted regressor that works with ``sklearn.base.clone`` and offers
        ``fit``, ``predict`` and ``score``.
    early_stopping_rounds : int, default=2
        Number of consecutive non-improving fits tolerated before training stops.
    train_size : int, default=0
        Number of leading rows of ``X`` to train on; ``0`` means "all rows of X".
    sample_size : int, default=500000
        Number of rows per training batch.
    **kwargs
        Accepted for backward compatibility and ignored.

    Attributes
    ----------
    estimators_ : list
        Every clone fitted during the last ``fit`` call, in fitting order.
    best_estimator_ : object or None
        The clone with the highest validation score; used by ``predict``,
        ``get_feature_importance`` and ``get_features``.
    """

    # Number of clones fitted on each batch before moving to the next batch.
    _FITS_PER_BATCH = 3

    def __init__(self, estimator, early_stopping_rounds=2, train_size = 0, sample_size = 500000, **kwargs):
        self.estimator = estimator
        self.early_stopping_rounds = early_stopping_rounds
        self.train_size = train_size
        self.sample_size = sample_size
        # NOTE: no ``self.score`` attribute is set here; doing so would shadow
        # the ``score`` method inherited from RegressorMixin.
        self.estimators_ = []
        self.best_estimator_ = None

    def fit(self, X, y, X_val, y_val):
        """Train on batches of ``(X, y)`` and select the best clone on ``(X_val, y_val)``.

        Parameters
        ----------
        X, y : sliceable containers
            Training features and targets; sliced row-wise via ``split_sample_size``.
        X_val, y_val : array-like
            Validation data passed to each clone's ``score`` method.

        Returns
        -------
        self
        """
        self.estimators_ = []
        self.best_estimator_ = None
        best_score = float("-inf")
        rounds_without_improvement = 0
        # train_size == 0 (the constructor default) would otherwise train nothing.
        n_rows = self.train_size if self.train_size else len(X)
        for start in range(0, n_rows, self.sample_size):
            X_sample, y_sample = split_sample_size(X, y, start, self.sample_size)
            if len(X_sample) == 0:
                # train_size exceeded the rows actually available.
                break
            for _ in range(self._FITS_PER_BATCH):
                estimator = clone(self.estimator)
                estimator.fit(X_sample, y_sample)
                self.estimators_.append(estimator)
                val_score = estimator.score(X_val, y_val)
                # Higher score is better, so an improvement is a strictly larger value.
                if val_score > best_score:
                    best_score = val_score
                    self.best_estimator_ = estimator
                    rounds_without_improvement = 0
                else:
                    rounds_without_improvement += 1
                if rounds_without_improvement >= self.early_stopping_rounds:
                    # Patience exhausted: stop all training, not just the current batch.
                    return self
        return self

    def _selected_estimator(self):
        """Return the best fitted clone, falling back to the most recent one."""
        best = getattr(self, "best_estimator_", None)
        if best is not None:
            return best
        return self.estimators_[-1]

    def predict(self, X):
        """Predict with the clone that achieved the best validation score."""
        return self._selected_estimator().predict(X)

    def get_feature_importance(self, model):
        """Return per-feature weights of the selected clone, rounded to two decimals.

        Parameters
        ----------
        model : str
            ``'linear'`` reads ``coef_`` (first target row); any other value
            reads ``feature_importances_`` and rescales it to sum to 1 when
            the rounded values do not already do so.
        """
        estimator = self._selected_estimator()
        if model == 'linear':
            # coef_ is 2-D for a 2-D target and 1-D for a 1-D target; handle both.
            return np.atleast_2d(estimator.coef_)[0].round(2)
        importances = np.asarray(estimator.feature_importances_).round(2)
        total = importances.sum()
        denominator = total.round(2)
        if total == 1 or denominator == 0:
            # Already normalised, or nothing to normalise by.
            return importances
        return (importances / denominator).round(2)

    def get_features(self, model):
        """Return the feature names recorded by the selected clone.

        Parameters
        ----------
        model : str
            ``'linear'`` reads ``feature_names_in_``. For any other value the
            attributes ``feature_names_``, ``feature_names_in_`` and
            ``feature_name_`` are tried in that order, because the attribute
            name differs between libraries.

        Raises
        ------
        AttributeError
            If the clone exposes none of the known attributes.
        """
        estimator = self._selected_estimator()
        if model == 'linear':
            return estimator.feature_names_in_
        for attribute in ('feature_names_', 'feature_names_in_', 'feature_name_'):
            names = getattr(estimator, attribute, None)
            if names is not None:
                return names
        raise AttributeError(
            f"{type(estimator).__name__} does not expose its feature names"
        )

### **3. FUNCTION**

In [None]:
def split_sample_size(X_train, y_train, i, batch_size):
    """Return the aligned window of ``batch_size`` items starting at offset ``i``.

    Both containers are cut with the same slice, so features and targets stay
    paired. A window that runs past the end is simply shorter (or empty).
    """
    window = slice(i, i + batch_size)
    return X_train[window], y_train[window]

### **4. MACHINE LEARNING**

In [None]:
# Model input columns, one per line. The order is kept as-is because it
# determines the column order of the feature matrix.
features_importance = [
    'AGE',
    'TENURE_IN_DAYS',
    'FOB_RACING_TURNOVER',
    'FOB_SPORT_TURNOVER',
    'PARI_RACING_TURNOVER',
    'PARI_SPORT_TURNOVER',
    'DIVIDENDS_PAID',
    'TICKETS',
    'IS_WEEKEND',
    'IS_WEEKDAY',
    'IS_YEAR',
    'IS_MONTH',
    'SEASON_ENCODE',
    'MALE',
    'FEMALE',
    'UNKNOWN',
    'GENDER_AGE_BAND',
    'IS_HOLIDAY',
    'IS_WA',
    'IS_OTH',
    'IS_YEAR_2021',
    'IS_YEAR_2022',
    'CUSTOMER_STATUS',
    'GROSS_MARGIN_STATUS',
    'IS_PLAY_FOB_RACING',
    'IS_PLAY_FOB',
    'IS_PLAY_PARI_RACING',
    'IS_WORLD_CUP',
    'IS_BET_DURING_WORLD_CUP',
    'AGE_BAND',
    "SEGMENT_DAY_OF_WEEK",
    'IS_SPRING',
    'IS_SUMMER',
    'IS_AUTUMN',
    'IS_WINTER',
    'IS_PLAY_SPORT',
    'IS_PLAY_RACING',
    'IS_PLAY_BET',
    'IS_PLAY_PARI_SPORT',
    'IS_PLAY_PARI',
    'IS_PLAY_FOB_SPORT',
    'DAY_OF_WEEK',
    'SEGMENT_GENDER_DAY_OF_WEEK_RESIDENTIAL',
    'Segment',
]

In [None]:
# Target column, kept as a one-element list so that selecting it from `data` yields a
# single-column frame rather than a series.
target_feature = ['TOTAL_TURNOVER']

#### **STEP 1: SEPARATE IMPORTANT FEATURES AND TARGET LABEL**

In [None]:
# Feature matrix and target, both still dask objects at this point.
X = data[features_importance]
y = data[target_feature]

#### **STEP 2: SPLIT DATA INTO TRAIN, VALIDATION AND TEST SETS**

In [None]:
# Hold out 20% of the rows as the test set; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1607)



In [None]:
# Split the remaining 80% again 80/20, i.e. roughly 64% train / 16% validation / 20% test overall.
# NOTE: this rebinds X_train / y_train, so re-running this cell alone shrinks the training set again.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=1607)

In [None]:
# Materialise the training split in memory; the validation and test splits stay lazy
# and are computed where they are used.
X_train = X_train.compute()
y_train = y_train.compute()

In [None]:
# Rows per training batch and total number of training rows. Both are passed to
# EarlyStoppingRegressor and reused when the training score is computed batch by batch.
sample_size = 500000
train_size = len(X_train)

#### **STEP 3: DEFINE PIPELINE**

In [None]:
# (name, model) pairs to train and evaluate. The name is also passed to
# get_feature_importance / get_features, where 'linear' selects the coef_-based branch.
# Only the Ridge model is active; the gradient-boosting entries below are disabled.
pipelines = [
    ('linear', EarlyStoppingRegressor(Ridge(alpha=0.1), train_size = train_size, sample_size = sample_size)),
    # ('lightgbm', EarlyStoppingRegressor(lgbm.LGBMRegressor(n_estimators = 100, learning_rate = 0.01, verbose=-1, objective='regression', metric='rmse', lambda_l2=0.1), train_size = train_size, sample_size = sample_size)),
    # ('xgboost', EarlyStoppingRegressor(xgb.XGBRegressor(n_estimators = 100, learning_rate = 0.01, verbosity=0, objective='reg:squarederror', reg_lambda=0.1),  train_size = train_size, sample_size = sample_size)),
    # ('catboost', EarlyStoppingRegressor(CatBoostRegressor(n_estimators=100, learning_rate = 0.01, verbose=False, l2_leaf_reg=3),  train_size = train_size, sample_size = sample_size))
]

#### **STEP 4: TRAIN AND EVALUATE EACH MODEL**

In [None]:
# Materialise the lazy validation split once; recomputing it inside the loop would
# re-run the whole dask graph for every model.
X_val_fit = X_val.compute()
y_val_fit = y_val.compute()

for name, pipeline in pipelines:
    print(f"Training {name}...")
    pipeline.fit(X_train, y_train, X_val_fit, y_val_fit)

Training linear...


In [None]:
# Materialise the lazy validation / test splits once instead of once per model.
X_val_eval, y_val_eval = X_val.compute(), y_val.compute()
X_test_eval, y_test_eval = X_test.compute(), y_test.compute()

# The log is opened here, in append mode, inside a context manager: the handle is
# closed even if scoring raises, and the cell can be re-run without hitting a
# closed file. Reusing the name `f` releases the handle opened earlier in the notebook.
with open(file_path + 'Accuracy.txt', "a") as f:
    f.write(f"{test_case}" + '\n')
    for name, pipeline in pipelines:
        # Training score: mean of the per-batch R² values over the same row
        # windows that were used for training.
        score_batch = []
        for i in range(0, train_size, sample_size):
            X_batch, y_batch = split_sample_size(X_train, y_train, i, sample_size)
            score_batch.append(r2_score(y_batch, pipeline.predict(X_batch)))
        score_train = mean(score_batch)
        score_val = r2_score(y_val_eval, pipeline.predict(X_val_eval))
        score_test = r2_score(y_test_eval, pipeline.predict(X_test_eval))
        f.write(f"- {name} accuracy on train: {score_train:.2}\n")
        f.write(f"- {name} accuracy on validation: {score_val:.2}\n")
        f.write(f"- {name} accuracy on test: {score_test:.2}\n")
    f.write('------------------------------------\n')

#### **STEP 5: ACQUIRE FEATURE IMPORTANCES**

In [None]:
# Start the table with the feature names reported by the first pipeline; they land in a
# column labelled 0.
feature_importance_df = pd.DataFrame(pipelines[0][1].get_features(pipelines[0][0]))

In [None]:
# Add one column per model, named after the pipeline, holding that model's importances.
for name, pipeline in pipelines:
  feature_importance_df[name] = pipeline.get_feature_importance(name)

In [None]:
# Give the feature-name column (created with the default label 0) a readable header.
feature_importance_df = feature_importance_df.rename(columns={0: 'Features'})

In [None]:
# Save the table next to the data, using the run label as the file name.
feature_importance_df.to_csv(file_path + f'{test_case}.csv', index=False)

#### **STEP 6: CALCULATE P-VALUES**