# Experiment in trying to predict a lottey number

Note: We know that is almost impossible BUT we can try to apply some Data Sciencey stuff and see what will happen

So... the plan is to use a quick model first like ARIMA and see the results and after that we will build a full scalle deep learning model!

In [1]:
# imports
import pandas as pd
import numpy as np
import pmdarima as pm
# import itertools
from pmdarima import auto_arima
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from src.utils import read_all_parquet_files
from src.model import Model

In [2]:
# dataset filename
DATASET_LOCAL_COPY = './static/data/'

# List of columns to forecast
COLUMNS_TO_FORECAST = ['Ball_1', 'Ball_2', 'Ball_3', 'Ball_4', 'Ball_5', 'Ball_Bonus']

In [14]:
# read in lottery games data 
all_games = read_all_parquet_files(DATASET_LOCAL_COPY)

# need to convert data 
all_games['Ball_1'] = all_games['Ball_1'].astype('float64')
all_games['Ball_2'] = all_games['Ball_2'].astype('float64')
all_games['Ball_3'] = all_games['Ball_3'].astype('float64')
all_games['Ball_4'] = all_games['Ball_4'].astype('float64')
all_games['Ball_5'] = all_games['Ball_5'].astype('float64')
all_games['Ball_Bonus'] = all_games['Ball_Bonus'].astype('float64')

# set index
all_games.set_index('Date', inplace=True)
all_games.index = pd.DatetimeIndex(all_games.index).to_period('3.5D')

In [15]:
all_games.describe(include='all')

Unnamed: 0,Ball_1,Ball_2,Ball_3,Ball_4,Ball_5,Ball_Bonus
count,599.0,599.0,599.0,599.0,599.0,599.0
mean,12.15025,23.277129,35.277129,47.193656,58.564274,13.310518
std,9.807723,11.853058,12.430543,11.949555,9.377023,7.712675
min,1.0,2.0,3.0,7.0,22.0,1.0
25%,4.0,15.0,26.0,39.0,54.0,6.0
50%,10.0,22.0,36.0,48.0,61.0,13.0
75%,18.0,31.0,44.0,57.0,66.0,20.0
max,52.0,58.0,64.0,68.0,69.0,26.0


## let us add additional features

In [16]:
# Calculate weights based on historical frequency
weights = {}
for column in COLUMNS_TO_FORECAST:
    number_counts = all_games[column].value_counts(normalize=True).to_dict()
    total_samples = len(all_games[column])
    column_weights = {number: count / total_samples for number, count in number_counts.items()}
    weights[column] = column_weights
    #all_games[f'{column}_weight'] = column_weights

In [17]:
t = weights.get('Ball_1')
t.get(3)

0.00010869534923258297

# Let us try to optimize our ARIMA model

The expected value range for the hyperparameters p, d, and q in an ARIMA model depends on the characteristics of your time series data. These hyperparameters determine the autoregressive (AR), differencing (I), and moving average (MA) components of the model, respectively. Here are some general guidelines for choosing values within these ranges:

p (AR order):

The autoregressive order (p) represents the number of lagged observations included in the model. It indicates how many past time points are used to predict the current value.
A typical range for p is 0 to 3. If the autocorrelation plot (ACF) shows significant values at lags 1, 2, or 3, you may consider these values for p.
Use the PACF plot to help determine the initial estimate for p.
d (I order):

The differencing order (d) indicates the number of times the time series needs to be differenced to achieve stationarity. It removes trends and seasonality from the data.
A common range for d is 0 to 2. If your data is already stationary, use d=0. If first-order differencing is required to make the data stationary, use d=1. In some cases, second-order differencing (d=2) may be necessary.
q (MA order):

The moving average order (q) represents the number of lagged forecast errors (residuals) included in the model. It captures the dependency on past forecast errors.
A typical range for q is 0 to 3. If the autocorrelation of the residuals (ACF of residuals) shows significant values at lags 1, 2, or 3, you may consider these values for q.
The ACF of residuals is used to determine the initial estimate for q.
These ranges are general guidelines and can vary depending on the specific characteristics of your time series. It's often a good practice to start with smaller values and incrementally increase them while monitoring the model's performance (e.g., using cross-validation or evaluation metrics) to avoid overfitting.

Keep in mind that the optimal values for p, d, and q may require experimentation, and automated tools like grid search or the auto_arima function in libraries such as pmdarima can help identify the best hyperparameters based on the data.

In [38]:
# Define the range of hyperparameters to search
sesonal = (True, False)
methods_list = {'newton','nm','bfgs','lbfgs'} #,'powell','cg','ncg','basinhopping'}
scoring_list = {'mse','mae'}

# 
best_model = None
best_order = None
best_mse = float('inf')

models_for_each_ball = {}

# Perform a grid search to find the best ARIMA model
for column in COLUMNS_TO_FORECAST:
    ts_data = all_games[column]

    for method in methods_list:
        for score in scoring_list:

            for season in sesonal:
                try:
                    # Use auto_arima to automatically select the best hyperparameters
                    model = auto_arima(ts_data,
                                    seasonal=season,
                                    trace=False,
                                    error_action='ignore', max_order=4,
                                    #method=method,
                                    suppress_warnings=True,
                                    order=best_order,
                                    model=best_model,
                                    out_of_sample_size=1,
                                    scoring=score)
                    best_order = model.get_params()['order']

                    # Fit the ARIMA model with the best hyperparameters
                    arima_model = ARIMA(ts_data, order=best_order)
                    #arima_fit = arima_model.fit()
                    # Fit the best model to the data
                    model_fit = arima_model.fit()
                    # Generate predictions
                    y_pred = model.predict(n_periods=len(ts_data))

                    # Calculate mean squared error (MSE) as the evaluation metric
                    mse = mean_squared_error(ts_data, y_pred)

                    if mse < best_mse:
                        # Create an instance of the Model class
                        my_model = Model()

                        # Set the name, model, and model_fit attributes
                        my_model.set_name(f'ARIMA Model for {column}')
                        my_model.set_model(model)
                        my_model.set_model_fit(model_fit)   
                        my_model.set_mse(mse)   
                        my_model.set_order(best_order) 

                        models_for_each_ball[column] = my_model

                except Exception as e:
                    print (f'Exception -> {e}')
                    continue

  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  ret

In [39]:
# List of trained ARIMA models
arima_models = []

# Loop through each column and fit ARIMA models
for column in COLUMNS_TO_FORECAST:
    ts_data = all_games[column]
    # get the best model from the previous step
    model = models_for_each_ball[column]
    arima_models.append(model)

    final_model = model.model
    final_model_fit = final_model.fit(ts_data, disp=0)

    # Forecast the next data point
    forecast = final_model_fit.predict(n_periods=len(ts_data),steps=1)

    print(f"Forecast for ('{column}'): {forecast[0]:.2f}")

  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(


Forecast for ('Ball_1'): 12.32
Forecast for ('Ball_2'): 23.28


  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(


Forecast for ('Ball_3'): 33.62
Forecast for ('Ball_4'): 46.08


  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(


Forecast for ('Ball_5'): 58.56
Forecast for ('Ball_Bonus'): 13.50


In [40]:
results = []
# Loop through each column and fit ARIMA models
for column in COLUMNS_TO_FORECAST:
    ts_data = all_games[column].astype(float)
    # Generate predictions from each ARIMA model
    result = 0
    for model in arima_models:
        final_model = model.model
        final_model_fit = final_model.fit(ts_data, disp=0)

        # Forecast the next data point
        forecast = final_model_fit.predict(n_periods=len(ts_data),steps=1)

        result += forecast[len(ts_data)-1]
    
    results.append(f'{column} -> {int(result/len(arima_models))} weight -> {weights.get(column).get(int(result/len(arima_models)))}')

  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return

In [42]:
for x in results:
    print (x)

Ball_1 -> 12 weight -> 5.016708426119214e-05
Ball_2 -> 23 weight -> 5.5741204734657925e-05
Ball_3 -> 35 weight -> 6.131532520812373e-05
Ball_4 -> 47 weight -> 5.2954144497925034e-05
Ball_5 -> 58 weight -> 6.410238544485662e-05
Ball_Bonus -> 13 weight -> 6.410238544485662e-05
