# Initial Imports

In [1]:
#imports.
import os
import pandas as pd
import numpy as np
from path import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Read in Data

In [3]:
# Loading data
file_path = Path("powerball_dataset_clean.csv")
df_pb = pd.read_csv(file_path)
df_pb.head()

Unnamed: 0,DrawDate,WB1,WB2,WB3,WB4,WB5,PB,PP,Date,Powerball Sales,PowerPlay Sales,Jackpot(Millions),Jackpot
0,2017/03/15,30,16,41,48,53,16,3,2017/03/15,29851224.0,2995984.0,123.0,123000000.0
1,2017/03/11,57,41,50,26,1,11,2,2017/03/11,32550214.0,3344208.0,104.0,104000000.0
2,2017/03/08,42,23,59,46,33,4,2,2017/03/08,26936478.0,2847616.0,85.0,85000000.0
3,2017/03/04,2,22,63,19,18,19,3,2017/03/04,28868654.0,3154145.0,68.0,68000000.0
4,2017/03/01,16,10,52,40,55,17,10,2017/03/01,25012404.0,2648030.0,53.0,53000000.0


In [14]:
#drop repeated date column
df_pb2=df_pb.drop("Date",axis=1)
df_pb2.head()

Unnamed: 0,DrawDate,WB1,WB2,WB3,WB4,WB5,PB,PP,Powerball Sales,PowerPlay Sales,Jackpot(Millions),Jackpot
0,2017/03/15,30,16,41,48,53,16,3,29851224.0,2995984.0,123.0,123000000.0
1,2017/03/11,57,41,50,26,1,11,2,32550214.0,3344208.0,104.0,104000000.0
2,2017/03/08,42,23,59,46,33,4,2,26936478.0,2847616.0,85.0,85000000.0
3,2017/03/04,2,22,63,19,18,19,3,28868654.0,3154145.0,68.0,68000000.0
4,2017/03/01,16,10,52,40,55,17,10,25012404.0,2648030.0,53.0,53000000.0


# Add in Collumns

In [22]:
#power ball prediction
df_pbp = df_pb2

In [23]:
#convert date
df_pbp['DrawDate']=pd.to_datetime(df_pbp['DrawDate'])

In [24]:
#add day of week column
df_pbp['dow']=df_pbp['DrawDate'].dt.dayofweek

In [25]:
#add previous draw stats
df_pbp['PWB1']=df_pbp['WB1'].shift(-1) #previous winning ball 1
df_pbp['PPB']=df_pbp['PB'].shift(-1) #previous power ball
df_pbp.dropna(inplace=True) 

In [26]:
#convert to integer
df_pbp['PWB1']=df_pbp['PWB1'].astype('int')
df_pbp['PPB']=df_pbp['PPB'].astype('int')

In [27]:
df_pbp

Unnamed: 0,DrawDate,WB1,WB2,WB3,WB4,WB5,PB,PP,Powerball Sales,PowerPlay Sales,Jackpot(Millions),Jackpot,dow,PWB1,PPB
0,2017-03-15,30,16,41,48,53,16,3,29851224.0,2995984.0,123.0,123000000.0,2,57,11
1,2017-03-11,57,41,50,26,1,11,2,32550214.0,3344208.0,104.0,104000000.0,5,42,4
2,2017-03-08,42,23,59,46,33,4,2,26936478.0,2847616.0,85.0,85000000.0,2,2,19
3,2017-03-04,2,22,63,19,18,19,3,28868654.0,3154145.0,68.0,68000000.0,5,16,17
4,2017-03-01,16,10,52,40,55,17,10,25012404.0,2648030.0,53.0,53000000.0,2,6,19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2013,1997-11-22,22,31,3,7,14,2,0,8318146.0,0.0,22.0,22000000.0,5,19,22
2014,1997-11-19,19,10,48,3,21,22,0,6676271.0,0.0,18.0,18000000.0,2,32,8
2015,1997-11-15,32,7,45,21,26,8,0,7501534.0,0.0,16.0,16000000.0,5,41,1
2016,1997-11-12,41,14,13,29,16,1,0,6216170.0,0.0,14.0,14000000.0,2,40,37


In [37]:
#unique data
df_pbp['PB'].unique()

array([16, 11,  4, 19, 17,  2, 20,  5, 25, 10, 22, 12, 13, 24,  9, 21,  3,
       15,  7,  8, 23,  1, 14,  6, 26, 18, 27, 28, 35, 33, 29, 32, 30, 34,
       31, 37, 36, 38, 39, 41, 40, 42], dtype=int64)

In [38]:
#amount of unique data types
len(df_pbp['PB'].unique())

42

# Train the Model

In [39]:
X=df_pbp[['dow','PWB1','PPB']]
y = df_pbp['PB'] #target

In [40]:
# Splitting into Train and Test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [41]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [42]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=len(X)+1, random_state=78) 

In [43]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [44]:
# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)

In [45]:
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions,)


In [46]:
len(y_test.unique())

42

In [47]:
len(X_test_scaled)

505

In [48]:
len(y_test),len(predictions)

(505, 505)

In [49]:
#accurate predictions
pred_eval = pd.DataFrame({
    'y_test':y_test,
    'y_pred':predictions
})

In [50]:
pred_eval

Unnamed: 0,y_test,y_pred
317,4,38
979,10,13
1867,37,36
1066,34,29
1043,32,21
...,...,...
102,21,28
1895,13,31
508,29,5
88,3,34


In [51]:
pred_eval['eval']=pred_eval['y_pred']-pred_eval['y_test']

In [52]:
pred_eval

Unnamed: 0,y_test,y_pred,eval
317,4,38,34
979,10,13,3
1867,37,36,-1
1066,34,29,-5
1043,32,21,-11
...,...,...,...
102,21,28,7
1895,13,31,18
508,29,5,-24
88,3,34,31


In [53]:
# accurate prediction
pred_eval[pred_eval['eval']==0]

Unnamed: 0,y_test,y_pred,eval
272,29,29,0
1106,15,15,0
1672,22,22,0
19,10,10,0
729,8,8,0
682,23,23,0
374,32,32,0
1280,1,1,0
335,8,8,0


In [54]:
pred_eval[pred_eval['eval']==0].count()

y_test    9
y_pred    9
eval      9
dtype: int64

# Random Model

In [55]:
# random model 
random_lotto=np.random.randint(1,70,size=len(predictions))

In [56]:
pred_eval['rm'] = random_lotto

In [57]:
pred_eval['rm_eval'] = pred_eval['rm']-pred_eval['y_test']

In [58]:
#accurate predictions using random
pred_eval[pred_eval['rm_eval']==0]

Unnamed: 0,y_test,y_pred,eval,rm,rm_eval
1322,10,35,25,10,0
61,6,12,6,6,0
1879,13,8,-5,13,0
1483,30,14,-16,30,0
581,4,3,-1,4,0
966,41,35,-6,41,0


In [59]:
pred_eval[pred_eval['rm_eval']==0].count()

y_test     6
y_pred     6
eval       6
rm         6
rm_eval    6
dtype: int64