## **Import libraries**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

## **Import data**

In [2]:
raw_df = pd.read_csv('./data_source/quiniela_nacional.csv')

df_nacional = raw_df.copy()
df_nacional.head()

Unnamed: 0,lottery_date,quiniela,period,position,result
0,2014-11-01,nacional,primera,1,3940
1,2014-11-01,nacional,primera,2,6857
2,2014-11-01,nacional,primera,3,609
3,2014-11-01,nacional,primera,4,8018
4,2014-11-01,nacional,primera,5,3232


## **Explore and clean data**

In [3]:
# Check data types, null and total values.
df_nacional.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 191060 entries, 0 to 191059
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   lottery_date  191060 non-null  object
 1   quiniela      191060 non-null  object
 2   period        191060 non-null  object
 3   position      191060 non-null  int64 
 4   result        191060 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 7.3+ MB


### Fix data types
*Note: 'position' is categorical ordinal data, but to the effects of creating position groups -see betting options-, it will be treated as numerical data.*

In [4]:
# Modify 'lottery_date' to type DATE.
df_nacional['lottery_date'] = pd.to_datetime(df_nacional['lottery_date'])
# Parse 'result' as TEXT with 4 characters an leading zeros. This feature is categorical nominal data.
df_nacional['result'] = df_nacional['result'].astype(str).str.zfill(4)

df_nacional.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 191060 entries, 0 to 191059
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   lottery_date  191060 non-null  datetime64[ns]
 1   quiniela      191060 non-null  object        
 2   period        191060 non-null  object        
 3   position      191060 non-null  int64         
 4   result        191060 non-null  object        
dtypes: datetime64[ns](1), int64(1), object(3)
memory usage: 7.3+ MB


In [5]:
# Check leading zeros for 'result'
df_nacional.head()

Unnamed: 0,lottery_date,quiniela,period,position,result
0,2014-11-01,nacional,primera,1,3940
1,2014-11-01,nacional,primera,2,6857
2,2014-11-01,nacional,primera,3,609
3,2014-11-01,nacional,primera,4,8018
4,2014-11-01,nacional,primera,5,3232


### Split results by betting options
(1 digit, 2 digits, 3 digits, 4 digits)

In [6]:
# 4 digits
df_nacional.rename(columns= {'result': 'digits_4'}, inplace=True)
# 3 digits
df_nacional['digits_3'] = df_nacional['digits_4'].str[-3:]
# 2 digits
df_nacional['digits_2'] = df_nacional['digits_4'].str[-2:]
# 1 digit
df_nacional['digits_1'] = df_nacional['digits_4'].str[-1:]

df_nacional.head()

Unnamed: 0,lottery_date,quiniela,period,position,digits_4,digits_3,digits_2,digits_1
0,2014-11-01,nacional,primera,1,3940,940,40,0
1,2014-11-01,nacional,primera,2,6857,857,57,7
2,2014-11-01,nacional,primera,3,609,609,9,9
3,2014-11-01,nacional,primera,4,8018,18,18,8
4,2014-11-01,nacional,primera,5,3232,232,32,2


### Look at distributions

In [7]:
# Check the distrubtion of results
fig, (d1, d2, d3, d4) = plt.subplots(4, figsize=(15,10))
fig.suptitle('Results distribution by number of digits')

d1.set_title('1 Digit')
d1.hist(df_nacional['digits_1'])
d2.set_title('2 Digits')
d2.hist(df_nacional['digits_2'])
d3.set_title('3 Digits')
d3.hist(df_nacional['digits_3'])
d4.set_title('4 Digits')
d4.hist(df_nacional['digits_4'])

(array([20060., 19990., 19831., 19892., 19482., 19181., 19121., 18636.,
        18241., 16626.]),
 array([   0. ,  999.9, 1999.8, 2999.7, 3999.6, 4999.5, 5999.4, 6999.3,
        7999.2, 8999.1, 9999. ]),
 <BarContainer object of 10 artists>)

play:
a 1
a los 5
a los 10
a los 15
a los 20