This script shows how to make an URL request and extract data fast using pandas

The objective is to access lottery website and get all the results to identify some statistics.

In [1]:
import requests
import pandas as pd
import collections

#url = 'http://loterias.caixa.gov.br/wps/portal/loterias/landing/lotofacil/!ut/p/a1/04_Sj9CPykssy0xPLMnMz0vMAfGjzOLNDH0MPAzcDbz8vTxNDRy9_Y2NQ13CDA0sTIEKIoEKnN0dPUzMfQwMDEwsjAw8XZw8XMwtfQ0MPM2I02-AAzgaENIfrh-FqsQ9wBmoxN_FydLAGAgNTKEK8DkRrACPGwpyQyMMMj0VAcySpRM!/dl5/d5/L2dBISEvZ0FBIS9nQSEh/pw/Z7_HGK818G0K85260Q5OIRSC42046/res/id=historicoHTML/c=cacheLevelPage/=/'
url = 'https://servicebus2.caixa.gov.br/portaldeloterias/api/resultados?modalidade=Lotofácil'
# url = sys.argv[1]

The module requests gets the http response from the link, to get the information you use the atribute "text"

In [2]:
# verify=False should only be used for testing. This does not verify the SSL ceritificate of the URL.
# This was made to avoid requests.exceptions.SSLError problem on caixa website.
r = requests.get(url, verify=False)



In [3]:
r.text
r_text = r.text

The text came with lots of end of carriage and new line strings. This should be removed.

In [4]:
r_text = r.text.replace('\\r\\n', '').replace('"\r\n}', '').replace('{\r\n "html": "', '')

Now you need to convert it to a dataframe

In [5]:
df = pd.read_html(r_text)

Note: This function returns a list of dataframes if more things are found. So we need to extract the only dataframe available in this list.

In [6]:
type(df)
type(df[0])

df=df[0].copy()

In [7]:
df

Unnamed: 0,Concurso,Data_Sorteio,Bola1,Bola2,Bola3,Bola4,Bola5,Bola6,Bola7,Bola8,...,Ganhadores_12_Números,Ganhadores_11_Números,Valor_Rateio_15_Números,Valor_Rateio_14_Números,Valor_Rateio_13_Números,Valor_Rateio_12_Números,Valor_Rateio_11_Números,Acumulado_15_Números,Estimativa_Prêmio,Valor_Acumulado_Especial
0,1,29/09/2003,2.0,3.0,5.0,6.0,9.0,10.0,11.0,13.0,...,48807.0,257593.0,"R$49.765,82","R$689,84","R$10,00","R$4,00","R$2,00","R$0,00","R$0,00","R$0,00"
1,,BA,,,,,,,,,...,,,,,,,,,,
2,,PR,,,,,,,,,...,,,,,,,,,,
3,,SP,,,,,,,,,...,,,,,,,,,,
4,2,06/10/2003,1.0,4.0,5.0,6.0,7.0,9.0,11.0,12.0,...,81252.0,478188.0,"R$596.323,70","R$1.388,95","R$10,00","R$4,00","R$2,00","R$0,00","R$0,00","R$0,00"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11023,SETE LAGOAS,MG,,,,,,,,,...,,,,,,,,,,
11024,2726,28/01/2023,2.0,5.0,6.0,8.0,9.0,10.0,11.0,14.0,...,106657.0,609632.0,"R$552.720,01","R$1.573,21","R$25,00","R$10,00","R$5,00","R$0,00","R$1.500.000,00","R$44.259.802,20"
11025,SALVADOR,BA,,,,,,,,,...,,,,,,,,,,
11026,BRASILIA,DF,,,,,,,,,...,,,,,,,,,,


Several NaN values appeared caused by bad website formatting that causes null lines to be generated. To clear it you can simply filter these values.

In [8]:
df = df[df['Bola1'] == df['Bola1']]
df

Unnamed: 0,Concurso,Data_Sorteio,Bola1,Bola2,Bola3,Bola4,Bola5,Bola6,Bola7,Bola8,...,Ganhadores_12_Números,Ganhadores_11_Números,Valor_Rateio_15_Números,Valor_Rateio_14_Números,Valor_Rateio_13_Números,Valor_Rateio_12_Números,Valor_Rateio_11_Números,Acumulado_15_Números,Estimativa_Prêmio,Valor_Acumulado_Especial
0,1,29/09/2003,2.0,3.0,5.0,6.0,9.0,10.0,11.0,13.0,...,48807.0,257593.0,"R$49.765,82","R$689,84","R$10,00","R$4,00","R$2,00","R$0,00","R$0,00","R$0,00"
4,2,06/10/2003,1.0,4.0,5.0,6.0,7.0,9.0,11.0,12.0,...,81252.0,478188.0,"R$596.323,70","R$1.388,95","R$10,00","R$4,00","R$2,00","R$0,00","R$0,00","R$0,00"
6,3,13/10/2003,1.0,4.0,6.0,7.0,8.0,9.0,10.0,11.0,...,96244.0,608211.0,"R$400.623,70","R$2.173,36","R$10,00","R$4,00","R$2,00","R$0,00","R$0,00","R$0,00"
8,4,20/10/2003,1.0,2.0,4.0,5.0,8.0,10.0,12.0,13.0,...,123912.0,706657.0,"R$902.226,02","R$1.498,72","R$10,00","R$4,00","R$2,00","R$0,00","R$0,00","R$0,00"
10,5,27/10/2003,1.0,2.0,4.0,8.0,9.0,11.0,12.0,13.0,...,195636.0,860992.0,"R$380.017,55","R$687,49","R$10,00","R$4,00","R$2,00","R$0,00","R$0,00","R$0,00"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11005,2722,24/01/2023,1.0,2.0,3.0,4.0,5.0,6.0,8.0,12.0,...,116187.0,622824.0,"R$180.959,17","R$1.022,72","R$25,00","R$10,00","R$5,00","R$0,00","R$1.500.000,00","R$42.812.734,77"
11014,2723,25/01/2023,1.0,2.0,3.0,6.0,7.0,8.0,10.0,12.0,...,116159.0,622008.0,"R$1.572.955,34","R$1.717,78","R$25,00","R$10,00","R$5,00","R$0,00","R$1.500.000,00","R$43.193.288,49"
11016,2724,26/01/2023,1.0,3.0,5.0,6.0,8.0,12.0,13.0,14.0,...,130472.0,643095.0,"R$619.494,19","R$784,62","R$25,00","R$10,00","R$5,00","R$0,00","R$1.500.000,00","R$43.493.043,75"
11019,2725,27/01/2023,1.0,2.0,4.0,5.0,7.0,8.0,9.0,10.0,...,139276.0,747934.0,"R$377.777,04","R$1.388,45","R$25,00","R$10,00","R$5,00","R$0,00","R$1.500.000,00","R$43.858.634,45"


You can check if it is correct matching the number of rows and the last Concurso value. If they are the same, everything is correct.

Now we are interested in quantifying how many times an even, odd and prime number are drawn. So for each we create a list containing all even, odd and primes in range 1 to 25.

In [9]:
# List of all possible numbers.
possible_numbers = list(range(1, 26))

# These three functions are created to apply a filter to the list containing all possible numbers.
def is_even(number):
    if (number % 2) == 0:
        return True
    else:
        return False

def is_odd(number):
    if (number % 2) != 0:
        return True
    else:
        return False

def is_prime(number):
    for i in range(2, number):
        if (number % i) == 0:
            return False
        else:
            pass
    return True

# Then filter is used to create three lists containing even, odd and prime numbers.
even_numbers = list(filter(is_even, possible_numbers))
odd_numbers = list(filter(is_odd, possible_numbers))
prime_numbers = list(filter(is_prime, possible_numbers))

We create a list containing the columns to be evaluated.

In [10]:
number_columns = ['Bola1', 'Bola2', 'Bola3', 'Bola4', 'Bola5',
              'Bola6', 'Bola7', 'Bola8', 'Bola9', 'Bola10', 'Bola11', 'Bola12',
              'Bola13', 'Bola14', 'Bola15']

Finally the values from column 'Bola 1' to 'Bola25' are considered floats, but should be integers.

In [11]:
df[number_columns] = df[number_columns].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[number_columns] = df[number_columns].astype(int)


Now we iterate over every column to sum the frequency of unique values, odds, evens and primes.

In [12]:
comb = []
value_count = dict.fromkeys(possible_numbers, 0)

for column in number_columns:
    # For odds, evens and primes we apply filter functions and sum the total amount for each column
    # Then we create a string containing the combination of the sums as a unique string and append it to a list
    comb.append(str(df[column].apply(is_even).sum()) + 'even-' + str(df[column].apply(is_odd).sum()) + 'odds-'+str(df[column].apply(is_prime).sum())+'primes')
    # For number counting we apply a value counts to count for each unique value found.
    # Then an iteraction is made to sum every column unique value frequency into a dictionary of possible numbers.
    for key, value in df[column].value_counts().items():
        value_count[key] += value

Now we can count every equal combination using Counter. The we can create a dataframe from it and calculate the probability of each combination to happen.

In [13]:
counter = collections.Counter(comb)
result = pd.DataFrame(counter.items(), columns=['Combinacao', 'Frequencia'])
result['p_freq'] = result['Frequencia']/result['Frequencia'].sum()
result = result.sort_values(by='p_freq')

Finally we can sort the most frequent values

In [14]:
sorted_value_count = sorted(value_count.items(), key=lambda x:x[1])

In [15]:
most_frequent_value = sorted_value_count[-1]
least_frequent_value = sorted_value_count[0]

In [16]:
print(f'''
The most frequent value is: {most_frequent_value[0]}
The least frequent value is: {least_frequent_value[0]}
The most frequent combination of odds, evens and primes is: {result['Combinacao'].values[-1]} or {int((result['p_freq'].values[-1]*100)*100)/100}%
''')


The most frequent value is: 20
The least frequent value is: 8
The most frequent combination of odds, evens and primes is: 754even-1972odds-260primes or 6.66%

