In [125]:
import bs4
import datetime
import logging
import pandas
import re
import requests
import time

In [2]:
# Root of the site; request paths and scraped hrefs are appended to it.
baseurl = "https://www.juegosonce.es"
# Path of the first page requested.
start_endpoint = "/rascas-todos"

In [3]:
# extract the html; the timeout stops a stalled connection from hanging the notebook
res = requests.get(f"{baseurl}{start_endpoint}", timeout=30)
# stop here on an HTTP error status instead of parsing an error page
res.raise_for_status()
# convert to bs4 object
soup = bs4.BeautifulSoup(res.text, 'html.parser')

In [4]:
# extract the appropriate section
section = soup.find('section', class_ = 'modulorasca favorita')
# find() returns None when nothing matches; fail here with a clear message
# rather than with an AttributeError when the section is used later
if section is None:
    raise ValueError("no <section class='modulorasca favorita'> found on the landing page")

In [5]:
# every <li> carrying the 'rascaOrdenable' class inside the selected section
rascas = section.find_all('li', attrs={'class': 'rascaOrdenable'})

In [127]:
def extract_winnings_data(url, price, series_data = None, series_outcomes = None):
    """
    Download one game's page and locate the prize list for a price level.

    Every <h3> inside each <div class="contenido"> is scanned. A header
    matches when it quotes ``price`` followed by ' €'. The amount has to match
    as a whole number, so '5' does not match '0,5 €' and '1' does not match
    '10 €'. One trailing zero is accepted only for prices written with a
    decimal comma ('0,5' matches '0,50 €'). If several headers match, the last
    one wins.

    Parameters
    ----------
    url : str
        Address of the game's page.
    price : str
        Price as written on the site, e.g. '5' or '0,5'.
    series_data, series_outcomes : optional
        Values returned unchanged when no header matches.

    Returns
    -------
    tuple
        (text of the matching header, the <ul> sibling that follows it), or
        the fallback values when nothing matched.
    """
    print(f"Extracting winnings data from {url=} and {price=}")
    # the timeout keeps a stalled connection from blocking the caller forever
    res = requests.get(url, timeout=30)
    # convert to bs4 object
    soup = bs4.BeautifulSoup(res.text, 'html.parser')
    price_text = str(price)
    # only decimal prices may gain a trailing zero ('0,5' -> '0,50'); allowing
    # it for whole prices would make '1' match a '10 €' header
    trailing_zero = '0?' if ',' in price_text else ''
    # the lookbehind rejects amounts that merely end with the price ('0,5 €' for '5')
    price_pattern = re.compile(rf"(?<![\d.,]){re.escape(price_text)}{trailing_zero} €")
    # iterate through the section contents
    for contents in soup.find_all('div', class_='contenido'):
        # iterate the content headers
        for con in contents.find_all('h3'):
            print(f"  -- found header: {con.text}")
            # find the header that matches the price level
            if price_pattern.search(con.text):
                print(f"    -- found {price=} in {con.text=}")
                series_data = con.text
                series_outcomes = con.find_next_sibling('ul')
    if series_data is None:
        print(f"No winnings data found for {price=}")
    if series_outcomes is None:
        print(f"No winnings outcomes found for {price=}")
    return series_data, series_outcomes

def extract_scenario_header(header, price):
    """
    Reduce a winnings header to the ticket count it states.

    Removes the price (both the '<price> €' and '<price>0 €' spellings), the
    fixed Spanish wording, and every space, colon and period. Whatever text
    remains is returned as a string, e.g. '4000000'.
    """
    # fragments are removed in this exact order; the price goes first so the
    # later removal of spaces does not break the '<price> €' match
    fragments = (
        f"{price} €",
        f"{price}0 €",
        'Premios por cada serie de boletos de ',
        'con precio a',
        ' ',
        ':',
        '.',
    )
    remainder = header
    for fragment in fragments:
        remainder = remainder.replace(fragment, '')
    return remainder

def extract_scenario_pairs(content, discount_rate = .03):
    """
    Extracts the scenario pairs from the winnings unordered list

    Parameters
    ----------
    content
        Element whose ``find_all('li')`` yields the prize lines; each line
        must contain a <span> holding the prize amount.
    discount_rate : float
        Rate forwarded to ``extract_annuity_scenario`` for prizes paid
        "al año durante N años".

    Returns
    -------
    list of tuple
        One ``(occurrences, return)`` pair per <li>. ``occurrences`` is a
        digit string. ``return`` is a string with a period as decimal
        separator, or the value returned by ``extract_annuity_scenario`` for
        annuity prizes.
    """
    scenario_pairs = []
    for li in content.find_all('li'):
        # extract the line of text
        premium_text = li.get_text()
        # extract the premium value
        premium_return = li.find('span').get_text()
        # remove the premium value from the text
        premium_occurences = premium_text.replace(premium_return, '')
        # remove € sign, whitespaces and thousands separators from premium return
        premium_return = premium_return.replace('€','').replace(' ','').replace('.','')
        # convert decimal separator from comma to period
        premium_return = premium_return.replace(',','.')
        # check for annuities
        annuity = re.search(r"al año durante (\d{1,3}) años", premium_text)
        if annuity:
            logging.info('annuity found')
            # remove annuity info from the text
            premium_occurences = premium_occurences.replace(annuity.group(0), '')
            # replace the yearly payment with the annuity's present value,
            # honouring the discount rate chosen by the caller
            premium_return = extract_annuity_scenario(annuity, premium_return, discount_rate=discount_rate)
        # remove ' premios de ' from the text
        premium_occurences = premium_occurences.replace(' premios de ', '').replace(' premio de ', '')
        # remove whitespace and periods from text
        premium_occurences = premium_occurences.replace('.', '').replace(' ','')
        pair = (premium_occurences, premium_return)
        scenario_pairs.append(pair)
    return scenario_pairs

def extract_annuity_scenario(annuity, premium_return, discount_rate = 0.03):
    """
    Extracts annuity info and computes present value

    Parameters
    ----------
    annuity
        Match object whose ``group(1)`` is the number of yearly payments.
    premium_return : str or number
        Amount of one yearly payment, with a period as decimal separator.
    discount_rate : float
        Yearly rate used to discount the payments.

    Returns
    -------
    float
        ``pmt * (1 - (1 + r)**-n) / r``, or ``pmt * n`` when the rate is 0.
    """
    # float() rather than int() so payments with decimals ('1000.50') are accepted
    pmt = float(premium_return)
    # extract the annuity duration
    n = int(annuity.group(1))
    r = discount_rate
    if r == 0:
        # the closed form divides by r; with no discounting the value is the plain sum
        return pmt * n
    # present value becomes premium return
    return pmt * (1 - (1 + r)**-n) / r

In [128]:
rascadict = {}
for rasca in rascas:
    # attributes stored on the <li> element itself
    rasca_id = rasca.get('data-producto')
    rasca_name = rasca.get('data-name')
    # several prices are written as one string separated by ' - '
    rasca_price = rasca.get('data-price').split(' - ')
    rasca_multi_price = len(rasca_price) > 1

    # link to the game's own page, made absolute with the site root
    rasca_url = baseurl + rasca.find('a', class_='informacionrasca').get('href')

    # image source, start date and top prize are read from child elements
    rasca_img_url = rasca.find('img').get('src')
    rasca_start_date = rasca.find('span', class_='ocu').get('data-fecha')
    rasca_max_payout = rasca.find('p', class_='premio').get('data-premio')

    # one dictionary entry per price level of the game
    for i in rasca_price:
        # header text and prize list for this price, taken from the game's page
        winnings_data = extract_winnings_data(rasca_url, i)
        # pause one second between page requests
        time.sleep(1)
        rascadict[f"{rasca_id}_EUR{i}"] = dict(
            rasca_id=rasca_id,
            name=rasca_name,
            price=i,
            active=True,
            price_array=rasca_price,
            multi_price=rasca_multi_price,
            url=rasca_url,
            img_url=rasca_img_url,
            start_date=rasca_start_date,
            max_payout=rasca_max_payout,
            returns_text=winnings_data[0],
            returns_html=winnings_data[1],
        )

Extracting winnings data from url='https://www.juegosonce.es/rasca-enigma' and price='5'
  -- found header: Premios por cada serie de boletos de 4.000.000 con precio a 5 €:
    -- found price='5' in con.text='Premios por cada serie de boletos de 4.000.000 con precio a 5 €:'
Extracting winnings data from url='https://www.juegosonce.es/rasca-super-millones' and price='10'
  -- found header: Premios por cada serie de boletos de 4.000.000 con precio a 10 €:
    -- found price='10' in con.text='Premios por cada serie de boletos de 4.000.000 con precio a 10 €:'
Extracting winnings data from url='https://www.juegosonce.es/rasca-astro-premios' and price='2'
  -- found header: Premios por cada serie de boletos de 2.000.000 con precio a 2 €:
    -- found price='2' in con.text='Premios por cada serie de boletos de 2.000.000 con precio a 2 €:'
Extracting winnings data from url='https://www.juegosonce.es/rasca-mega-millonario' and price='10'
  -- found header: Premios por cada serie de boletos de 4

In [129]:
# number of entries collected; rascadict holds one entry per game and price level
len(rascadict)

81

In [135]:
scenariosdict = {}
for key, value in rascadict.items():
    # entries for which no prize table was found carry None here and cannot be evaluated
    if value['returns_text'] is None or value['returns_html'] is None:
        logging.warning(f"skipping {key}: no winnings data for {value['url']}")
        continue
    # extract and convert price to float (prices use a decimal comma); parsed once and reused
    price = float(value['price'].replace(',','.'))
    # extract the number of scenarios and scenario pairs from html
    scenarios = extract_scenario_header(value['returns_text'], value['price'])
    total_scenarios = int(scenarios)
    raw_scenario_pairs = extract_scenario_pairs(value['returns_html'])
    # subtract the price of the rasca from the return for each scenario
    logging.info(raw_scenario_pairs)
    clean_scenario_pairs = [(a, str(float(b) - price)) for a, b in raw_scenario_pairs]
    logging.info(clean_scenario_pairs)
    logging.info(value['url'])
    logging.info(value['price'])
    # compute the number of winning scenarios
    winning_scenarios = sum(int(a) for a, b in raw_scenario_pairs)
    # subtract winning from total scenarios to get the # of losing scenarios
    losing_scenarios = total_scenarios - winning_scenarios
    # add the losing scenario to the scenario pairs: the whole price is lost
    clean_scenario_pairs.append((str(losing_scenarios), str(-price)))
    # calculate the expected return as the probability-weighted sum of net outcomes
    expected_return = sum(int(a)/total_scenarios*float(b) for a, b in clean_scenario_pairs)

    logging.info(raw_scenario_pairs)
    logging.info(clean_scenario_pairs)
    logging.info(scenarios)
    logging.info(winning_scenarios)
    logging.info(losing_scenarios)
    logging.info(expected_return)

    # start_date is a millisecond Unix timestamp; utcfromtimestamp() is deprecated,
    # so convert with an explicit UTC zone and drop it to keep the naive UTC value
    start_date = datetime.datetime.fromtimestamp(
        int(value['start_date'])/1000, tz=datetime.timezone.utc
    ).replace(tzinfo=None)

    scenariosdict[key] = {
        'rasca_id': value['rasca_id'],
        'name': value['name'],
        'price': price,
        'active': value['active'],
        'price_array': value['price_array'],
        'multi_price': value['multi_price'],
        'url': value['url'],
        'img_url': value['img_url'],
        'start_date': start_date,
        'max_payout': value['max_payout'],
        'returns_text': value['returns_text'],
        'returns_html': value['returns_html'],
        'scenarios': scenarios,
        'winning_scenarios': winning_scenarios,
        'losing_scenarios': losing_scenarios,
        'raw_scenario_pairs': raw_scenario_pairs,
        'clean_scenario_pairs': clean_scenario_pairs,
        'expected_return': expected_return,
        'expected_return_per_euro': expected_return / price}

# one row per game and price level, one column per field
results = pandas.DataFrame(scenariosdict).T

### Most and least profitable scratchcards

After normalising the expected return by ticket price, the most profitable (that is, least unprofitable) scratchcards on the ONCE website are Mega Millonario and Rasca Platinum, each with an expected return of -0.26/€.

In the results shown below, the worst games to play are currently El Rincón del Duende and Lucky Summer, each with an expected return of -0.42/€.

In [136]:
# games ordered from highest to lowest expected return per euro
(
    results[['start_date', 'name', 'expected_return', 'expected_return_per_euro', 'url']]
    .sort_values(by='expected_return_per_euro', ascending=False)
)

Unnamed: 0,start_date,name,expected_return,expected_return_per_euro,url
EI6_EUR10,2024-01-23 23:00:00,Mega Millonario,-2.6,-0.26,https://www.juegosonce.es/rasca-mega-millonario
EH2_EUR10,2023-04-18 22:00:00,Rasca Platinum,-2.6,-0.26,https://www.juegosonce.es/rasca-platinum
EC6_EUR10,2024-01-09 23:00:00,Super Millones,-2.695625,-0.269562,https://www.juegosonce.es/rasca-super-millones
EF2_EUR10,2024-01-09 23:00:00,Slingo Pro,-2.8,-0.28,https://www.juegosonce.es/rasca-slingo-pro
EG6_EUR5,2023-01-24 23:00:00,X20,-1.525,-0.305,https://www.juegosonce.es/rasca-x20
...,...,...,...,...,...
"EG7_EUR0,5",2023-02-01 23:00:00,7 de la Suerte,-0.2075,-0.415,https://www.juegosonce.es/rasca-7-de-la-suerte
"EI9_EUR0,5",2024-03-05 23:00:00,La araña de la suerte,-0.2075,-0.415,https://www.juegosonce.es/rasca-arana-suerte
E64_04_EUR1,2024-06-18 22:00:00,Lucky Summer,-0.42,-0.42,https://www.juegosonce.es/rasca-lucky-summer
"EG3_EUR0,5",2022-12-13 23:00:00,El Rincón del Duende,-0.21,-0.42,https://www.juegosonce.es/rasca-rincon-duende


### More recent Scratchcards

You can find the most recent scratchcards by scraping the Unix timestamp of each game from the landing page.

Although untested, the more recent games are expected to have the highest return per euro, since all of the high-winning scenarios are still possible. Over time, the expected return should drop substantially as more prizes are claimed.

In [137]:
# games ordered from newest to oldest start date
(
    results[['start_date', 'name', 'expected_return', 'expected_return_per_euro', 'url']]
    .sort_values(by='start_date', ascending=False)
)

Unnamed: 0,start_date,name,expected_return,expected_return_per_euro,url
EJ9_EUR5,2024-08-06 22:00:00,Rasca Enigma,-1.55,-0.31,https://www.juegosonce.es/rasca-enigma
ED3_01_EUR2,2024-07-30 22:00:00,Astro Premios,-0.73,-0.365,https://www.juegosonce.es/rasca-astro-premios
M17_EUR1,2024-07-16 22:00:00,Winning Words,-0.405,-0.405,https://www.juegosonce.es/rasca-winning-words
M17_EUR2,2024-07-16 22:00:00,Winning Words,-0.72,-0.36,https://www.juegosonce.es/rasca-winning-words
M17_EUR5,2024-07-16 22:00:00,Winning Words,-1.55,-0.31,https://www.juegosonce.es/rasca-winning-words
...,...,...,...,...,...
"M09_EUR0,5",2019-06-04 22:00:00,Pepitas de Oro,-0.205,-0.41,https://www.juegosonce.es/rasca-pepitas-oro
"E32_EUR0,5",2018-03-20 23:00:00,Triplex Express,-0.205,-0.41,https://www.juegosonce.es/rasca-triplex-express
E31_EUR1,2018-02-27 23:00:00,Super Once Express,-0.399962,-0.399962,https://www.juegosonce.es/rasca-super-once-exp...
E30_EUR1,2018-02-13 23:00:00,Rasca Cupón,-0.4,-0.4,https://www.juegosonce.es/rasca-cupon


### Average Expected Return

In [138]:
# the mean over all rows is written to every row, so each game can be compared with it
results['avg_expected_return_per_euro'] = results['expected_return_per_euro'].mean()
# positive deviation: the game returns more per euro than the average
results['deviation_from_avg'] = results['expected_return_per_euro'].sub(
    results['avg_expected_return_per_euro']
)
(
    results[['name', 'expected_return_per_euro', 'avg_expected_return_per_euro', 'deviation_from_avg']]
    .sort_values(by='expected_return_per_euro', ascending=False)
)

Unnamed: 0,name,expected_return_per_euro,avg_expected_return_per_euro,deviation_from_avg
EI6_EUR10,Mega Millonario,-0.26,-0.366705,0.106705
EH2_EUR10,Rasca Platinum,-0.26,-0.366705,0.106705
EC6_EUR10,Super Millones,-0.269562,-0.366705,0.097142
EF2_EUR10,Slingo Pro,-0.28,-0.366705,0.086705
EG6_EUR5,X20,-0.305,-0.366705,0.061705
...,...,...,...,...
"EG7_EUR0,5",7 de la Suerte,-0.415,-0.366705,-0.048295
"EI9_EUR0,5",La araña de la suerte,-0.415,-0.366705,-0.048295
E64_04_EUR1,Lucky Summer,-0.42,-0.366705,-0.053295
"EG3_EUR0,5",El Rincón del Duende,-0.42,-0.366705,-0.053295
