# Scraping Scratcher Data

In [None]:
from bs4 import BeautifulSoup
import pandas as pd
import requests
import time

class Scratcher:
    """
    One lottery scratcher game together with statistics derived from its prize table.

    Attributes
    ----------
    name : str
        the title given to the scratcher game
    last_update : object
        when the prize distribution was last updated; stored exactly as passed in
    price : int
        price of a single scratcher
    game_start : object
        when the game was first sold; stored exactly as passed in
    prize_table : pandas dataframe object
        dataframe with the columns "PrizesAtStart", "StartValue", "UnclaimedPrizes"
        and "UnclaimedValue" (the only columns read here)
    overall_odds : float
        the overall odds figure; multiplied by the number of starting prizes to
        estimate the number of tickets printed
    num_tickets : float
        estimated number of tickets at the start of the game
    orig_ev : float
        total starting prize value divided by num_tickets
    remaining_tickets : float
        num_tickets scaled by the fraction of prizes that are still unclaimed
    cur_ev : float
        total unclaimed prize value divided by remaining_tickets
    orig_ROI : float
        (orig_ev - price) / price
    cur_ROI : float
        (cur_ev - price) / price
    """

    def __init__(self, name, last_update, price, game_start, prize_table, overall_odds):
        """Store the scraped fields and derive ticket counts, expected values and ROIs."""
        # Plain fields, kept as given.
        self.name = name
        self.last_update = last_update
        self.price = price
        self.game_start = game_start
        self.prize_table = prize_table
        self.overall_odds = overall_odds

        # Column totals that feed more than one derived figure.
        prizes_at_start = prize_table["PrizesAtStart"].sum()
        prizes_unclaimed = prize_table["UnclaimedPrizes"].sum()

        # State of the game when it launched.
        self.num_tickets = prizes_at_start * overall_odds
        self.orig_ev = prize_table["StartValue"].sum() / self.num_tickets

        # Current state: assumes tickets sell in proportion to prizes claimed.
        self.remaining_tickets = self.num_tickets * prizes_unclaimed / prizes_at_start
        self.cur_ev = prize_table["UnclaimedValue"].sum() / self.remaining_tickets

        # Return on investment relative to the ticket price.
        self.orig_ROI = (self.orig_ev - price) / price
        self.cur_ROI = (self.cur_ev - price) / price
        

def clean_table(table):
    """Rename the prize-table columns and convert text columns to floats.

    The table is modified in place and also returned.

    Parameters
    ----------
    table : pandas.DataFrame
        A five-column prize distribution table. Text columns are expected to
        hold money-style strings such as "$1,000".

    Returns
    -------
    pandas.DataFrame
        The same object, with columns named "PrizeAmount", "UnclaimedPrizes",
        "UnclaimedValue", "PrizesAtStart" and "StartValue", and every text
        column converted to float.

    Raises
    ------
    ValueError
        If the table does not have exactly five columns, or a text column
        contains something that is not a number once "$" and "," are removed.
    """
    table.columns = ["PrizeAmount", "UnclaimedPrizes", "UnclaimedValue", "PrizesAtStart", "StartValue"]
    for col in table.columns:
        column = table[col]
        # Text may arrive as generic object dtype or as a dedicated string
        # dtype depending on the pandas version, so accept both.
        if pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column):
            # regex=False: "$" is a regex anchor, and the default for `regex`
            # has changed between pandas releases, so ask for a literal match.
            table[col] = (
                column.str.replace("$", "", regex=False)
                .str.replace(",", "", regex=False)
                .astype(float)
            )
    return table
        
def get_scratcher(scratcher_html):
    """Download one scratcher's detail page and build a Scratcher from it.

    Parameters
    ----------
    scratcher_html : BeautifulSoup element
        Listing-page element for one game. It must contain an ``<a href=...>``
        whose href is the site-relative path of the game's detail page.

    Returns
    -------
    Scratcher
        Built from the name, last-update text, price, start-date text, prize
        table and overall odds found on the detail page.

    Raises
    ------
    requests.HTTPError
        If the detail page responds with an error status.
    """
    from io import StringIO

    # Use the element that was passed in, not a same-named module-level variable.
    scratcher_url = "https://www.sceducationlottery.com" + scratcher_html.find("a", href=True)["href"]
    r = requests.get(scratcher_url, timeout=30)
    r.raise_for_status()
    time.sleep(1.5)  # pause between page requests
    soup = BeautifulSoup(r.content, "html.parser")

    # Game title: first <h1>, minus the characters listed in the module-level
    # `bad_chars`, with surrounding whitespace trimmed.
    name = soup.find_all("h1")[0].text
    for char in bad_chars:
        name = name.replace(char, "")
    name = name.strip()

    # Obtaining Scratcher information from html
    info = soup.find_all("div", {"class": "info-block"})
    # Re-join the 2nd and 3rd ":"-separated pieces so a colon inside the
    # value (e.g. in a time of day) is kept.
    last_update = ":".join(info[0].text.split(":")[1:3])
    price = int(info[1].text.split("$")[1])
    game_start = info[2].text.split(":")[1].strip()

    # Parse the prize table out of the page already downloaded rather than
    # requesting the same URL a second time.
    prize_table = clean_table(pd.DataFrame(pd.read_html(StringIO(r.text))[0]))

    # First paragraph of the "bottom-links" block: take the text before the
    # first "\r", then the piece that follows the first "in".
    odds_text = soup.find_all("div", {"class": "bottom-links"})[0].find_all("p")[0].text
    overall_odds = float(odds_text.split("\r")[0].split("in")[1])

    return Scratcher(name, last_update, price, game_start, prize_table, overall_odds)

In [None]:
url = "https://www.sceducationlottery.com/Games/InstantGames#" # url where all current Scratcher games are contained
r = requests.get(url, timeout=30)
r.raise_for_status()
soup = BeautifulSoup(r.content, 'html.parser')
# A dict literal keeps only the last value for a repeated key, so a filter
# written as {"class": "col-md-4", "class": "col-sm-6"} only ever tests
# "col-sm-6". The effective filter is spelled out here.
# NOTE(review): if both classes are meant to be required, use
# soup.select("div.col-md-4.col-sm-6") instead - confirm against the page.
scratcher_data = soup.find_all("div", {"class": "col-sm-6"})

bad_chars = ["\r", "\n"] # Characters removed from Scratcher names
scratchers = []

for scratcher in scratcher_data:
    scratchers.append(get_scratcher(scratcher))

# Creating dataframe of all Scratchers and their statistics
# Maps each output column to the Scratcher attribute that fills it.
column_attrs = {
    "Name": "name",
    "LastUpdate": "last_update",
    "Price": "price",
    "GameStart": "game_start",
    "OverallOdds": "overall_odds",
    "NumTickets": "num_tickets",
    "OrigEV": "orig_ev",
    "RemainingTickets": "remaining_tickets",
    "CurrentEV": "cur_ev",
    "OrigROI": "orig_ROI",
    "CurROI": "cur_ROI",
}
scratcher_df = pd.DataFrame(
    {column: [getattr(s, attr) for s in scratchers] for column, attr in column_attrs.items()}
).drop_duplicates()

scratcher_df.head()

## Data Visualization

### Graphing Average Return by Price of Scratcher

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(rc={"figure.figsize":(12, 8)})
scratcher_df.groupby("Price")["OrigEV"].mean().plot.bar(color=sns.color_palette("hls", 6), edgecolor="black", linewidth=2)
plt.xticks(fontsize=16, rotation='horizontal')
plt.yticks(fontsize=16)
plt.xlabel("Price of Scratcher in $", fontsize=18)
plt.ylabel("Average Return of Scratcher in $", fontsize=18)
plt.title("Average Return of Scratchers by Price", fontsize=20)

### Graphing Average Return per Dollar by Price of Scratcher

In [None]:
# One bar per ticket price: mean original expected value divided by the price.
sns.set(rc={"figure.figsize": (12, 8)})
by_price = scratcher_df.groupby("Price")
return_per_dollar = by_price["OrigEV"].mean() / by_price["Price"].mean()
bar_colors = sns.color_palette("hls", 6)
return_per_dollar.plot.bar(color=bar_colors, edgecolor="black", linewidth=2)

# Axis labels and tick styling.
plt.xlabel("Price of Scratcher in $", fontsize=18)
plt.ylabel("Average Return of Scratcher per Dollar", fontsize=18)
plt.xticks(fontsize=16, rotation='horizontal')
plt.yticks(fontsize=16)
plt.title("Average Return of Scratchers per Dollar by Price", fontsize=20)
# Zoom the y-axis in on the 0.55 - 0.8 band.
plt.ylim(.55, .8)

### Graphing Average Expected Loss of Scratcher by Price

In [None]:
# One bar per ticket price: mean original expected value minus the price.
sns.set(rc={"figure.figsize": (12, 8)})
by_price = scratcher_df.groupby("Price")
expected_loss = by_price["OrigEV"].mean() - by_price["Price"].mean()
bar_colors = sns.color_palette("hls", 6)
ax = expected_loss.plot.bar(color=bar_colors, edgecolor="black", linewidth=2)

# Axis labels and tick styling.
plt.xlabel("Price of Scratcher in $", fontsize=18)
plt.ylabel("Average Expected Loss of Scratcher in $", fontsize=18)
plt.xticks(fontsize=16, rotation='horizontal')
plt.yticks(fontsize=16)
plt.title("Average Expected Loss of Scratcher by Price", fontsize=20)

# The limits are given top-first and the axis is then flipped once, which
# leaves -5 at the bottom of the plot and 0.5 at the top.
ax.set_ylim(.5, -5)
ax.invert_yaxis()
# Reference line at zero.
ax.hlines(y=0, xmin=-2, xmax=10)

### Graphing the Distribution of Updated ROIs of Scratchers

In [None]:
# Histogram of each scratcher's updated ROI: (current EV - price) / price.
sns.set(rc={"figure.figsize": (12, 8)})
ticket_price = scratcher_df["Price"]
updated_roi = (scratcher_df["CurrentEV"] - ticket_price) / ticket_price
updated_roi.plot.hist(bins=20)

# Title, axis labels and tick styling.
plt.title("Distribution of Updated Return on Investment's of Scratchers", fontsize=20)
plt.xlabel("Return on Investment of Scratcher", fontsize=18)
plt.ylabel("Frequency", fontsize=18)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)

## Finding Scratchers with Positive Current ROIs

In [None]:
def find_positive_rois(scratcher_df):
    """Print a summary of every scratcher whose "CurROI" is greater than zero.

    Prints the number of matching rows first, then one entry per row showing
    its "Name", "Price", "CurrentEV" (rounded to 2 decimals) and "CurROI".
    Nothing is returned.
    """
    has_positive_roi = scratcher_df["CurROI"] > 0
    winners = scratcher_df[has_positive_roi]
    print("There are currently %d scratcher(s) with positive ROIs\n" % len(winners))
    for _, game in winners.iterrows():
        rounded_ev = round(game["CurrentEV"], 2)
        print(
            game["Name"], ":",
            "\nPrice:", game["Price"],
            "\nCurrent Expected Value:", rounded_ev,
            "\nCurrent Return on Investment", game["CurROI"],
            "\n",
        )
        
# Print every scratcher in scratcher_df whose current ROI is above zero.
find_positive_rois(scratcher_df)