<a href="https://colab.research.google.com/github/Beadsworth/cool_data/blob/master/Benfords_Law_Subreddits.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install bs4

In [None]:
!pip install tqdm

In [159]:
import bs4
import numpy as np
import pandas as pd
import requests
import tqdm
import matplotlib.pyplot as plt
import math
import plotly.graph_objects as go
import time
import functools

In [248]:
class SubRipper:
  """
  Scrape subreddit subscriber counts from frontpagemetrics.com and compare the
  distribution of their first digits with Benford's Law.

  The scrape runs lazily on first access to `subreddit_data_df` and the result
  is cached on the instance, so repeated filtering/plotting does not hit the
  network again.
  """

  def __init__(self, number_of_pages, rate_limit_pause_secs=0):
    """
    number_of_pages: how many ranking pages (100 subreddits each) to scrape
    rate_limit_pause_secs: seconds to sleep between consecutive page requests
    """
    self.number_of_pages = number_of_pages
    self._number_of_subreddits = self.number_of_pages * 100
    self._rate_limit_pause_secs = rate_limit_pause_secs
    # per-instance cache for the scraped table (filled by `subreddit_data_df`)
    self._subreddit_data_cache = None

  @staticmethod
  def string_2_int(num_string):
    """convert a number string with thousands separators ('1,234') to int"""
    return int(num_string.replace(',', ''))

  @staticmethod
  def get_first_digit(num):
    """return the first character of str(num) as an int (e.g. 4521 -> 4)"""
    return int(str(num)[0])

  @staticmethod
  def benfords_law(digit):
    """
    probability that `digit` is the leading digit according to Benford's Law:
    log10((digit + 1) / digit); `digit` must be non-zero
    """
    return math.log10((digit+1)/digit)

  @classmethod
  def rip_one_page(cls, page_num, timeout=30):
    """
    get stats from one page's table

    page_num = 0 -> offset = 0 -> 1 - 100
    page_num = 1 -> offset = 100 -> 101 - 200
    page_num = 2 -> offset = 200 -> 201 - 300

    timeout: seconds to wait for the server before giving up

    returns a DataFrame with columns 'rank', 'subreddit', 'subscribers'
    raises requests.HTTPError on a non-2xx response
    raises ValueError when the page contains no <table>
    """

    # find page
    offset = 100 * page_num
    url = f'https://frontpagemetrics.com/top/offset/{offset}'
    # a timeout keeps a stalled connection from hanging the notebook forever
    page = requests.get(url=url, timeout=timeout)
    # fail loudly on 4xx/5xx instead of trying to parse an error page
    page.raise_for_status()

    # parse page
    soup = bs4.BeautifulSoup(page.content, 'html.parser')
    table = soup.find('table')
    if table is None:
      raise ValueError(f'no <table> found at {url}')

    # iterate through table
    # first row is table header (skip it)
    dict_list = []
    for tr in table.find_all('tr')[1:]:
      td = tr.find_all('td')
      # skip rows that do not carry the three expected data cells
      if len(td) < 3:
        continue
      dict_list.append({
          'rank': cls.string_2_int(td[0].text),
          'subreddit': td[1].text,
          'subscribers': cls.string_2_int(td[2].text)
      })

    # explicit columns keep the schema stable even when no rows were parsed
    return pd.DataFrame(dict_list, columns=['rank', 'subreddit', 'subscribers'])

  def _rip_all_pages(self):
    """scrape every page and return one table sorted by subscribers (descending)"""

    # nothing to scrape: return an empty table with the expected columns
    # (pd.concat raises on an empty list)
    if self.number_of_pages <= 0:
      return pd.DataFrame(columns=['rank', 'subreddit', 'subscribers', 'first_digit'])

    # imported here because `import tqdm` alone does not load the submodule;
    # tqdm.auto picks the notebook widget when available, else the text bar
    from tqdm.auto import tqdm as progress_bar

    # iterate through all pages
    df_list = []
    last_page = self.number_of_pages - 1
    for page in progress_bar(range(self.number_of_pages), desc=f'parsing {self.number_of_pages} pages'):
      df_list.append(self.rip_one_page(page))
      # rate-limit between requests; no need to wait after the final page
      if page < last_page:
        time.sleep(self._rate_limit_pause_secs)

    # clean up
    all_data_df = pd.concat(df_list, ignore_index=True)
    all_data_df = all_data_df.sort_values(by='subscribers', ascending=False).reset_index(drop=True)

    # calculate first digits
    all_data_df['first_digit'] = all_data_df['subscribers'].map(self.get_first_digit)

    return all_data_df

  @property
  def subreddit_data_df(self):
    """
    table of all scraped subreddits with columns
    'rank', 'subreddit', 'subscribers', 'first_digit'

    scraped on first access, then served from a per-instance cache
    """
    #TODO: save to file

    # getattr keeps this working for instances created without __init__
    if getattr(self, '_subreddit_data_cache', None) is None:
      self._subreddit_data_cache = self._rip_all_pages()
    return self._subreddit_data_cache

  def subreddit_data_df_lower_bound(self, lower_bound=0):
    """return only the rows whose subscriber count is >= lower_bound"""
    all_data_df = self.subreddit_data_df
    return all_data_df[all_data_df['subscribers'] >= lower_bound]

  def plot_hist(self, lower_bound=0):
    """
    show a plotly figure comparing the empirical first-digit distribution
    (subreddits with at least `lower_bound` subscribers) with Benford's Law
    """

    # Benford's Law theoretical trend
    digits = list(range(1, 9+1))
    # benfords_law is a staticmethod, so it must be reached through self
    law = [self.benfords_law(d) for d in digits]

    # construct and plot "plotly" figure
    fig = go.Figure()

    # probability histogram
    fig.add_trace(go.Histogram(
        x=self.subreddit_data_df_lower_bound(lower_bound)['first_digit'],
        histnorm='probability',
        name='Empirical Data from Reddit', # name used in legend and hover labels
        xbins=dict( # bins used for histogram, one bin centered on each digit
            start=0.5,
            end=9.5,
            size=1
        ),
        opacity=0.75
    ))

    # bar chart with expectation values
    fig.add_trace(go.Bar(
        x=digits,
        y=law,
        name="Benford's Law Prediction",
        opacity=0.75
    ))

    # lower bound message
    lower_bound_txt = f'<br>subreddits with less than {lower_bound} subscribers were excluded' if lower_bound > 0 else ''

    fig.update_layout(
        title_text=f"Subreddit Subscriber Counts & Benford's Law<br>Data taken from top {self._number_of_subreddits} subreddits{lower_bound_txt}", # title of plot
        xaxis_title_text='First Digit of Subscriber Count', #  xaxis label
        yaxis_title_text='Probability', # yaxis label
        bargap=0.2, # gap between bars of adjacent location coordinates
        bargroupgap=0.1 # gap between bars of the same location coordinates
    )

    fig.show()
    

In [250]:
# Configure a scraper for 10 ranking pages (100 subreddits per page) with a
# 1-second rate-limit pause between page requests. Constructing it only stores
# the settings; no pages are fetched here.
ripper = SubRipper(number_of_pages=10, rate_limit_pause_secs=1)

In [251]:
# Show the subreddits with at least 10,000 subscribers. The first access to the
# underlying table runs the scrape, so this cell performs the page requests.
ripper.subreddit_data_df_lower_bound(lower_bound=10_000)

HBox(children=(FloatProgress(value=0.0, description='parsing 10 pages', max=10.0, style=ProgressStyle(descript…

KeyboardInterrupt: ignored

In [None]:
# Plot the first-digit histogram for subreddits with at least 10,000
# subscribers next to the Benford's Law prediction.
ripper.plot_hist(lower_bound=10_000)