<a href="https://colab.research.google.com/github/Beadsworth/cool_data/blob/master/Benfords_Law_reddit_subs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install jupyter-dash pandas

In [None]:
!pip install bs4

In [None]:
!pip install tqdm

In [159]:
import bs4
import numpy as np
import pandas as pd
import requests
import tqdm
import matplotlib.pyplot as plt
import math
import plotly.graph_objects as go
import time
import functools

In [190]:
class SubRipper:
  """
  Scrape subreddit subscriber counts from frontpagemetrics.com and compare the
  distribution of their first digits against Benford's Law.

  Typical use:
    ripper = SubRipper(number_of_pages=10)
    ripper.subreddit_data_df      # scrapes on first access, then cached on the instance
    ripper.plot_hist(lower_bound=10_000)
  """

  # rows served per listing page (page_num -> offset = _PAGE_SIZE * page_num)
  _PAGE_SIZE = 100
  # seconds to wait on a single page request before giving up
  _REQUEST_TIMEOUT_SECS = 30
  # column layout of every frame returned by rip_one_page
  _COLUMNS = ['rank', 'subreddit', 'subscribers']

  def __init__(self, number_of_pages, rate_limit_pause_secs=0):
    """
    number_of_pages: how many listing pages to scrape (pages 0 .. number_of_pages-1)
    rate_limit_pause_secs: pause between consecutive page requests, in seconds
    """
    self.number_of_pages = number_of_pages
    self._rate_limit_pause_secs = rate_limit_pause_secs
    # per-instance cache for the scraped data (filled by subreddit_data_df)
    self._subreddit_data_df = None

  @staticmethod
  def string_2_int(num_string):
    """Convert a number string with thousands separators ('1,234,567') to an int."""
    return int(num_string.replace(',', ''))

  @staticmethod
  def get_first_digit(num):
    """Return the leading digit of num as an int; the sign of num is ignored."""
    # abs() so a leading '-' is never mistaken for the first digit
    return int(str(abs(num))[0])

  @staticmethod
  def benfords_law(digit):
    """
    Probability that a number starts with `digit` according to Benford's Law:
    log10((digit + 1) / digit).

    Raises ZeroDivisionError for digit == 0.
    """
    return math.log10((digit+1)/digit)

  @classmethod
  def rip_one_page(cls, page_num):
    """
    get stats from one page's table

    page_num = 0 -> offset = 0 -> 1 - 100
    page_num = 1 -> offset = 100 -> 101 - 200
    page_num = 2 -> offset = 200 -> 201 - 300

    Returns a DataFrame with columns 'rank', 'subreddit', 'subscribers'
    (one row per table row; empty, but with those columns, if the table has no data rows).

    Raises requests.HTTPError on a non-2xx response, requests.Timeout if the server
    does not answer in time, and ValueError if the page contains no <table>.
    """

    # find page
    offset = cls._PAGE_SIZE * page_num
    url = f'https://frontpagemetrics.com/top/offset/{offset}'
    # a timeout keeps one stalled request from hanging the whole scrape
    page = requests.get(url=url, timeout=cls._REQUEST_TIMEOUT_SECS)
    # fail loudly on error pages instead of trying to parse them
    page.raise_for_status()

    # parse page
    soup = bs4.BeautifulSoup(page.content, 'html.parser')
    table = soup.find('table')
    if table is None:
      raise ValueError(f'no <table> found at {url}')

    # iterate through table
    # first row is table header (skip it)
    dict_list = []
    for tr in table.find_all('tr')[1:]:
      td = tr.find_all('td')
      if len(td) < 3:
        # not a data row (e.g. spacer or nested header) - nothing to extract
        continue
      dict_list.append({
          'rank': cls.string_2_int(td[0].text),
          'subreddit': td[1].text,
          'subscribers': cls.string_2_int(td[2].text)
      })

    # explicit columns so that an empty page still yields a well-formed frame
    return pd.DataFrame(dict_list, columns=cls._COLUMNS)

  @property
  def subreddit_data_df(self):
    """
    All scraped subreddits, sorted by 'subscribers' (descending), with an extra
    'first_digit' column.

    The pages are scraped on first access; afterwards the same DataFrame object is
    returned from a per-instance cache, so callers should not mutate it in place.
    """

    # getattr: stay usable even if the instance was created without __init__
    cached = getattr(self, '_subreddit_data_df', None)
    if cached is not None:
      return cached

    # progress bar is cosmetic: use it when tqdm is available, otherwise iterate silently.
    # tqdm.auto is imported explicitly because a bare `import tqdm` does not guarantee
    # that the `tqdm.notebook` submodule is loaded.
    pages = range(self.number_of_pages)
    try:
      from tqdm.auto import tqdm as progress_bar
    except ImportError:
      progress_bar = None
    if progress_bar is not None:
      pages = progress_bar(pages, desc=f'parsing {self.number_of_pages} pages')

    # iterate through all pages
    df_list = []
    last_page = self.number_of_pages - 1
    for page in pages:
      df_list.append(self.rip_one_page(page))
      # pause only *between* requests; nothing follows the last page
      if self._rate_limit_pause_secs > 0 and page < last_page:
        time.sleep(self._rate_limit_pause_secs)

    # clean up (pd.concat raises on an empty list, so handle "no pages" explicitly)
    if df_list:
      all_data_df = pd.concat(df_list, ignore_index=True)
    else:
      all_data_df = pd.DataFrame(columns=self._COLUMNS)
    all_data_df = all_data_df.sort_values(by='subscribers', ascending=False).reset_index(drop=True)

    # calculate first digits (column-wise map instead of a row-wise DataFrame.apply)
    all_data_df['first_digit'] = all_data_df['subscribers'].map(self.get_first_digit)

    self._subreddit_data_df = all_data_df
    return all_data_df

  def subreddit_data_df_lower_bound(self, lower_bound=0):
    """Rows of subreddit_data_df whose 'subscribers' value is >= lower_bound."""
    all_data_df = self.subreddit_data_df
    return all_data_df[all_data_df['subscribers'] >= lower_bound]

  def plot_hist(self, lower_bound=0):
    """
    Show a plotly figure comparing the observed first-digit distribution of
    subscriber counts (only subreddits with >= lower_bound subscribers) with the
    probabilities given by benfords_law. Returns None; the figure is displayed
    via fig.show().
    """

    # Benford's Law theoretical trend
    digits = list(range(1, 9+1))
    # benfords_law is a static method of this class, so it must be reached through self
    law = [self.benfords_law(d) for d in digits]

    # construct and plot "plotly" figure
    fig = go.Figure()

    # probability histogram
    fig.add_trace(go.Histogram(
        x=self.subreddit_data_df_lower_bound(lower_bound)['first_digit'],
        histnorm='probability',
        name='Reddit subscribers', # name used in legend and hover labels
        xbins=dict( # one bin of width 1 per digit, centred on the digit
            start=0.5,
            end=9.5,
            size=1
        ),
        marker_color='#EB89B5',
        opacity=0.75
    ))

    # bar chart with expectation values
    fig.add_trace(go.Bar(
        x=digits,
        y=law,
        name="Benford's Law",
        marker_color='#330C73',
        opacity=0.75
    ))

    fig.update_layout(
        title_text="Subreddit Subscribers & Benford's Law", # title of plot
        xaxis_title_text='Subscriber Count First Digit', # xaxis label
        yaxis_title_text='Probability', # yaxis label
        bargap=0.2, # gap between bars of adjacent location coordinates
        bargroupgap=0.1 # gap between bars of the same location coordinates
    )

    fig.show()
    

In [194]:
# 250 pages; no pause between page requests (rate_limit_pause_secs spelled out at its default)
ripper = SubRipper(number_of_pages=250, rate_limit_pause_secs=0)

In [199]:
# keep only subreddits with at least 10,000 subscribers; the frame is shown as the cell output
ripper.subreddit_data_df_lower_bound(10_000)

Unnamed: 0,rank,subreddit,subscribers,first_digit
0,1,/r/announcements,68989234,6
1,2,/r/funny,33172780,3
2,3,/r/AskReddit,29786043,2
3,4,/r/gaming,27959896,2
4,5,/r/aww,26629516,2
...,...,...,...,...
20503,20203,/r/TelegramBots,10003,1
20504,20108,/r/ystarpro,10002,1
20505,20241,/r/Illenium,10002,1
20506,21836,/r/GoneWildPublic,10001,1


In [200]:
# first-digit histogram vs. Benford's Law for subreddits with at least 10,000 subscribers
ripper.plot_hist(10_000)

In [157]:


# Standalone (non-class) version of the Benford's Law comparison plot.
# NOTE(review): this cell used `first_digits`, `digits` and `law` without any cell
# defining them, so it failed with NameError on Restart & Run All. They are rebuilt
# here from `ripper` using every scraped subreddit (no lower bound) -- adjust if a
# subscriber filter was intended.
first_digits = ripper.subreddit_data_df['first_digit']
digits = list(range(1, 9 + 1))
law = [SubRipper.benfords_law(d) for d in digits]

fig = go.Figure()

# probability histogram of the observed first digits
fig.add_trace(go.Histogram(
    x=first_digits,
    histnorm='probability',
    name='Reddit subscribers', # name used in legend and hover labels
    xbins=dict( # one bin of width 1 per digit, centred on the digit
        start=0.5,
        end=9.5,
        size=1
    ),
    marker_color='#EB89B5',
    opacity=0.75
))

# bar chart with the Benford's Law expectation values
fig.add_trace(go.Bar(
    x=digits,
    y=law,
    name="Benford's Law",
    marker_color='#330C73',
    opacity=0.75
))

fig.update_layout(
    title_text="Subreddit Subscribers & Benford's Law", # title of plot
    xaxis_title_text='Subscriber Count First Digit', # xaxis label
    yaxis_title_text='Probability', # yaxis label
    bargap=0.2, # gap between bars of adjacent location coordinates
    bargroupgap=0.1 # gap between bars of the same location coordinates
)

fig.show()