## Importing Packages

In [None]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import requests
import time
from IPython.display import clear_output

import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.keras import Sequential, Model
from tensorflow.keras.layers import Bidirectional, LSTM, Dense, Conv1D
from tensorflow.keras.optimizers import Adam

## Getting Medium Stories

In [None]:
# CSS class strings that getData matches against in the archive-page markup.
_STORY_CLASS = 'streamItem streamItem--postPreview js-streamItem'
_AUTHOR_BOX_CLASS = 'postMetaInline u-floatLeft u-sm-maxWidthFullWidth'
_CLAPS_CLASS = ('button button--chromeless u-baseColor--buttonNormal '
                'js-multirecommendCountButton u-disablePointerEvents')
_RESPONSES_CLASS = 'button button--chromeless u-baseColor--buttonNormal'
_STORY_LINK_CLASS = 'button button--smaller button--chromeless u-baseColor--buttonNormal'


def _fetch_soup(url, timeout):
    """Download ``url`` and return its body parsed with ``html.parser``."""
    page = requests.get(url, timeout=timeout)
    return BeautifulSoup(page.text, 'html.parser')


def _text_or(node, default):
    """Return ``node.text``, or ``default`` when the lookup returned None."""
    return node.text if node is not None else default


def _collect_story_body(story_soup):
    """Return ``(section_titles, paragraphs)`` gathered from every <section>.

    Paragraph texts come from <p> tags and section titles from <h1> tags.
    """
    section_titles, story_paragraphs = [], []
    for section in story_soup.find_all('section'):
        story_paragraphs.extend(p.text for p in section.find_all('p'))
        section_titles.extend(h.text for h in section.find_all('h1'))
    return section_titles, story_paragraphs


def getData(url, date, out=None, timeout=30):
    """Scrape one archive page and append one row per story to ``out``.

    For every story preview on the page, the preview metadata is read, the
    full story page is downloaded, and a 12-item list is appended:
    ``[date, title, subtitle, claps, responses, author_url, story_url,
    reading_time, number_sections, section_titles, number_paragraphs,
    paragraphs]``.

    Previews that lack an author box, an author link, a reading-time span
    with a ``title`` attribute, or a story link are skipped.

    Parameters
    ----------
    url : str
        Address of the archive page to scrape.
    date : str
        Label stored as the first item of every row produced from ``url``.
    out : list, optional
        List that receives the rows. When omitted, the notebook-level
        ``stories_data`` list is used (the original behaviour).
    timeout : float, optional
        Seconds passed to ``requests.get`` for each request, so a stalled
        connection raises instead of hanging the crawl.

    Raises
    ------
    requests.exceptions.RequestException
        Propagated from ``requests.get`` (including timeouts).
    NameError
        If ``out`` is omitted and no global ``stories_data`` exists.
    """
    if out is None:
        # Backward-compatible default: the notebook's global accumulator.
        out = stories_data

    soup = _fetch_soup(url, timeout)

    for story in soup.find_all('div', class_=_STORY_CLASS):
        # Look up every mandatory element first; skip the preview if any
        # is missing rather than crashing half-way through a long crawl.
        author_box = story.find('div', class_=_AUTHOR_BOX_CLASS)
        if author_box is None:
            continue

        author_link = author_box.find('a')
        reading_span = author_box.find('span', class_='readingTime')
        story_link = story.find('a', class_=_STORY_LINK_CLASS)
        if author_link is None or reading_span is None or story_link is None:
            continue

        author_url = author_link.get('href')
        reading_time = reading_span.get('title')
        story_url = story_link.get('href')
        if author_url is None or reading_time is None or story_url is None:
            continue

        # Title and subtitle of the article ('-' when the tag is absent).
        title = _text_or(story.find('h3'), '-')
        subtitle = _text_or(story.find('h4'), '-')

        # Claps and responses. NOTE: the claps fallback is the integer 0
        # while a found value is the button's text; kept as-is so the
        # recorded data does not change.
        claps = _text_or(story.find('button', class_=_CLAPS_CLASS), 0)
        responses = _text_or(story.find('a', class_=_RESPONSES_CLASS), '0 responses')

        # Keep only the leading token of each (e.g. the number).
        reading_time = reading_time.split()[0]
        responses = responses.split()[0]

        # Download the full story and collect its sections and paragraphs.
        story_soup = _fetch_soup(story_url, timeout)
        section_titles, story_paragraphs = _collect_story_body(story_soup)

        out.append([
            date, title, subtitle, claps, responses,
            author_url, story_url, reading_time,
            len(section_titles), section_titles,
            len(story_paragraphs), story_paragraphs,
        ])


### Checking For One Day

In [None]:
# stories_data = [] 
# start = time.time()
# getData('https://medium.com/swlh/archive/2021/03/17', '03/17/21')
# end = time.time()
# print('Time for 1 day:', (end - start))

Time for 1 day: 19.841703176498413


### Scraping Stories Over a Longer Date Range

In [None]:
# Crawl the medium.com/swlh daily archive pages from 2020-06-01 through
# 2020-12-31, accumulating one row per story in `stories_data`.

# Upper bound on collected stories. The original cell compared an undefined
# name `i` against 15000 (NameError on a fresh kernel).
# NOTE(review): assumed to be a cap on the number of stories - confirm.
MAX_STORIES = 15000

stories_data = []
# pd.date_range yields every calendar day with the correct month length,
# replacing the hand-written days-per-month table.
for current_day in pd.date_range(start='2020-06-01', end='2020-12-31', freq='D'):
    print('Month/Day', current_day.month, '/', current_day.day)

    # Zero-padded components, as used by both the date label and the URL.
    month = f'{current_day.month:02d}'
    day = f'{current_day.day:02d}'

    date = f'{month}/{day}/2020'
    url = f'https://medium.com/swlh/archive/2020/{month}/{day}'

    # Scrape the page; getData appends its rows to `stories_data`.
    getData(url, date)

    # Stop the whole crawl once the cap is exceeded (a single flat loop, so
    # `break` cannot fall through to the next month).
    if len(stories_data) > MAX_STORIES:
        break

In [None]:
# Column labels, listed in the same order getData appends values to a row.
columns = [
    'date',
    'title',
    'subtitle',
    'claps',
    'responses',
    'author_url',
    'story_url',
    'reading_time (mins)',
    'number_sections',
    'section_titles',
    'number_paragraphs',
    'paragraphs',
]

# One DataFrame row per scraped story; the cell displays (rows, columns).
df = pd.DataFrame(data=stories_data, columns=columns)
df.shape

(8322, 12)

In [None]:
# df.to_csv('/content/drive/MyDrive/Colab Notebooks/Big Data/mediumdata.csv', index = False)

In [None]:
# Display the last ten rows of df as a quick sanity check of the result.
df.tail(10)

Unnamed: 0,date,title,subtitle,claps,responses,author_url,story_url,reading_time (mins),number_sections,section_titles,number_paragraphs,paragraphs
8312,10/01/2020,React Hooks: A Dream Come True,-,67,1,https://medium.com/@sadesina-96,https://medium.com/swlh/react-hooks-a-dream-co...,5,5,"[React Hooks: A Dream Come True, What are thes...",19,"[Today, I want to share my passion of the Reac..."
8313,10/01/2020,Time for Innovation: How to Inspire Your Team’...,With the right rituals and…,81,0,https://medium.com/@juliatsoihk,https://medium.com/swlh/time-for-innovation-ho...,5,7,[Time for Innovation: How to Inspire Your Team...,28,"[As a design strategist, I work closely with t..."
8314,10/01/2020,Getting Started With Cloud One Application Sec...,-,104,0,https://medium.com/@terryleehillis,https://medium.com/swlh/getting-started-with-c...,10,7,[Getting Started With Cloud One Application Se...,52,[This guide is designed to teach you how to cr...
8315,10/01/2020,How Lifting 205 Kg Helps Me Become a Better De...,-,53,1,https://medium.com/@nguyentran-ngo,https://medium.com/swlh/how-lifting-205-kg-hel...,5,7,[How Lifting 205 Kg Helps Me Become a Better D...,22,"[It was Saturday, January 6, 2018. I walked in..."
8316,10/01/2020,Why Dealing With Toxic People Starts With Deve...,-,39,0,https://medium.com/@anamarinovic,https://medium.com/swlh/why-dealing-with-toxic...,8,1,[Why Dealing With Toxic People Starts With Dev...,39,"[THE ULTIMATE GUIDE TO THRIVING AT WORK, The m..."
8317,10/01/2020,"“Covid-19”, Data Analysis From the Inception t...",-,78,0,https://medium.com/@r0han_,https://medium.com/swlh/covid-19-data-analysis...,11,3,"[“Covid-19”, Data Analysis From the Inception ...",31,[One might consider Nuclear Warfare and a Clim...
8318,10/01/2020,-,Can creations surpass the creator?,89,2,https://medium.com/@joanna-tan,https://medium.com/swlh/a-human-wrote-this-are...,8,0,[],36,"[A Human Wrote This…Are You Sure?, For a long ..."
8319,10/01/2020,An Undesirable Consequence of Handling All Exc...,what happens when you blindly catch…,100,0,https://medium.com/@ali-muhammadimran,https://medium.com/swlh/the-undesirable-conseq...,2,1,[An Undesirable Consequence of Handling All Ex...,19,[New programmers are often confused about the ...
8320,10/01/2020,Control Systems: The Hidden Science,-,140,0,https://medium.com/@abhishekg468,https://medium.com/swlh/control-systems-the-hi...,9,7,"[Control Systems: The Hidden Science, Modern C...",29,[A couple of questions before we start.1- Have...
8321,10/01/2020,Shut Up And Grab a Whiteboard If You Want to B...,-,48,2,https://medium.com/@kevin-wanke,https://medium.com/swlh/shut-up-and-grab-a-whi...,7,4,[Shut Up And Grab a Whiteboard If You Want to ...,44,[After recently stumbling upon a commencement ...
