In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import random
import os

In [3]:
urls = {
    'Bangla Media': 'https://medium.com/%E0%A6%AA%E0%A7%8D%E0%A6%B0%E0%A7%8B%E0%A6%97%E0%A7%8D%E0%A6%B0%E0%A6%BE%E0%A6%AE%E0%A6%BF%E0%A6%82-%E0%A6%AA%E0%A6%BE%E0%A6%A4%E0%A6%BE/{0}/{1:02d}/{2:02d}',
    'Bangla Medium archieve': 'https://medium.com/tag/bangla/archive/{0}/{1:02d}/{2:02d}',
}


In [4]:
response = requests.get('https://medium.com/tag/bangla/archive/{0}/{1:02d}/{2:02d}')

print(response)

<Response [200]>


In [5]:
soup = BeautifulSoup(response.text, 'lxml')

In [6]:
print(soup)

<!DOCTYPE html>
<html xmlns:cc="http://creativecommons.org/ns#"><head prefix="og: http://ogp.me/ns# fb: http://ogp.me/ns/fb# medium-com: http://ogp.me/ns/fb/medium-com#"><meta content="text/html; charset=utf-8" http-equiv="Content-Type"/><meta content="width=device-width, initial-scale=1.0, viewport-fit=contain" name="viewport"/><title>Archive of stories about Bangla – Medium</title><link href="https://medium.com/tag/bangla/archive" rel="canonical"/><meta content="index,follow" name="robots"/><meta content="Archive of stories about Bangla – Medium" name="title"/><meta content="unsafe-url" name="referrer"/><meta content="Discover all times top stories about Bangla on Medium." name="description"/><meta content="#000000" name="theme-color"/><meta content="Archive of stories about Bangla – Medium" property="og:title"/><meta content="Archive of stories about Bangla – Medium" property="twitter:title"/><meta content="https://medium.com/tag/bangla/archive" property="og:url"/><meta content="542

In [7]:
def is_leap(year):
    if year % 4 != 0:
        return False
    elif year % 100 != 0:
        return True
    elif year % 400 != 0:
        return False
    else:
        return True
    
def convert_day(day, year):
    month_days = [31, 29 if is_leap(year) else 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
    m = 0
    d = 0
    while day > 0:
        d = day
        day -= month_days[m]
        m += 1
    return (m, d)

def get_claps(claps_str):
    if (claps_str is None) or (claps_str == '') or (claps_str.split is None):
        return 0
    split = claps_str.split('K')
    claps = float(split[0])
    claps = int(claps*1000) if len(split) == 2 else int(claps)
    return claps

def get_img(img_url, dest_folder, dest_filename):
    ext = img_url.split('.')[-1]
    if len(ext) > 4:
        ext = 'jpg'
    dest_filename = f'{dest_filename}.{ext}'
    with open(f'{dest_folder}/{dest_filename}', 'wb') as f:
        f.write(requests.get(img_url, allow_redirects=False).content)
    return dest_filename

In [25]:
year = 2020
selected_days = random.sample([i for i in range(1, 367 if is_leap(year) else 366)], 150)

In [26]:
img_dir = 'images'
if not os.path.exists(img_dir):
    os.mkdir(img_dir)


In [36]:
data = []
article_id = 0
i = 0
n = len(selected_days)
for d in selected_days:
    i += 1
    month, day = convert_day(d, year)
    date = '{0}-{1:02d}-{2:02d}'.format(year, month, day)
    print(f'{i} / {n} ; {date}')
    for publication, url in urls.items():
        response = requests.get(url.format(year, month, day), allow_redirects=True)
        if not response.url.startswith(url.format(year, month, day)):
            continue
        page = response.content
        soup = BeautifulSoup(page, 'html.parser')
        articles = soup.find_all("div", class_="postArticle postArticle--short js-postArticle js-trackPostPresentation js-trackPostScrolls")
        for article in articles:
            title = article.find("h3", class_="graf--title")
            if title is None:
                continue
            title = title.contents[0]
            article_id += 1
            subtitle = article.find("h4", class_="graf--subtitle")
            subtitle = subtitle.contents[0] if subtitle is not None else ''
            image = article.find("img", class_="graf-image")
            image = '' if image is None else get_img(image['src'], 'images', f'{article_id}')
            article_url = article.find_all("a")[3]['href'].split('?')[0]
            claps = get_claps(article.find_all("button")[1].contents[0])
            reading_time = article.find("span", class_="readingTime")
            reading_time = 0 if reading_time is None else int(reading_time['title'].split(' ')[0])
            responses = article.find_all("a")
            if len(responses) == 3:
                responses = responses[2].contents[0].split(' ')
                if len(responses) == 0:
                    responses = 0
                else:
                    responses = responses[0]
            else:
                responses = 0

            data.append([article_id, article_url, title, subtitle, image, claps, responses, reading_time, publication, date])


1 / 150 ; 2020-07-09
2 / 150 ; 2020-07-20
3 / 150 ; 2020-01-10
4 / 150 ; 2020-12-22
5 / 150 ; 2020-02-26
6 / 150 ; 2020-02-03
7 / 150 ; 2020-09-22
8 / 150 ; 2020-05-04
9 / 150 ; 2020-03-15
10 / 150 ; 2020-02-16
11 / 150 ; 2020-02-20
12 / 150 ; 2020-08-25
13 / 150 ; 2020-05-20
14 / 150 ; 2020-06-02
15 / 150 ; 2020-08-21
16 / 150 ; 2020-01-26
17 / 150 ; 2020-09-02
18 / 150 ; 2020-08-16
19 / 150 ; 2020-09-23
20 / 150 ; 2020-02-05
21 / 150 ; 2020-07-18
22 / 150 ; 2020-07-13
23 / 150 ; 2020-06-08
24 / 150 ; 2020-03-21
25 / 150 ; 2020-05-23
26 / 150 ; 2020-10-06
27 / 150 ; 2020-10-07
28 / 150 ; 2020-12-20
29 / 150 ; 2020-11-21
30 / 150 ; 2020-05-15
31 / 150 ; 2020-07-04
32 / 150 ; 2020-05-18
33 / 150 ; 2020-02-01
34 / 150 ; 2020-04-01
35 / 150 ; 2020-10-22
36 / 150 ; 2020-05-27
37 / 150 ; 2020-03-11
38 / 150 ; 2020-04-10
39 / 150 ; 2020-03-07
40 / 150 ; 2020-09-15
41 / 150 ; 2020-02-22
42 / 150 ; 2020-10-31
43 / 150 ; 2020-05-31
44 / 150 ; 2020-09-09
45 / 150 ; 2020-02-04
46 / 150 ; 2020-09-

In [33]:
print(soup.get_text())

All stories about Bangla on June 08, 2020 – Medium!function(n,e){var t,o,i,c=[],f={passive:!0,capture:!0},r=new Date,a="pointerup",u="pointercancel";function p(n,c){t||(t=c,o=n,i=new Date,w(e),s())}function s(){o>=0&&o<i-r&&(c.forEach(function(n){n(o,t)}),c=[])}function l(t){if(t.cancelable){var o=(t.timeStamp>1e12?new Date:performance.now())-t.timeStamp;"pointerdown"==t.type?function(t,o){function i(){p(t,o),r()}function c(){r()}function r(){e(a,i,f),e(u,c,f)}n(a,i,f),n(u,c,f)}(o,t):p(o,t)}}function w(n){["click","mousedown","keydown","touchstart","pointerdown"].forEach(function(e){n(e,l,f)})}w(n),self.perfMetrics=self.perfMetrics||{},self.perfMetrics.onFirstInputDelay=function(n){c.push(n),s()}}(addEventListener,removeEventListener);document.domain = document.domain;if (window.top !== window.self) window.location = 'about:blank';var OB_startTime = new Date().getTime(); var OB_loadErrors = []; function _onerror(e) { OB_loadErrors.push(e) }; if (document.addEventListener) document.addE

In [34]:
medium_df = pd.DataFrame(data, columns=['id', 'url', 'title', 'subtitle', 'image', 'claps', 'responses', 'reading_time', 'publication', 'date'])


In [35]:
medium_df.head(12)

Unnamed: 0,id,url,title,subtitle,image,claps,responses,reading_time,publication,date
0,1,https://medium.com/netranews/%E0%A6%AF%E0%A7%8...,যুক্তরাজ্যের নতুন অবরোধ বিধান প্রয়োগ হতে পারে ...,,1.png,16,0,4,Bangla Medium archieve,2020-07-09
1,2,https://medium.com/%E0%A6%AA%E0%A7%8D%E0%A6%B0...,লারাভেল Dynamic Database Connection : এক প্রোজ...,,2.jpg,17,0,4,Bangla Medium archieve,2020-07-09
2,3,https://medium.com/@arnob-das/%E0%A6%AA%E0%A7%...,প্রোগ্রামিং শিখতে আপনার যা জানা দরকার,,3.jpeg,0,0,6,Bangla Medium archieve,2020-07-20
3,4,https://medium.com/@avijitdas826/%E0%A6%AA%E0%...,পরিবর্তন,,,0,0,1,Bangla Medium archieve,2020-05-04
4,5,https://medium.com/@lyricscart.in/je-kawta-din...,Je Kawta Din Reprise Lyrics in Bengali-Dwitiyo...,,5.jpeg,1,0,1,Bangla Medium archieve,2020-02-20
5,6,https://medium.com/@maqsud3/%E0%A6%B2%E0%A6%BE...,লাল চোখে ঘুম লুকিয়ে,,,0,0,1,Bangla Medium archieve,2020-02-20
6,7,https://medium.com/@hamimseth/something-to-smi...,Something to Smile About,,7.jpeg,2,0,3,Bangla Medium archieve,2020-08-25
7,8,https://medium.com/@premproloisohoobostan/%E0%...,"আত্মীয় COVID-19, আত্মার আত্মীয় NHS",,8.jpeg,0,0,2,Bangla Medium archieve,2020-06-02
8,9,https://medium.com/%E0%A6%AA%E0%A7%8D%E0%A6%B0...,ডিপ লার্নিং বেসিক-পর্ব ১,,9.png,42,0,5,Bangla Medium archieve,2020-07-18
9,10,https://medium.com/%E0%A6%AA%E0%A7%8D%E0%A6%B0...,ওয়েব ক্রলিং : with Javascript & Browser Console,,10.jpg,25,0,6,Bangla Medium archieve,2020-07-18
