In [1]:
import json
import multiprocessing as mp
import multiprocessing.dummy as mpd
import os
import re
import time
from datetime import date, datetime, timedelta
from urllib.request import urlopen

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

In [2]:
# URL of the Eenadu archives home page.
eenadu_archives = "https://www.eenadu.net/archivesdet/home/"

In [3]:
# Per-category archive listing pages. A date formatted as dd-mm-YYYY is
# appended to each URL to get that day's listing.
category_links = dict(
    eenadu_cinema='https://www.eenadu.net/archivespage/cinemamore/',
    eenadu_sports='https://www.eenadu.net/archivespage/sportsmore/',
    eenadu_crime='https://www.eenadu.net/archivespage/crimemore/',
    eenadu_business='https://www.eenadu.net/archivespage/businessmore/',
    eenadu_national='https://www.eenadu.net/archivespage/nationalmore/',
)

# CSS class of the <ul> that holds the article links on a listing page:
# class="article-box-list no-space-t no-space-b"

In [4]:
def readout_buffer(response):
    """Read everything from `response` and return it decoded as UTF-8.

    The raw bytes are also stored on the response object as `response.text`.
    Raises UnicodeDecodeError if the payload is not valid UTF-8.
    """
    raw_bytes = response.read()
    response.text = raw_bytes
    return raw_bytes.decode('utf-8')

def get_data_from_url(url, timeout=30):
    """Fetch `url` and return its body as UTF-8 decoded text.

    Best-effort helper: any failure (malformed URL, network error, timeout,
    undecodable body) is reported on stdout and "" is returned instead of
    raising, so one bad page cannot abort a long crawl.

    Args:
        url: Address passed to `urllib.request.urlopen`.
        timeout: Seconds to wait on blocking network operations before the
            request is abandoned. Without it a stalled connection would block
            the caller indefinitely.

    Returns:
        The page text, or "" if the page could not be fetched or decoded.
    """
    try:
        # The context manager closes the connection even when reading or
        # decoding fails, so sockets are not leaked across many requests.
        with urlopen(url, timeout=timeout) as r:
            doc = readout_buffer(r)
    except Exception as e:
        # Include the URL so a failure can be traced back to a specific page.
        print("failed to fetch {}: {}".format(url, e))
        doc = ""
    return doc

In [5]:
def save_json(d, path):
    """Serialise `d` as JSON into the file at `path`, overwriting it."""
    with open(path, mode='w') as out_file:
        json.dump(d, out_file)
    
def read_json(path):
    """Load and return the JSON document stored in the file at `path`.

    The file is always decoded as UTF-8 rather than with the platform default
    encoding, so files containing raw non-ASCII text (e.g. Telugu) load the
    same way on every machine.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the content is not valid JSON.
    """
    with open(path, 'r', encoding='utf-8') as fp:
        return json.load(fp)

### Scrape links

In [6]:
# One independent list per category. dict.fromkeys(category_links, []) would
# make every key share the SAME list object, mixing all categories together.
news_urls = {k: [] for k in category_links}
N_days = 365  # number of days, counting back from today, to collect links for

# Take "today" once so the date window does not shift if the crawl runs past
# midnight.
today = date.today()
for i in range(N_days):
    current_date = (today - timedelta(days=i)).strftime('%d-%m-%Y')
    for key, category_url in category_links.items():
        # listing page of this category for this date
        current_category_url = category_url + current_date
        doc = get_data_from_url(current_category_url)
        if not doc:
            # nothing was downloaded for this page, so there is nothing to parse
            continue
        soup = BeautifulSoup(doc, 'html.parser')
        # the <ul> that wraps the article links
        div = soup.find('ul', {'class': "article-box-list no-space-t no-space-b"})
        if div:
            # href=True keeps only anchors that actually carry an href;
            # indexing anchor['href'] on one without it would raise KeyError
            # and abort the whole crawl.
            anchors = div.find_all('a', href=True)
            news_urls[key].extend(anchor['href'] for anchor in anchors)

In [7]:
# Show how many distinct links were collected for each category.
for key in news_urls:
    values = news_urls[key]
    distinct_links = set(values)
    print(key, '--->', len(distinct_links))

eenadu_cinema ---> 3031
eenadu_sports ---> 5852
eenadu_crime ---> 3241
eenadu_business ---> 4273
eenadu_national ---> 5022


In [8]:
# Write the per-category link lists to links.json.
save_json(d=news_urls, path="links.json")

### Scrape pages from links & parse HTML

In [9]:
!pip install selectolax
from selectolax.parser import HTMLParser
def get_details(url):
    """Download one article page and extract its title and body text.

    Args:
        url: Article URL, fetched through `get_data_from_url`.

    Returns:
        A two-item list `[title, text]`. `title` is the text of the `<title>`
        nodes and `text` is the text of the `span.text-justify` nodes, each
        joined with a newline followed by a space. Both items are "" when
        nothing was downloaded or the page could not be parsed.
    """
    doc = get_data_from_url(url)
    if not doc:
        # nothing was downloaded, so skip the parser entirely
        return ["", ""]
    try:
        html_doc = HTMLParser(doc)
        t = '\n '.join(n.text() for n in html_doc.css("title"))
        a = '\n '.join(n.text() for n in html_doc.css("span.text-justify"))
    except Exception as e:
        # `except Exception` (not a bare except) so Ctrl-C / SystemExit still
        # stop the crawl; report the page instead of failing silently.
        print("failed to parse {}: {}".format(url, e))
        t = ""
        a = ""
    return [t, a]

Collecting selectolax
[?25l  Downloading https://files.pythonhosted.org/packages/b9/6d/ad7ae4b4be8d43799019d5d4312b82cddf2540bc4334be6c327d8d7dc6c4/selectolax-0.2.3-cp36-cp36m-manylinux2010_x86_64.whl (1.7MB)
[K     |████████████████████████████████| 1.7MB 3.0MB/s 
[?25hInstalling collected packages: selectolax
Successfully installed selectolax-0.2.3


In [10]:
start = datetime.now()
cpu_cores = mp.cpu_count()
print('parallelising the task on {} cpu cores'.format(cpu_cores))

count = 0  # NOTE(review): not read anywhere in this cell; kept in case another cell uses it

# to store [title, article, category] rows
data_rows = []

# multiprocessing.dummy.Pool is a pool of threads, not processes. Using it as
# a context manager guarantees the workers are torn down even if the loop is
# interrupted or raises part-way through.
with mpd.Pool(processes=cpu_cores) as pool:
    for key, url_list in news_urls.items():
        # imap yields results in the same order as url_list
        for row in pool.imap(get_details, url_list):
            # build a new row instead of mutating the returned object in place
            data_rows.append([*row, key])
        # len(data_rows) is the running total across categories
        print("Done for {} ({}) ---> {}".format(key, len(data_rows), datetime.now() - start))
    # normal completion: stop accepting work and wait for the workers to finish
    pool.close()
    pool.join()

parallelising the task on 4 cpu cores
Done for eenadu_cinema (3031) ---> 0:10:38.194344
Done for eenadu_sports (8883) ---> 0:29:58.885262
Done for eenadu_crime (12124) ---> 0:41:11.971790
Done for eenadu_business (16397) ---> 0:56:13.233728
Done for eenadu_national (21419) ---> 1:12:48.094196


In [11]:
# Assemble the scraped rows into a table and store it as a parquet file.
column_names = ['title', 'text', 'category']
df = pd.DataFrame(data=data_rows, columns=column_names)
df.to_parquet('telugu_news_dataset.parquet', index=None)
(df.shape, df.columns)

((21419, 3), Index(['title', 'text', 'category'], dtype='object'))

In [12]:
# Number of rows per category label.
df.loc[:, 'category'].value_counts()

eenadu_sports      5852
eenadu_national    5022
eenadu_business    4273
eenadu_crime       3241
eenadu_cinema      3031
Name: category, dtype: int64

In [13]:
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,title,text,category
0,ఆల్‌ టైమ్‌ రికార్డ్‌గా బన్నీ సినిమా,\n రూ.220 కోట్ల వసూళ్లతో దూసుకు...,eenadu_cinema
1,వెంకీ టైటిల్‌ ఇదేనా?,"\n \n\n‘ఎఫ్‌ 2’, ‘వెంకీ మామ’ వి...",eenadu_cinema
2,‘ఆర్‌ ఆర్‌ ఆర్‌’ విలన్‌ వచ్చాడు,\n \n\nతన చిత్రాల్లో కథానాయకుడి...,eenadu_cinema
3,ఆయన ముందు నేనెంత?,"\n \n\nచిత్రసీమలో క్రమశిక్షణ, స...",eenadu_cinema
4,పూరి చేతుల మీదుగా,\n \n\nనాగశౌర్య హీరోగా నటించిన ...,eenadu_cinema


In [14]:
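# Legacy BeautifulSoup version of get_details, kept commented out for reference only.
# def get_details(doc):
#     """
#     Very slow: parses the whole page with BeautifulSoup.
#     """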
#     try: 
#         soup = BeautifulSoup(doc, 'html.parser')
#         # print(soup.title.string)
#         paras = soup.find_all('p')
#         a = ' \n'.join([para.text for para in paras])
#         t = soup.title.string
#     except:
#         t = ""
#         a = ""
#     return t, a