In [1]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import os
import re

In [2]:
home_url = 'https://te.wikipedia.org'
links = ['https://te.wikipedia.org/w/index.php?title=%E0%B0%AA%E0%B1%8D%E0%B0%B0%E0%B0%A4%E0%B1%8D%E0%B0%AF%E0%B1%87%E0%B0%95:%E0%B0%85%E0%B0%A8%E0%B1%8D%E0%B0%A8%E0%B0%BF%E0%B0%AA%E0%B1%87%E0%B0%9C%E0%B1%80%E0%B0%B2%E0%B1%81&from=2014+%E0%B0%86%E0%B0%82%E0%B0%A7%E0%B1%8D%E0%B0%B0%E0%B0%AA%E0%B1%8D%E0%B0%B0%E0%B0%A6%E0%B1%87%E0%B0%B6%E0%B1%8D+%E0%B0%B8%E0%B0%BE%E0%B0%B0%E0%B1%8D%E0%B0%B5%E0%B0%A4%E0%B1%8D%E0%B0%B0%E0%B0%BF%E0%B0%95+%E0%B0%8E%E0%B0%A8%E0%B1%8D%E0%B0%A8%E0%B0%BF%E0%B0%95%E0%B0%B2%E0%B1%81']

### Scrape links

In [3]:
def readout_buffer(response):
    response.text = response.read()
    return response.text.decode('utf-8')

def get_data_from_url(url):
    try:
        r = urlopen(url)
        doc = readout_buffer(r)
    except Exception as e:
        print(e)
        doc = ""
    return doc

In [4]:
all_links = []
# Main code
prev_len = 0
for link in links:  
    while link:
        html_doc = ''
        # open the main link
        doc = get_data_from_url(link)
        # parse html
        soup = BeautifulSoup(doc, 'html.parser')
        # filter body
        div = soup.find('div',{'class':'mw-allpages-body'})
        # find all ahref tags
        if div:
            anchors = div.find_all('a');
            all_links = all_links + [home_url + anchor['href'] for anchor in anchors]
        # if no hrefs found break the loop
        if prev_len == len(set(all_links)):
            break
        # find the navigation div
        nav_div = soup.find('div',{'class':'mw-allpages-nav'})
        if nav_div and len(nav_div.find_all('a')) == 2:
            link = home_url + nav_div.find_all('a')[1]['href']
        prev_len = len(set(all_links))
        
len(all_links), len(set(all_links))

(99089, 99052)

In [5]:
unique_links = list(set(all_links));
unique_links[900]

'https://te.wikipedia.org/wiki/%E0%B0%B0%E0%B0%BE%E0%B0%AE%E0%B0%95%E0%B1%83%E0%B0%B7%E0%B1%8D%E0%B0%A3_%E0%B0%AE%E0%B0%A0%E0%B0%AE%E0%B1%81,_%E0%B0%B9%E0%B1%88%E0%B0%A6%E0%B0%B0%E0%B0%BE%E0%B0%AC%E0%B0%BE%E0%B0%A6%E0%B1%81'

In [6]:
df = pd.DataFrame(unique_links, columns = ['link'])
df.to_csv('telugu_wiki_links.csv', index = None)

### Scrape Pages from Links & Parse html

In [7]:
!pip install selectolax
from selectolax.parser import HTMLParser
def get_details(url):
    doc = get_data_from_url(url)
    try: 
        html_doc = HTMLParser(doc)
        t = '\n '.join(n.text() for n in html_doc.css("title"))
        a = '\n '.join(n.text() for n in html_doc.css("p"))
    except:
        t = ""
        a = ""
    return t, a

Collecting selectolax
[?25l  Downloading https://files.pythonhosted.org/packages/b9/6d/ad7ae4b4be8d43799019d5d4312b82cddf2540bc4334be6c327d8d7dc6c4/selectolax-0.2.3-cp36-cp36m-manylinux2010_x86_64.whl (1.7MB)
[K     |████████████████████████████████| 1.7MB 2.2MB/s 
[?25hInstalling collected packages: selectolax
Successfully installed selectolax-0.2.3


In [8]:
from datetime import datetime
import multiprocessing as mp
import multiprocessing.dummy as mpd
import time

start = datetime.now()
cpu_cores = mp.cpu_count()
print('parallelising the task on {} cpu cores'.format(cpu_cores))

rows = []
count = 0
# divide pool
pool = mpd.Pool(processes=cpu_cores)
# iter over 
for row in pool.imap(get_details, unique_links):
    rows.append(row)    
    count = count + 1
    # print/save
    if not count%10000:
        df = pd.DataFrame(rows, columns = ['title', 'text'])
        df.to_parquet('telugu_wikipedia_dataset.parquet', index = None)
        print("Done for {} rows ---> {}".format(count, datetime.now() - start))
# close the pool
pool.close()
pool.join()

parallelising the task on 4 cpu cores
Done for 10000 rows ---> 0:09:35.897648
Done for 20000 rows ---> 0:19:05.904842
Done for 30000 rows ---> 0:28:34.470181
Done for 40000 rows ---> 0:38:00.445309
Done for 50000 rows ---> 0:47:26.622419
Done for 60000 rows ---> 0:56:52.028604
Done for 70000 rows ---> 1:06:09.112798
Done for 80000 rows ---> 1:15:30.016545
Done for 90000 rows ---> 1:24:49.169892


In [9]:
df = pd.DataFrame(rows, columns = ['title', 'text'])
df.to_parquet('telugu_wikipedia_dataset.parquet')
df.shape, df.columns

((99052, 2), Index(['title', 'text'], dtype='object'))

In [10]:
df.head()

Unnamed: 0,title,text
0,వేములకొండ - వికీపీడియా,"\n\n \n\n వేములకొండ, తూర్పు గోదావరి జిల్లా, రం..."
1,గంగవరం (కోవూరు) - వికీపీడియా,"గంగవరం ఆంధ్ర ప్రదేశ్ రాష్ట్రం, శ్రీ పొట్టి శ్ర..."
2,అన్నదమ్ముల శపధం - వికీపీడియా,
3,సోమిదేవిపల్లి - వికీపీడియా,"సోమిదేవిపల్లి, ప్రకాశం జిల్లా, రాచర్ల మండలానిక..."
4,పెద్దపాడు (శ్రీకాకుళం మండలం) - వికీపీడియా,"పెద్దపాడు శ్రీకాకుళం జిల్లా, శ్రీకాకుళం మండలం ..."


In [11]:
# def get_details(doc):
# """
# super slow parsing
# """
#     try: 
#         soup = BeautifulSoup(doc, 'html.parser')
#         # print(soup.title.string)
#         paras = soup.find_all('p')
#         a = ' \n'.join([para.text for para in paras])
#         t = soup.title.string
#     except:
#         t = ""
#         a = ""
#     return t, a