In [1]:
import os
import time
import requests
import numpy as np
import pandas as pd
from tqdm import tqdm
import multiprocessing
from bs4 import BeautifulSoup

# HTTP request headers passed as `headers=` to every `requests.get` call in this
# notebook. The values describe a desktop Chrome 56 browser on Linux x86_64.
headers = {
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'en-US,en;q=0.8',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
}

### 1. Scraping Article Links

In [None]:
# Accumulators filled by the crawl below.
data = []                 # one [title, link, tags] row per article found on a listing page
not_working_links = []    # listing-page URLs that could not be fetched or parsed
pref = 'https://www.geeksforgeeks.org/'

categories = ['basic','easy','medium','hard','expert']

# Seconds to wait on a single HTTP request before giving up, so that one stalled
# connection cannot hang the whole crawl.
REQUEST_TIMEOUT = 30

for cat in categories:                                        # Going through each Category

  link = pref + cat
  # The category landing page is only used to read the page count from its pager.
  # A failure here is intentionally not caught: without a page count the
  # category cannot be crawled at all, so stopping loudly is the safer outcome.
  r = requests.get(link, headers=headers, allow_redirects=False, timeout=REQUEST_TIMEOUT)
  soup = BeautifulSoup(r.text, 'html.parser')
  # The second-to-last `a.page` anchor is taken as the last page number
  # (presumably the final anchor is a "next" control - verify against the site).
  pages = int(soup.find_all('a', class_ = 'page')[-2].text)

  for page in tqdm(range(1,pages + 1)):                             # Going through each page

    link_ = link + '/' +  str(page)

    try:
      r = requests.get(link_, headers=headers, allow_redirects=False, timeout=REQUEST_TIMEOUT)
      sp = BeautifulSoup(r.text, 'html.parser')
      articles = sp.find('div', class_ = 'articles-list').find_all('div', class_ = 'content')

      # Rows are staged per page and committed only if the whole page parses,
      # so a URL recorded in `not_working_links` never leaves partial rows in
      # `data` (retrying it later cannot create duplicates).
      page_rows = []
      for article in articles:
        head = article.find('div', class_ = 'head')
        title_ar = head.text.strip()
        link_ar  = head.find('a').get('href')
        tag_items = article.find('div', class_ = 'tags-list').find_all('div', class_ = 'tags-list_item')
        tags     = ','.join(item.find('a').text.strip() for item in tag_items)
        page_rows.append([title_ar, link_ar, tags])
    except Exception:
      # Best-effort crawl: remember the page and move on. `Exception` (rather
      # than a bare `except`) lets Ctrl-C / SystemExit still stop the run.
      not_working_links.append(link_)
    else:
      data.extend(page_rows)

  # Checkpoint after every category so a later crash does not lose earlier work.
  pd.DataFrame(data, columns = ['title','link','tags']).to_csv('data.csv', index = False)

100%|██████████| 646/646 [26:50<00:00,  2.49s/it]
100%|██████████| 689/689 [48:16<00:00,  4.20s/it]
100%|██████████| 696/696 [53:49<00:00,  4.64s/it]
100%|██████████| 291/291 [21:12<00:00,  4.37s/it]
100%|██████████| 156/156 [09:31<00:00,  3.66s/it]


### 2. Defining Function to Scrape Article Data

In [None]:
# Load the title/link/tags table that the link-scraping cell wrote to data.csv.
df = pd.read_csv('data.csv')

def download(st,nd,df):
  """Scrape the articles listed in ``df['link'][st:nd]`` and save the results.

  For every link the page is fetched and parsed, and the following fields are
  extracted on a best-effort basis (a field that cannot be extracted is stored
  as NaN instead of aborting the run): post id, title, article text, author
  name and id, tags, image URLs and image count.

  Side effects:
    * the article text is written to ``articles/<id>.txt`` (UTF-8); the
      ``articles`` directory is created if it does not exist;
    * one CSV named ``<st>_<nd>.csv`` with a row per link is written to the
      current directory when the slice is finished.

  Args:
    st: start position (inclusive) of the slice of ``df['link']`` to process.
    nd: end position (exclusive) of the slice.
    df: DataFrame with a ``link`` column holding the article URLs.

  Returns:
    None.
  """
  # Without this every open() below would fail and be silently recorded as NaN.
  # exist_ok=True keeps this safe when several worker processes start together.
  os.makedirs('articles', exist_ok=True)

  data = []

  for link in tqdm(df['link'][st:nd]):

    try:
      # The timeout stops one stalled connection from blocking the worker forever.
      r = requests.get(link, headers=headers, allow_redirects=False, timeout=30)
    except requests.RequestException:
      # A single unreachable URL must not kill the worker and discard every row
      # collected so far; keep the link with empty fields so it can be retried.
      data.append([np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, link, np.nan])
      continue

    soup = BeautifulSoup(r.text, 'html.parser')

    # Each field is extracted independently. `except Exception` keeps the
    # best-effort behaviour but no longer swallows KeyboardInterrupt/SystemExit.
    try:
      # The post id is the trailing part of the <body> class containing 'postid'.
      id_ = [cls for cls in soup.find('body').get('class') if 'postid' in cls][0].split('-')[-1]
    except Exception:
      id_ = np.nan
    try:
      title = soup.find('h1').text.strip()
    except Exception:
      title = np.nan
    try:
      text = soup.find('article').find('div', class_ = 'text').text.strip()
    except Exception:
      text = np.nan
    try:
      name_div    = soup.find('div', class_ = 'name')
      author_name = name_div.text.strip()
      # Fifth '/'-separated piece of the author link is used as the author id.
      author_id   = name_div.find('a').get('href').split('/')[4].strip()
    except Exception:
      author_name = np.nan
      author_id   = np.nan
    try:
      tags = ','.join([li.text for li in soup.find('div', class_ = 'improved').find_all('li')])
    except Exception:
      tags = np.nan
    try:
      srcs = [img.get('src') for img in soup.find('article').find('div', class_ = 'text').find_all('img')]
      # join() runs first: an <img> without `src` makes it raise, and then both
      # fields are recorded as NaN together.
      img_links   = ','.join(srcs)
      no_of_img   = len(srcs)
    except Exception:
      img_links = np.nan
      no_of_img = np.nan

    # Only write a file when both the id and the text were extracted; otherwise
    # an empty/partial file would be left behind for a row that reports NaN.
    file_path = np.nan
    if isinstance(id_, str) and isinstance(text, str):
      candidate = 'articles/' + id_ + '.txt'
      try:
        # `with` guarantees the handle is closed even if the write fails;
        # explicit UTF-8 avoids platform-dependent encoding errors.
        with open(candidate, 'w', encoding='utf-8') as fd:
          fd.write(text)
        file_path = candidate
      except (OSError, UnicodeError):
        file_path = np.nan

    data.append([id_, title, author_name, author_id, tags, no_of_img,file_path,link,img_links])

  df_ = pd.DataFrame(data, columns = ['id','title','author_name','author_id',
                                     'tags','no_of_imgs','file_path','link','img_links'])
  df_.to_csv(str(st) + '_' + str(nd) + '.csv', index = False)

### 3. Scraping Articles Data with MultiProcessing

In [None]:
# Fan the link table out over worker processes, one `download` call per slice.
CHUNK_SIZE = 5000   # links per worker (the last worker takes whatever remains)
N_WORKERS = 10

# Slice boundaries 0, 5000, ..., 45000, len(df). The first slice starts at 0
# (not 1) so that the first row of `df` is scraped as well.
bounds = [i * CHUNK_SIZE for i in range(N_WORKERS)] + [len(df)]

workers = [
  multiprocessing.Process(target = download, args = (st, nd, df))
  for st, nd in zip(bounds[:-1], bounds[1:])
]

# Start every worker before joining any of them so they run concurrently.
for worker in workers:
  worker.start()

for worker in workers:
  worker.join()

  soup = BeautifulSoup(r.text, 'html.parser')
 32%|███▏      | 2587/8000 [1:07:32<3:26:00,  2.28s/it]

### 4. Combining the Dataset

In [None]:
def chunk_bounds(name):
  """Return ``(start, end)`` for a file named ``<start>_<end>.csv``, else ``None``.

  Used to recognise the per-slice files written by ``download`` and to tell
  them apart from other CSVs in the working directory (``data.csv`` and a
  previously written ``final.csv`` have a different role and must not be merged).
  """
  stem, ext = os.path.splitext(name)
  parts = stem.split('_')
  if ext != '.csv' or len(parts) != 2 or not all(p.isdecimal() for p in parts):
    return None
  return int(parts[0]), int(parts[1])


# Per-slice result files only, ordered by their start index so the combined
# frame does not depend on the arbitrary order returned by os.listdir().
csvs = sorted((name for name in os.listdir() if chunk_bounds(name) is not None),
              key = chunk_bounds)

# Read everything first and concatenate once (concatenating inside the loop
# copies the growing frame on every iteration).
frames = [pd.read_csv(name) for name in csvs]
df_ = pd.concat(frames, ignore_index = True) if frames else pd.DataFrame()

49434

In [None]:
# Write the combined frame to final.csv, leaving out the DataFrame index column.
df_.to_csv('final.csv', index = False)