# Dependencies and Imports

In [None]:
%%capture
!pip install BeautifulSoup
!pip install mwparserfromhell
!pip install wikipedia

In [None]:
import requests
from bs4 import BeautifulSoup
import json
import mwparserfromhell
from collections import deque
import itertools
import pandas as pd
import wikipedia
import random
import time

# Load FAQ Questions

First, download the spreadsheet containing the questions. These questions will be used to gather the initial wiki links to start with.<br>
Download the FAQ spreadsheet : https://docs.google.com/spreadsheets/d/1l9yLalzMYzoQZaMc7P3is446En0EzFyeAAJO5Jm1ENA/edit?usp=sharing<br>

In [None]:
# Read the FAQ export and keep the question column; the search cell below
# iterates over `queries`.
faq_csv_path = '/content/Most Frequently Asked Questions In Nutrition Domain.csv'
df = pd.read_csv(faq_csv_path)
queries = df['Questions']

# Gather Initial Wiki Links

Since sending a large number of requests to Google can get us blocked, we are going to use freely available proxies to avoid that.<br> We load the proxies from https://sslproxies.org/<br>

In [None]:
def LoadUpProxies(timeout=10):
    """Download the free proxy list published on sslproxies.org.

    The list is read from the text of the first <textarea> on the page: the
    text is split on blank lines and the second paragraph is taken as the
    proxy listing, one entry per line (presumably ``host:port`` -- verify
    against the live page).

    Args:
        timeout: Seconds to wait for the site before giving up, so an
            unresponsive server cannot hang the notebook.

    Returns:
        list[str]: The non-blank, whitespace-stripped lines of the listing.

    Raises:
        requests.RequestException: The request failed, timed out, or came
            back with an HTTP error status.
        IndexError: The page no longer has a <textarea> or the expected
            blank-line-separated layout.
    """
    url = 'https://sslproxies.org/'
    response = requests.get(url, timeout=timeout)
    # Fail here on an error page instead of trying to parse it.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'lxml')
    raw_listing = soup.select('textarea')[0].contents[0]
    proxy_section = raw_listing.split('\n\n')[1]
    # Strip each line so stray whitespace / carriage returns never end up
    # inside a proxy address.
    return [line.strip() for line in proxy_section.split('\n') if line.strip()]


# Endless round-robin over the downloaded proxies.
proxyBuffer = itertools.cycle(LoadUpProxies())

Now we use the questions from the FAQ spreadsheet to gather the initial wiki links. <br> We also save the titles periodically in case a problem arises in the middle of execution, so we don't have to start from scratch again.

In [None]:
GOOGLE_SEARCH_URL = "https://www.google.com/search"
WIKI_RESULT_PREFIX = "/url?q=https://en.wikipedia.org/wiki/"
TITLES_CHECKPOINT_PATH = 'wiki_titles.txt'
REQUEST_TIMEOUT_SECONDS = 10
SAVE_EVERY_N_QUESTIONS = 10


def _wiki_title_from_href(href):
    """Return the Wikipedia page title embedded in a Google result link.

    Returns None when ``href`` is missing or is not an English-Wikipedia
    result link. Everything from the first "&sa" onwards is dropped; when
    "&sa" is absent the title is kept whole (slicing with ``find()``'s -1
    would silently cut off its last character).
    """
    if href is None:
        return None
    start = href.find(WIKI_RESULT_PREFIX)
    if start == -1:
        return None
    title = href[start + len(WIKI_RESULT_PREFIX):]
    return title.split("&sa", 1)[0]


def _save_titles(titles, path=TITLES_CHECKPOINT_PATH):
    """Overwrite ``path`` with one title per line."""
    with open(path, 'w') as f:
        f.writelines(f"{title}\n" for title in titles)


wiki_titles = []
count = 0

for query in queries:
    # Blank spreadsheet rows are read by pandas as NaN (a float); skip them.
    if not isinstance(query, str) or not query.strip():
        continue

    print(f"Question Count : {count}")
    # Let requests build the query string so characters such as '&', '#' or
    # '?' inside a question are URL-encoded instead of corrupting the URL.
    search_params = {"q": f"{query} wikipedia"}

    while True:
        proxy = next(proxyBuffer)
        proxy_url = proxy if "://" in proxy else f"http://{proxy}"
        time.sleep(2)  # be gentle: pause before every attempt
        try:
            page = requests.get(
                GOOGLE_SEARCH_URL,
                params=search_params,
                # requests selects the proxy by the scheme of the target URL.
                # The search URL is https, so an "http"-only mapping would
                # never be used and every request would go out directly.
                proxies={"http": proxy_url, "https": proxy_url},
                timeout=REQUEST_TIMEOUT_SECONDS,
            )
            # A block / rate-limit response is a failure for this proxy,
            # not a successful search with zero results.
            page.raise_for_status()
        except requests.RequestException:
            # Only network/HTTP errors are retried; KeyboardInterrupt still
            # stops the cell.
            print(f'Proxy failed: {proxy}') # proxy failed, try the next one
            continue

        soup = BeautifulSoup(page.content, 'html.parser')
        for anchor in soup.find_all("a"):
            title = _wiki_title_from_href(anchor.get('href'))
            if title:
                wiki_titles.append(title)
        count += 1
        break

    if count % SAVE_EVERY_N_QUESTIONS == 0:
        _save_titles(wiki_titles)

# Final save so titles gathered after the last periodic checkpoint are kept.
_save_titles(wiki_titles)

This code can be used to read the saved **wiki_titles.txt** file above.

In [None]:
# For reading the saved titles from the file

# wiki_titles = list()
# with open('wiki_titles.txt', 'r') as wiki_read_file:
#   wiki_titles = wiki_read_file.read().split("\n")

# Scrape Wikipedia
Using the gathered wiki links, we now scrape the data from those pages and also collect the links present on each page. The cycle continues until we reach the maximum number of articles we need or there are no more links.<br><br>
**Note:** In case the connection to the notebook is lost, we don't want to lose all the progress we have made, so we periodically save the data to Google Drive. You can save to your own Google Drive or, if you are on a PC, just use a local path.

In [None]:
#----------Change these paths to your paths---------------
WIKI_DATA_PATH = '/content/drive/MyDrive/ColabData/wiki_data/wikidata.txt'
WIKI_QUEUE_PATH = '/content/drive/MyDrive/ColabData/wiki_data/wiki_titles_extended.txt'
CHECKPOINT_EVERY_N_PAGES = 50


def _fetch_wiki_page(title):
    """Return the wikipedia page object for ``title``, or None if unavailable.

    When the title is ambiguous, one of the offered options is picked at
    random and tried once; if that also fails (or no options are offered)
    None is returned.
    """
    try:
        return wikipedia.page(title, auto_suggest=False)
    except wikipedia.DisambiguationError as e:
        if not e.options:
            return None
        try:
            # The chosen option can itself be ambiguous or missing.
            return wikipedia.page(random.choice(e.options), auto_suggest=False)
        except Exception:
            return None
    except Exception:
        # `Exception` (not a bare except) so KeyboardInterrupt still stops the crawl.
        return None


def _save_crawl_checkpoint(articles, pending_titles):
    """Write the scraped article texts (JSON) and the still-pending titles to disk."""
    with open(WIKI_DATA_PATH, 'w') as wiki_file:
        wiki_file.write(json.dumps(articles))

    with open(WIKI_QUEUE_PATH, 'w') as queue_file:
        # The current title has already been popped, so every remaining
        # entry is still pending and must be saved.
        queue_file.writelines(f"{pending}\n" for pending in pending_titles)
    print("Writing to file...Done")


title_queue = deque(wiki_titles)
max_articles_to_retrieve = 10000
count = 0
data = []
visited_titles = set()
nonexistant_pages = set()
# Every title that has ever been enqueued, so a link is queued at most once.
queued_titles = set(wiki_titles)

# Stop when the queue is exhausted OR the article cap is reached. With `or`
# the loop would pop from an empty queue (IndexError) and would never honour
# the cap while links remain.
while title_queue and len(visited_titles) < max_articles_to_retrieve:
    title = title_queue.popleft()

    # The seed list can contain the same title several times.
    if title in visited_titles or title in nonexistant_pages:
        continue

    page = _fetch_wiki_page(title)
    if page is None:
        nonexistant_pages.add(title)
        continue

    visited_titles.add(title)

    try:
        data.append(page.content)
    except Exception:
        print(f"Could not read content for: {title}")

    count += 1
    print(f"Pages Scrapped Count : {count}")

    try:
        more_titles = set(page.links)
    except Exception:
        # No links available; still fall through to the checkpoint below.
        more_titles = set()

    new_titles = more_titles - queued_titles
    queued_titles.update(new_titles)
    title_queue.extend(new_titles)

    if count % CHECKPOINT_EVERY_N_PAGES == 0:
        _save_crawl_checkpoint(data, title_queue)

# Final save so pages scraped after the last periodic checkpoint are kept.
_save_crawl_checkpoint(data, title_queue)

You can load the saved data using the code below in notebooks where this is needed.

In [None]:
# Restore the scraped article texts from the JSON file saved on Google Drive.
saved_data_path = '/content/drive/MyDrive/ColabData/wiki_data/wikidata.txt'
with open(saved_data_path, 'r') as saved_file:
    data = json.loads(saved_file.read())