# Web Scraping from wikipedia

## API

Documentation here <https://pypi.org/project/Wikipedia-API/>

In [18]:
from typing import Dict
from wikipediaapi import Wikipedia

class WikiDict(Wikipedia):
    def __init__(self) -> None:
        super().__init__(language='en')

    def page_to_dict(self, page: str, feature_list=['title','summary','fullurl','sections','links']) -> Dict:
        search = self.page(page)
        page_dict = {}
        for feature in feature_list:
            page_dict[feature] = eval(f'search.{feature}')
        return page_dict

wiki = WikiDict()

In [19]:
import pandas as pd

df = pd.DataFrame({'Program':['Python (programming language)','SQL','JavaScript']})
df

Unnamed: 0,Program
0,Python (programming language)
1,SQL
2,JavaScript


In [20]:
df['wiki'] = df['Program'].map(wiki.page_to_dict)
df

Unnamed: 0,Program,wiki
0,Python (programming language),"{'title': 'Python (programming language)', 'su..."
1,SQL,"{'title': 'SQL', 'summary': 'SQL ( (listen) S-..."
2,JavaScript,"{'title': 'JavaScript', 'summary': 'JavaScript..."


In [21]:
df = df.join(pd.DataFrame(df['wiki'].tolist(), index=df.index))
df

Unnamed: 0,Program,wiki,title,summary,fullurl,sections,links
0,Python (programming language),"{'title': 'Python (programming language)', 'su...",Python (programming language),Python is an interpreted high-level general-pu...,https://en.wikipedia.org/wiki/Python_(programm...,[Section: History (1):\nPython was conceived i...,"{'""Hello, World!"" program': ""Hello, World!"" pr..."
1,SQL,"{'title': 'SQL', 'summary': 'SQL ( (listen) S-...",SQL,"SQL ( (listen) S-Q-L, ""sequel""; Structured Qu...",https://en.wikipedia.org/wiki/SQL,[Section: History (1):\nSQL was initially deve...,"{'.NET Framework': .NET Framework (id: ??, ns:..."
2,JavaScript,"{'title': 'JavaScript', 'summary': 'JavaScript...",JavaScript,"JavaScript (), often abbreviated as JS, is a p...",https://en.wikipedia.org/wiki/JavaScript,[Section: History (1):\n\nSubsections (6):\nSe...,"{'.js': .js (id: ??, ns: 0), '360 Secure Brows..."


## Webscraping

Tutorial here <https://www.geeksforgeeks.org/how-to-extract-wikipedia-data-in-python/>

In [8]:
import re
import requests
from bs4 import BeautifulSoup

class WikiSoup:
    def __init__(self) -> None:
        pass

    def page_to_dict(self, page):
        url = f'https://en.wikipedia.org/wiki/{page}'
        search = BeautifulSoup(requests.get(url).text).select('body')[0]
        paragraphs = []
        images = []
        links = []
        headings = []
        remaining_content = []
        # Iterate throught all tags
        for tag in search.find_all():
            if tag.name=="p": paragraphs.append(tag.text)   

            # For Image use img tag
            elif tag.name=="img": images.append(url+tag['src'])

            # For Anchor use a tag
            elif tag.name=="a":
                # convert into string and then check href available in tag or not
                if "href" in str(tag):
                # In href, there might be possible url is not there if url is not there
                    if "https://en.wikipedia.org/w/" not in str(tag['href']):
                        links.append(url+tag['href'])
                    else: links.append(tag['href'])
                        
            # Similarly check for headings
            elif "h" in tag.name: headings.append(tag.text)
                    
            # Remain content will store here
            else: remaining_content.append(tag.text)
        
        page_dict = {}
        for feature in ['paragraphs','images','links','headings','remaining_content']:
            page_dict[feature] = eval(feature)
        
        return page_dict

wiki = WikiSoup()

In [9]:
import pandas as pd

df = pd.DataFrame({'Program':['Python (programming language)','SQL','JavaScript']})
df

Unnamed: 0,Program
0,Python (programming language)
1,SQL
2,JavaScript


In [10]:
df['wiki'] = df['Program'].map(wiki.page_to_dict)
df

Unnamed: 0,Program,wiki
0,Python (programming language),"{'paragraphs': [' ', 'Python is an interpreted..."
1,SQL,{'paragraphs': ['SQL (/ˌɛsˌkjuːˈɛl/ (listen) S...
2,JavaScript,"{'paragraphs': [' ', 'JavaScript (/ˈdʒɑːvəˌskr..."


In [11]:
df = df.join(pd.DataFrame(df['wiki'].tolist(), index=df.index))
df

Unnamed: 0,Program,wiki,paragraphs,images,links,headings,remaining_content
0,Python (programming language),"{'paragraphs': [' ', 'Python is an interpreted...","[\n, Python is an interpreted high-level gener...",[https://en.wikipedia.org/wiki/Python (program...,[https://en.wikipedia.org/wiki/Python (program...,"[Python (programming language), Paradigm, Desi...","[, , \n\n\n\n\n\nPython (programming language)..."
1,SQL,{'paragraphs': ['SQL (/ˌɛsˌkjuːˈɛl/ (listen) S...,"[SQL (/ˌɛsˌkjuːˈɛl/ (listen) S-Q-L,[4] /ˈsiːkw...",[https://en.wikipedia.org/wiki/SQL//upload.wik...,"[https://en.wikipedia.org/wiki/SQL#mw-head, ht...","[SQL, Paradigm, Family, Designed by, Developer...","[, , \n\n\n\n\nSQL\n\nFrom Wikipedia, the free..."
2,JavaScript,"{'paragraphs': [' ', 'JavaScript (/ˈdʒɑːvəˌskr...","[\n, JavaScript (/ˈdʒɑːvəˌskrɪpt/),[9] often a...",[https://en.wikipedia.org/wiki/JavaScript//upl...,[https://en.wikipedia.org/wiki/JavaScript/wiki...,"[JavaScript, Paradigm, Designed by, First appe...","[, , \n\n\n\n\n\n\nJavaScript\n\nFrom Wikipedi..."
