# Web Scraping from Wikipedia

## API

The Wikipedia-API package documentation is available at <https://pypi.org/project/Wikipedia-API/>.

In [2]:
from typing import Dict, List, Sequence
from urllib.parse import urljoin

from wikipediaapi import Wikipedia

class WikiDict(Wikipedia):
    """Wikipedia client that collects selected page attributes into a dict.

    Uses the Spanish-language edition unless another language is requested.
    """

    def __init__(self, language: str = 'es', **kwargs) -> None:
        """Create the client.

        Args:
            language: Wikipedia language edition code. Defaults to ``'es'``.
            **kwargs: Forwarded unchanged to ``Wikipedia.__init__``.
                NOTE(review): recent Wikipedia-API releases appear to require
                a ``user_agent`` argument -- pass it here if the installed
                version needs it; confirm against the installed release.
        """
        super().__init__(language=language, **kwargs)

    def page_to_dict(
        self,
        page: str,
        # Tuple (immutable) so the shared default can never be mutated.
        feature_list: Sequence[str] = ('title', 'summary', 'fullurl', 'sections', 'links'),
    ) -> Dict:
        """Look up a page and return the requested attributes as a dict.

        Args:
            page: Title handed to ``self.page`` to obtain the page object.
            feature_list: Names of attributes to read from the page object.

        Returns:
            A dict mapping each name in ``feature_list`` to the value of the
            attribute of that name on the page object.

        Raises:
            Whatever ``getattr`` raises for a name the page object does not
            provide (normally ``AttributeError``).
        """
        search = self.page(page)
        # Plain attribute lookup: unlike eval(), a feature name can never be
        # executed as an arbitrary Python expression.
        return {feature: getattr(search, feature) for feature in feature_list}


wiki = WikiDict()

In [37]:
import pandas as pd

# One row per technology whose Wikipedia article will be looked up.
df = pd.DataFrame(['Python', 'SQL', 'JavaScript'], columns=['Program'])
df

Unnamed: 0,Program
0,Python
1,SQL
2,JavaScript


In [38]:
# Fetch the feature dict for every program name and store it next to the name.
df['wiki'] = [wiki.page_to_dict(program) for program in df['Program']]
df

Unnamed: 0,Program,wiki
0,Python,"{'title': 'Python', 'summary': 'Python es un l..."
1,SQL,"{'title': 'SQL', 'summary': 'SQL (por sus sigl..."
2,JavaScript,"{'title': 'JavaScript', 'summary': 'JavaScript..."


In [45]:
# Expand each page dict into one column per feature, aligned on df's index.
wiki_features = pd.DataFrame(df['wiki'].tolist(), index=df.index)

# Drop feature columns left over from a previous run of this cell before
# joining: join() raises ValueError on overlapping column names, so without
# this step the cell fails when it is executed a second time.
df = df.drop(columns=wiki_features.columns, errors='ignore').join(wiki_features)
df

Unnamed: 0,Program,wiki,title,summary,fullurl,sections,links
0,Python,"{'title': 'Python', 'summary': 'Python es un l...",Python,Python es un lenguaje de programación interpre...,https://es.wikipedia.org/wiki/Python,[Section: Historia (1):\nPython fue creado a f...,{'ABC (lenguaje de programación)': ABC (lengua...
1,SQL,"{'title': 'SQL', 'summary': 'SQL (por sus sigl...",SQL,SQL (por sus siglas en inglés Structured Query...,https://es.wikipedia.org/wiki/SQL,[Section: Orígenes y evolución (1):\nLos oríge...,"{'1986': 1986 (id: ??, ns: 0), '1989': 1989 (i..."
2,JavaScript,"{'title': 'JavaScript', 'summary': 'JavaScript...",JavaScript,JavaScript (abreviado comúnmente JS) es un len...,https://es.wikipedia.org/wiki/JavaScript,[Section: Historia (1):\n\nSubsections (3):\nS...,"{'AJAX': AJAX (id: ??, ns: 0), 'Accesibilidad ..."


## Web scraping

In [None]:
# Import Module
from bs4 import *
import requests
 
# Page to scrape
url = "https://en.wikipedia.org/wiki/Beautiful_Soup_(HTML_parser)"

# Fetch the page. The timeout stops a hung connection from blocking the
# notebook, and raise_for_status() surfaces HTTP errors instead of silently
# parsing an error page.
r = requests.get(url, timeout=10)
r.raise_for_status()

# Restrict parsing to the <body> element
soup = BeautifulSoup(r.text, 'html.parser').select('body')[0]

# Buckets for the extracted content
paragraphs = []
images = []
link = []
heading = []
remaining_content = []

# The six HTML heading levels. Matching the exact tag name matters: a
# substring test such as `"h" in tag.name` would also catch th, thead,
# header, path, ...
HEADING_TAGS = {"h1", "h2", "h3", "h4", "h5", "h6"}

# Iterate through all tags under <body> and sort each one into a bucket
for tag in soup.find_all():

    # Paragraphs: keep the text inside the <p> tag
    if tag.name == "p":
        paragraphs.append(tag.text)

    # Images: resolve the source against the page URL so relative and
    # protocol-relative paths become valid absolute URLs
    elif tag.name == "img":
        src = tag.get('src')
        # An <img> without a src attribute has nothing to record
        if src is not None:
            images.append(urljoin(url, src))

    # Anchors: resolve the target against the page URL; urljoin leaves
    # already-absolute links untouched
    elif tag.name == "a":
        href = tag.get('href')
        # Read the attribute itself rather than searching the tag's markup,
        # so an anchor without href is skipped instead of raising KeyError
        if href is not None:
            link.append(urljoin(url, href))

    # Headings: h1 through h6 only
    elif tag.name in HEADING_TAGS:
        heading.append(tag.text)

    # Everything else is stored here
    else:
        remaining_content.append(tag.text)

print(paragraphs, images, link, heading, remaining_content)