# Web Scraping with BeautifulSoup

### Importing the packages

In [None]:
# Load the packages required
import requests
from bs4 import BeautifulSoup

### Making a GET request

In [None]:
# Defining the URL of the site
base_site = "https://www.babnet.net/"

# Making a GET request.
# The timeout (in seconds) keeps the notebook from hanging forever if the server never answers.
response = requests.get(base_site, timeout=30)

# Fail fast on an HTTP error status (4xx/5xx) instead of parsing an error page further down.
# response.status_code == 200 means everything is OK.
response.raise_for_status()

### Extracting the HTML content

In [None]:
# Extracting the raw body of the response (the page's HTML)
html = response.content

# To check that the reply is indeed HTML, inspect its first 100 characters:
#html[:100]

### Making the soup

In [None]:
# Parse the downloaded HTML into a BeautifulSoup tree so that its content can be searched easily.
# "html.parser" is the parser bundled with Python, so no extra installation is needed.
soup = BeautifulSoup(markup=html, features="html.parser")

### Exporting the HTML to a file

In [None]:
# It is extremely useful to be able to check this file when searching for where some info is located
# or to see how the document was parsed
# Exporting the HTML to a file

##with open('C:/Users/FAHD/Desktop/Formation/Web Scraping/AI Squad/BabNet/babnet.html', 'wb') as file:
##    file.write(soup.prettify('utf-8'))

# the 'with' statement is shorthand for a 'try-finally' block: the file is closed automatically
# open is a function for opening/creating a file to edit
# the 'wb' argument signifies the mode in which to edit the file - Writing in Bytes format
# .prettify() reformats the HTML code with additional indentation for better readability

# Searching and navigating the HTML tree

In [None]:
# Tags can be filtered on their attributes as well as on their name:
# here, every <h2> whose class attribute is exactly 'post-title arabi'
x1 = soup.find_all('h2', attrs={'class': 'post-title arabi'})

In [None]:
# Every <h2> whose class attribute is exactly 'post-title title-medium arabi'
x2 = soup.find_all('h2', attrs={'class': 'post-title title-medium arabi'})

In [None]:
# Every <h2> whose class attribute is exactly 'post-title title-small arabi'
x3 = soup.find_all('h2', attrs={'class': 'post-title title-small arabi'})

In [None]:
# Gather the three groups of headline tags into a single list, keeping their order
x4 = [*x1, *x2, *x3]

In [None]:
# Create a BeautifulSoup object from the string form of our list of headline tags,
# so that the combined headlines can be searched like any other parsed document
aa = BeautifulSoup(markup=str(x4), features="html.parser")

In [None]:
#Let us check the type of our variable
#type(aa)

In [None]:
# Collect every anchor (<a>) tag found inside the headlines
links = aa.find_all(name='a')

In [None]:
# The href values found on the page may be relative URLs.
# urljoin is imported here so they can be turned into absolute URL addresses.
from urllib.parse import urljoin

# Keep only the anchors that actually carry a non-empty href:
# link.get('href') returns None for an <a> tag without one, and urljoin(base, None)
# would silently return the base URL, adding the homepage to the list of articles.
relative_urls = [link.get('href') for link in links if link.get('href')]

In [None]:
# Resolve each collected href against the site's base address to obtain absolute URLs
full_urls = list(map(lambda href: urljoin(base_site, href), relative_urls))

## Scraping multiple pages automatically - Extracting all the text from the URLs

In [None]:
# The objective is to get all the useful text from those pages
# We will do that by extracting all text contained in a paragraph element,
# for all paragraphs on a page,
# for all pages (in full_urls)

In [None]:
# To avoid being rate-limited or blocked by the server, we should tell Python to wait between requests
# We import the time library for that
import time

In [None]:
# Scrape every page listed in full_urls and collect its headline and its article text.
#
# `titles` and `article` are kept index-aligned with `full_urls`: a page that cannot be
# fetched still receives an empty-string placeholder in both lists, so that zipping the
# three lists together later pairs every link with its own title and text.

# Seconds to wait for a server response before giving up on a page
REQUEST_TIMEOUT = 30

# Translation table deleting the whitespace control characters we do not want in the text
_STRIP_CHARS = str.maketrans('', '', '\n\t\r')


def _clean_text(tag):
    """Return the text of *tag* with newlines, tabs and carriage returns removed."""
    return tag.text.translate(_STRIP_CHARS)


# initialize lists to store titles and articles for each webpage
article = []
titles = []

# loop counter (1-based position of the current URL); stays 0 when there is nothing to scrape
i = 0

print('-------------------------- Beginning of Scraping --------------------------')

# Loop through each URL in full_urls
for i, url in enumerate(full_urls, start=1):

    # Uncomment to wait 1 second between requests and go easier on the server
    #time.sleep(1)

    # connect to the webpage; a network failure must not abort the whole run
    try:
        note_resp = requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as err:
        print('Request failed ({0}): Skipping URL #{1}: {2}'.format(err, i, url))
        article.append('')
        titles.append('')
        continue

    # checking if the request is successful
    if note_resp.status_code != 200:            # Something is wrong!
        print('Status code {0}: Skipping URL #{1}: {2}'.format(note_resp.status_code, i, url))
        article.append('')
        titles.append('')
        continue

    # Everything is OK: print the iteration number and the URL to keep track of place in loop
    print('URL #{0}: {1}'.format(i, url))

    # convert the page's HTML to a BeautifulSoup object, using Python's built-in parser
    # so that no additional parser package has to be installed
    note_soup = BeautifulSoup(note_resp.content, 'html.parser')

    # Removing ads from our pages
    for div in note_soup.find_all("div", {'class': 'noprint'}):
        div.decompose()

    # Removing source text
    for src in note_soup.find_all("span", {'class': 'source'}):
        src.decompose()

    # find the article bodies on the webpage
    note_pars = note_soup.find_all('div', class_='entry-content arabi')

    # find the title
    note_titles = note_soup.find_all('h2', class_='post-title arabi')

    # Transforming to text and cleaning every desired element in the page
    art = [_clean_text(p) for p in note_pars]
    tit = [_clean_text(t) for t in note_titles]

    # Append to our lists. The pieces are joined directly rather than slicing the repr of
    # the list, which would leak escape sequences (e.g. '\xa0') and quote/comma separators
    # into the stored text.
    article.append(' '.join(art))
    titles.append(' '.join(tit))

print('-------------------------- SCRAPING DONE --------------------------')


In [None]:
# Display the scraped article stored at index 4 (raises IndexError if the list is shorter)
article[4]

In [None]:
# Display the scraped article stored at index 3 (raises IndexError if the list is shorter)
article[3]

In [None]:
# Import pandas to create our dataframe 
import pandas as pd

In [None]:
# Build one (link, title, article) record per page; zip stops at the shortest of the three lists
records = list(zip(full_urls, titles, article))

# Turn the records into a DataFrame with one named column per field
df = pd.DataFrame(records, columns=['link', 'titles', 'article'])

In [None]:
# Display the DataFrame to inspect the scraped data
df

In [None]:
# Location of the CSV file to create
path = 'C:/Users/FAHD/Desktop/Formation/Web Scraping/AI Squad/BabNet/babnet.csv'

# Export the DataFrame to that file; the 'utf-8-sig' encoding writes UTF-8 preceded by a byte-order mark
df.to_csv(path_or_buf=path, encoding='utf-8-sig')