# Scraping blog articles of ChinyBlog.pl with Python and BeautifulSoup

## First, we need to import necessary libraries

In [11]:
import requests
from bs4 import BeautifulSoup

# Let's define a function which will help us to scrap pages
def soup(url):
    response = requests.get(url)
    if response.status_code == 200:
        return BeautifulSoup(response.content, 'html.parser')
    else:
        return "Error"

In [2]:
url = 'https://www.chinyblog.pl/blog/'

# Let's first scrap all links of posts from the blog page of ChinyBlog.pl

## Since posts are dividied into subpages, we need to create a loop to get all aritcles from all subpages.

## As we can see in the url ("https://www.chinyblog.pl/blog/page/1/"),  the subpages can be easily accessed only by changing the number in the url.

In [3]:
# Let's define a list to store html of subpages
subpages_soups = []

# Insert a number of subpages that contains blog articles
subpages_number = 3

for page in range(subpages_number):
    # Scraping subpages
    subpages_soups.append(soup(url + "/page/" + str(page)))
    

# Let's extract all links for articles

In [4]:
# We gonna store all article links in newly created list
article_links = []

# Let's loop through our soups to extract all article links
for soup in subpages_soups:
    # Find all h2 which contain a link to an article
    links = soup.find_all("h2", class_ = "blog-entry-title entry-title")
    
    # Loop through all h2 to extract links only
    for link in links:
        link = link.find("a")['href']
        # Append link to article_links list
        article_links.append(link)

    

In [5]:
article_links[::5]

['https://www.chinyblog.pl/jak-powiedziec-przepraszam-po-chinsku/',
 'https://www.chinyblog.pl/zwyczaj-krepowania-stop-chinski-ideal-piekna/',
 'https://www.chinyblog.pl/chinska-czytanka-w-chinskiej-restauracji/',
 'https://www.chinyblog.pl/nowy-format-hsk/',
 'https://www.chinyblog.pl/jak-sie-przedstawic-po-chinsku-podstawy/',
 'https://www.chinyblog.pl/kuchnia-chinska-przepis-kurczak-kung-pao/']

# Great, we got all articles, so we can scrap each of them now

## Let's scrap each of articles first

In [18]:
articles_soups = []

for link in article_links:

    # Scrap each article
    article_soup = soup(link)

    # Append html to articles_soup list
    articles_soups.append(article_soup)


## Now, let's define a dictionary to store our data

In [40]:
articles_dict = {
    'Title': [],
    'Date': [],
    'Category': [],
    'Content': []
}

## Scraping titles

In [41]:
for soup in articles_soups:
    
    title = soup.find("h1", class_ = "title entry-title").text
    
    # Append title to our dictionary
    articles_dict['Title'].append(title)

## Scraping dates

In [42]:
for soup in articles_soups:
    
    date = soup.find("time", class_ = "entry-date published").text
    
    # Append date to our dictionary
    articles_dict['Date'].append(date)

## Scraping categories

In [43]:
for soup in articles_soups:
    
    category = soup.find("li", class_ = "meta category").text
    
    # Append category to our dictionary
    articles_dict['Category'].append(category)

## Scraping content

In [44]:
for soup in articles_soups:
    
    content = soup.find("div", class_ = "nv-content-wrap entry-content").text
 
    # Append content to our dictionary
    articles_dict['Content'].append(content)

# Let's save our data to excel csv file with the help of Pandas

In [46]:
import pandas as pd

In [51]:
articles_df = pd.DataFrame(articles_dict)
articles_df.head()

Unnamed: 0,Title,Date,Category,Content
0,Jak powiedzieć przepraszam po chińsku,15/06/2020,Język chiński,\nW języku chińskim istnieje wiele sposobów na...
1,Chińska czytanka: W chińskiej restauracji,11/06/2020,Język chiński,\nW serii chińska czytanka będziemy udostępnia...
2,Vlogi do nauki języka chińskiego na YouTube,06/06/2020,"Język chiński, Kultura",\nOglądanie vlogów po chińsku to doskonały spo...
3,Dzień Dziecka w Chinach,01/06/2020,Kultura,\nDzień dziecka podobnie jak w wielu krajach n...
4,Dzień Matki w Chinach,26/05/2020,Kultura,\nDzień Matki podobnie jak w wielu krajach obc...


## Save to csv

In [52]:
articles_df.to_csv("ChinBlog_articles.csv")