## Project Title: Implement a newspaper scraper and create an ML pipeline

### Part 1: Web scraping

In [1]:
import pandas as pd
import glob
import os
import itertools
import requests
import os.path as path
import numpy as np
import re

from functools import reduce
from bs4 import BeautifulSoup
from datetime import datetime

In [2]:
# Location of the results accumulated by earlier runs of this notebook.
DATA_PATH = '../input_data/news_paper_data.csv'
columns = ['node_id','title', 'date', 'url','category', 'article']

if path.exists(DATA_PATH):
    # Read node_id as text. The scraper produces string ids, and an
    # integer column would never compare equal to them, so the duplicate
    # check would re-add already stored articles on every run.
    df = pd.read_csv(DATA_PATH, dtype={'node_id': str})
else:
    # First run: start from an empty frame with the expected columns.
    df = pd.DataFrame(columns=columns)

In [3]:
# Preview the first rows of df.
df.head()

Unnamed: 0,node_id,title,date,url,category,article
0,3074331,Consumer rights body’s oil price monitoring st...,19-07-2022,/business/news/consumer-rights-bodys-oil-price...,business,The Directorate of National Consumer Rights Pr...
1,3074126,Chattogram shares bleed for another day,19-07-2022,/business/news/chattogram-shares-bleed-another...,business,Shares on the Chittagong Stock Exchange (CSE) ...
2,3073706,bKash’s instant add money service at Dev Bank,19-07-2022,/business/organisation-news/news/bkashs-instan...,business,Clients of Bangladesh Development Bank (BDB) c...
3,3074291,Textile millers demand 360 days for deferred L...,19-07-2022,/business/news/textile-millers-demand-360-days...,business,The leaders of Bangladesh Textile Mills Associ...
4,3074276,"Tk 25,000 crore refinance scheme unveiled for ...",19-07-2022,/business/news/tk-25000-crore-refinance-scheme...,business,"Bangladesh Bank today unveiled a Tk 25,000 cro..."


In [4]:
# Number of rows in df at this point, i.e. before the scraping cell runs.
print(f'Check previous data: {len(df)}')

Check previous data: 164


In [5]:
# base url of the newspaper
base_url = 'https://www.thedailystar.net'

# all important categories
categories = ['business', 'sports', 'entertainment']

# seconds to wait for a category page before giving up
REQUEST_TIMEOUT = 30


def _extract_node_id(url, category):
    """Return the numeric node id that ends an article URL, or None.

    A URL qualifies when its first non-empty path segment contains
    `category` and its last segment ends in '-<digits>'. URLs without such
    an id are presumably sub-category pages (per the original note) and
    are ignored.
    """
    parts = [part for part in url.split('/') if part.strip()]

    # An href such as '/' has no path segments at all.
    if not parts or category not in parts[0]:
        return None

    slug = parts[-1]
    if '-' not in slug:
        return None

    candidate = slug.rsplit('-', 1)[-1]
    return candidate if candidate.isnumeric() else None


# Ids already stored, normalised to text so they compare equal to the
# scraped (string) ids even when the CSV column was loaded as integers.
# Built once instead of once per headline.
known_ids = set(df['node_id'].astype(str))

# Rows discovered in this run; added to df in one step after the loop
# (DataFrame.append no longer exists in pandas 2.x).
new_rows = []

for category in categories:

    try:
        page = requests.get(base_url + '/' + category, timeout=REQUEST_TIMEOUT)
        page.raise_for_status()
    except requests.RequestException as ex:
        # One unreachable category should not stop the remaining ones.
        print(f'Skipping category {category}: {ex}')
        continue

    soup = BeautifulSoup(page.content, 'html.parser')

    # On the category pages the headlines of interest sit under h2 or h3 tags.
    for heading in soup.find_all('h2') + soup.find_all('h3'):
        link = heading.find('a')

        # Some headings carry no link at all.
        if link is None:
            continue

        url = str(link.get('href'))
        node_id = _extract_node_id(url, category)

        if node_id is None or node_id in known_ids:
            continue

        known_ids.add(node_id)
        new_rows.append({'node_id': node_id, 'title': link.get_text(),
                         'url': url, 'category': category})

# counts new entries added by this run
new_count = len(new_rows)

if new_rows:
    if df.empty:
        # Avoid concatenating with an empty frame; keep the column layout.
        df = pd.DataFrame(new_rows, columns=df.columns)
    else:
        df = pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True)

df.head()

Unnamed: 0,node_id,title,date,url,category,article
0,3074331,Consumer rights body’s oil price monitoring st...,19-07-2022,/business/news/consumer-rights-bodys-oil-price...,business,The Directorate of National Consumer Rights Pr...
1,3074126,Chattogram shares bleed for another day,19-07-2022,/business/news/chattogram-shares-bleed-another...,business,Shares on the Chittagong Stock Exchange (CSE) ...
2,3073706,bKash’s instant add money service at Dev Bank,19-07-2022,/business/organisation-news/news/bkashs-instan...,business,Clients of Bangladesh Development Bank (BDB) c...
3,3074291,Textile millers demand 360 days for deferred L...,19-07-2022,/business/news/textile-millers-demand-360-days...,business,The leaders of Bangladesh Textile Mills Associ...
4,3074276,"Tk 25,000 crore refinance scheme unveiled for ...",19-07-2022,/business/news/tk-25000-crore-refinance-scheme...,business,"Bangladesh Bank today unveiled a Tk 25,000 cro..."


In [6]:
# Report the current size of df and how many rows were counted in new_count.
print(f'Total Number of data points are: {len(df)}')
print(f'New data point added: {new_count}')

Total Number of data points are: 245
New data point added: 81


In [7]:
# Show the last rows of df.
df.tail()

Unnamed: 0,node_id,title,date,url,category,article
240,3075111,The Colors of Youth: celebrating art across co...,,/entertainment/theatre-arts/news/the-colors-yo...,entertainment,
241,3074206,Portraying the bucolic beauty of Bangladesh,,/entertainment/theatre-arts/news/portraying-th...,entertainment,
242,3074006,Prachyanat to stage ‘Dumurkheko Manush’ today,,/entertainment/theatre-arts/news/prachyanat-st...,entertainment,
243,3066871,‘Beckoning Horizon’: Azmeer Hossain’s exhibit...,,/entertainment/theatre-arts/news/beckoning-hor...,entertainment,
244,3065521,Drik hosts regional photo contest in partnersh...,,/entertainment/theatre-arts/news/drik-hosts-re...,entertainment,


In [8]:
def article_parser(soup, node_id):
    """Extract the last-update date and the body text of one article page.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed HTML of the article page.
    node_id : str or int
        Article id; the article container is looked up by the element id
        "node-<node_id>".

    Returns
    -------
    tuple of (str, str)
        (date, article). `date` is formatted DD-MM-YYYY, or '' when it cannot
        be extracted. `article` is the text of every non-empty <p> inside the
        container joined by single spaces, or '' when the container is missing.
    """
    # str() so that ids read back from the CSV as integers work as well.
    article_context = soup.find(id="node-" + str(node_id))

    if article_context is None:
        # Nothing to parse; report it instead of failing on find_all below.
        print(f'Error: no element with id node-{node_id}')
        return '', ''

    only_date = ''

    try:
        dt_context = article_context.find("div", {"class": "date text-10"}).get_text()

        # Keep what follows the marker and drop commas; the remaining tokens
        # are expected to look like "Tue Jul 19 2022 10:00 PM" (weekday first).
        dt_input = str(dt_context).split('Last update on:')[1].strip().replace(',', '')
        dt_arr = dt_input.split()

        # Build the intermediate value in its own variable so a failed parse
        # leaves only_date empty rather than holding the unparsed text.
        raw_date = '-'.join(dt_arr[1:4])
        only_date = datetime.strptime(raw_date, '%b-%d-%Y').strftime('%d-%m-%Y')

    except (AttributeError, IndexError, ValueError) as ex:
        # Missing date element, missing marker, or unexpected date format.
        print(f'Error: {str(ex)}')

    paragraphs = (paragraph.get_text() for paragraph in article_context.find_all('p'))
    article = ' '.join(text for text in paragraphs if len(text) > 0)

    return only_date, article.strip()

In [11]:
# Fill in date and article text for every row that does not have an article yet.
for index, row in df.iterrows():
    # Ids loaded from the CSV may be integers; the parser builds an element
    # id string from this value.
    node_id = str(row['node_id'])
    stored_article = row['article']

    # pd.isna covers None and any NaN object; np.NaN is gone in NumPy 2.0 and
    # tuple membership only matched NaN by object identity.
    if not (pd.isna(stored_article) or stored_article == ''):
        continue

    print(f'Running {index+1}th obserbation, Fetching data for article id: {node_id} ... ... ...')

    # request the page of this particular article
    try:
        new_page = requests.get(base_url + row['url'], timeout=30)
        new_page.raise_for_status()
    except requests.RequestException as ex:
        # Leave the row empty so that a later run retries it.
        print(f'Error: {str(ex)}')
        continue

    # parse the article page
    current_soup = BeautifulSoup(new_page.content, 'html.parser')

    only_date, article_text = article_parser(current_soup, node_id)

    df.at[index, 'date'] = only_date
    df.at[index, 'article'] = article_text

Running 165th obserbation, Fetching data for article id: 3076556 ... ... ...
Running 166th obserbation, Fetching data for article id: 3076551 ... ... ...
Running 167th obserbation, Fetching data for article id: 3076416 ... ... ...
Running 168th obserbation, Fetching data for article id: 3076546 ... ... ...
Running 169th obserbation, Fetching data for article id: 3076541 ... ... ...
Running 170th obserbation, Fetching data for article id: 3076531 ... ... ...
Running 171th obserbation, Fetching data for article id: 3076526 ... ... ...
Running 172th obserbation, Fetching data for article id: 3076516 ... ... ...
Running 173th obserbation, Fetching data for article id: 3076501 ... ... ...
Running 174th obserbation, Fetching data for article id: 3076491 ... ... ...
Running 175th obserbation, Fetching data for article id: 3076206 ... ... ...
Running 176th obserbation, Fetching data for article id: 3076476 ... ... ...
Running 177th obserbation, Fetching data for article id: 3076471 ... ... ...

In [12]:
# First rows of df after the article-fetch loop.
df.head()

Unnamed: 0,node_id,title,date,url,category,article
0,3074331,Consumer rights body’s oil price monitoring st...,19-07-2022,/business/news/consumer-rights-bodys-oil-price...,business,The Directorate of National Consumer Rights Pr...
1,3074126,Chattogram shares bleed for another day,19-07-2022,/business/news/chattogram-shares-bleed-another...,business,Shares on the Chittagong Stock Exchange (CSE) ...
2,3073706,bKash’s instant add money service at Dev Bank,19-07-2022,/business/organisation-news/news/bkashs-instan...,business,Clients of Bangladesh Development Bank (BDB) c...
3,3074291,Textile millers demand 360 days for deferred L...,19-07-2022,/business/news/textile-millers-demand-360-days...,business,The leaders of Bangladesh Textile Mills Associ...
4,3074276,"Tk 25,000 crore refinance scheme unveiled for ...",19-07-2022,/business/news/tk-25000-crore-refinance-scheme...,business,"Bangladesh Bank today unveiled a Tk 25,000 cro..."


In [13]:
# Last rows of df after the article-fetch loop; in the saved output the
# date and article columns of these rows are populated.
df.tail()

Unnamed: 0,node_id,title,date,url,category,article
240,3075111,The Colors of Youth: celebrating art across co...,20-07-2022,/entertainment/theatre-arts/news/the-colors-yo...,entertainment,"""The Colors of Youth"", an exhibition run by Ab..."
241,3074206,Portraying the bucolic beauty of Bangladesh,21-07-2022,/entertainment/theatre-arts/news/portraying-th...,entertainment,Renowned artist Samar Majumder's third solo ex...
242,3074006,Prachyanat to stage ‘Dumurkheko Manush’ today,19-07-2022,/entertainment/theatre-arts/news/prachyanat-st...,entertainment,The certificate distribution ceremony for the ...
243,3066871,‘Beckoning Horizon’: Azmeer Hossain’s exhibit...,08-07-2022,/entertainment/theatre-arts/news/beckoning-hor...,entertainment,"Azmeer Hossain's fifth solo exhibition ""Beckon..."
244,3065521,Drik hosts regional photo contest in partnersh...,06-07-2022,/entertainment/theatre-arts/news/drik-hosts-re...,entertainment,The World Press Photo Foundation and Banglades...


In [14]:
# Persist the accumulated data set. The folder is created on demand because
# to_csv does not create missing directories, which would fail a first run
# on a fresh checkout. index=False keeps the row index out of the file.
os.makedirs('../input_data', exist_ok=True)
df.to_csv('../input_data/news_paper_data.csv', index=False)