In [1]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime,timedelta
import pandas as pd
import re

In [2]:
base_url = "https://www.moneycontrol.com/news/business/economy"

is_appen_on_previous = False

if is_appen_on_previous:
    df = pd.read_csv('News-economy.csv')
    print("Last News on : ",df.iloc[0]['date time'])
    print("Total news : ",len(df))
    
news_data = []

In [3]:
def extractNewsData(news):
    """
    Get insights of news component
    """
    news_json = {}
    
    # Skip ad tag;
    if not news.get('class') or 'clearfix' not in news.get('class'):
        return "ad";
    
    a = news.find('a')
    span = news.find('span')
    
    news_json['link'] = a.get('href')
    news_json['title'] = a.get('title')
    news_json['date time'] = span.text
    
    # Append on previous data
    if is_appen_on_previous and news_json['date time'] == df.iloc[0]['date time']:
        return None
    
    p = news.find_all('p')[0]
    news_json['desc'] = p.text
    news_json['content'] = getFullNewsText(news_json['link'])
#     news_json['stock id'] = getStockId(news_json['link'])
    return news_json

def getFullNewsText(link:str):
    """
    Get Full response or Full News Decription
    """
    response = requests.get(link)
    soup = BeautifulSoup(response.text,'html.parser')  
    try:
        paras = soup.find_all('div',id='contentdata')[0].find_all('p')
        text = ""
        for para in paras:
            text += para.text
    except Exception as err:
        try:
            text = soup.find('p','pro_artidesc').text
        except:
            return 
    
    return text


def fetchPage(page=None,verbose=True):
    """
    Fetch News Entire page and extract the data
    """
    
    url = base_url
    
    # fetching base page
    if page and page >= 2:
        url = f"{base_url}/page-{page}/"
    
    # for processing details
    if(verbose):
        print(url)
        
    response = requests.get(url)   # get new html data
    soup = BeautifulSoup(response.text,'html.parser')  # create soup  
    news_ul = soup.find_all("ul",id="cagetory")[0]   # extract news data conponent
    
    # For Skips Ad Tag
    if not news_ul:
        return True
    
    li = news_ul.find_all('li')  # get list of news
    for index,news in enumerate(li): # iterate news component for get more insights
        
        # logging
        if(verbose):
            print(index)
            
        # extract news insights
        news_json = extractNewsData(news)
        
        # return none if news already in DB not want to duplicate and want not to continue
        if not news_json:
            return None
        if news_json == 'ad':
            continue
        
        # save data
        news_data.append(news_json)
        
    # want not to continue if in the loop   
    return True   
 
# Convert Date string object to Date
def getDateObject(dateString:str,dateFormat='%B %d, %Y %I:%M %p'):
    dt_object = datetime.strptime(dateString, dateFormat)
    return dt_object

In [6]:
for page in range(20,30):
    print("fetching page -- ",page)
    if not fetchPage(page):
        break

fetching page --  20
https://www.moneycontrol.com/news/business/economy/page-20/
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
fetching page --  21
https://www.moneycontrol.com/news/business/economy/page-21/
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
fetching page --  22
https://www.moneycontrol.com/news/business/economy/page-22/
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
fetching page --  23
https://www.moneycontrol.com/news/business/economy/page-23/
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
fetching page --  24
https://www.moneycontrol.com/news/business/economy/page-24/
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
fetching page --  25
https://www.moneycontrol.com/news/business/economy/page-25/
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
2

In [8]:
print(news_data)

725

In [10]:
news_df = pd.DataFrame(news_data)
news_df

Unnamed: 0,link,title,date time,desc,content
0,https://www.moneycontrol.com/news/business/bud...,CEA Nageswaran says FY25 Budget's fiscal defic...,"February 03, 2024 09:50 AM IST",Nageswaran also rejected the K-shaped recovery...,The government's decision to target a fiscal d...
1,https://www.moneycontrol.com/news/business/bud...,Jobs outside formal economy not getting counte...,"February 02, 2024 08:09 PM IST","In a post-Budget interview with Rahul Joshi, E...",Finance Minister Nirmala Sitharaman has called...
2,https://www.moneycontrol.com/news/business/bud...,Sitharaman says ratings agencies should take n...,"February 02, 2024 07:04 PM IST",The Indian government has been in a running ba...,Finance Minister Nirmala Sitharaman said globa...
3,https://www.moneycontrol.com/news/business/the...,"The Reading List: February 2, 2024","February 02, 2024 06:15 PM IST",A selection of articles and social media gems ...,In a world strapped fo...
4,https://www.moneycontrol.com/news/business/mar...,"Business in the Week Ahead (February 5-9, 2024)","February 02, 2024 05:37 PM IST",The just-concluded Budget has stuck to the fis...,"The Budget, even thoug..."
...,...,...,...,...,...
720,https://www.moneycontrol.com/news/opinion/mpc-...,"MPC Meeting: Amid too many risks, status quo i...","December 09, 2023 10:21 AM IST",The combined fiscal-monetary risks and climate...,The RBI held its last Monetary Policy Committe...
721,https://www.moneycontrol.com/news/business/eco...,RBI Policy: No forward guidance again from Gov...,"December 08, 2023 06:13 PM IST",The ongoing uncertainty may have clouded the R...,For an interest rate decision that could be se...
722,https://www.moneycontrol.com/news/business/eco...,MC Interview: Hindalco has ability to invest i...,"December 09, 2023 06:48 AM IST",Pai expects India demand for aluminium and cop...,Hindalco Industries is well positioned to inve...
723,https://www.moneycontrol.com/news/business/ipo...,"Business in the Week Ahead (December 11-15, 2023)","December 08, 2023 03:46 PM IST",At least two new IPOs will open for subscripti...,The monetary policy co...


In [11]:
news_df.to_csv('economy_money_control.csv')

In [4]:
def fetchByLink(link:str,file_name:str,batch_size=10,total_nums=30):
    global news_data
    news_data = []
    
    global base_url
    base_url = link
    batches = int(total_nums/batch_size)
    
    for batch in range(batches):
        print('batch -- ',batch)
        for page in range(batch*batch_size,(batch+1)*batch_size):
            if page == 0:
                continue
            print("fetching page -- ",page)
            if not fetchPage(page):
                break
    news_df = pd.DataFrame(news_data)
    news_df.to_csv(file_name)

In [10]:
fetchByLink(link='https://www.moneycontrol.com/news/business-173.html',file_name="company-result-moneycontrol.csv")

batch --  0
fetching page --  1
https://www.moneycontrol.com/news/business-173.html
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
fetching page --  2
https://www.moneycontrol.com/news/business-173.html/page-2/
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
fetching page --  3
https://www.moneycontrol.com/news/business-173.html/page-3/
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
fetching page --  4
https://www.moneycontrol.com/news/business-173.html/page-4/
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
fetching page --  5
https://www.moneycontrol.com/news/business-173.html/page-5/
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
fetching page --  6
https://www.moneycontrol.com/news/business-173.html/page-6/
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21


In [9]:
news_df = pd.DataFrame(news_data)
news_df.to_csv("business-full-moneycontrol.csv")

In [20]:

def m_a(a):
    a.append(6)
# a = [6]
m_a(a)
a

[6, 6, 6, 6, 6, 6, 6, 6, 6, 6]

In [1]:
news_df

NameError: name 'news_df' is not defined