In [1]:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException,NoSuchAttributeException,NoSuchElementException
from selenium.webdriver.support.wait import WebDriverWait
from requests.exceptions import TooManyRedirects
import re
import os
import sys
import time
import requests
import string
from threading import Thread

In [3]:
def task_completion(home_dir):
    """Mark ``home_dir`` as finished by creating an empty ``.done`` file in it.

    ``task_completion_status`` checks for the same marker path.

    Args:
        home_dir: Directory path (string) that receives the marker.

    Returns:
        None. Failures are reported on stderr instead of being raised, so a
        marker that cannot be written never aborts the caller.
    """
    marker = home_dir+'/.done'
    try:
        # Append mode creates the file when it is missing and leaves an
        # existing marker untouched, so repeated calls are harmless. Unlike
        # os.mknod it is available on every platform.
        with open(marker,'a'):
            pass
    except OSError as e:
        print(marker,e,file=sys.stderr)
    
def task_completion_status(home_dir):
    """Tell whether ``home_dir`` already carries the ``.done`` marker.

    Args:
        home_dir: Directory path (string) to inspect.

    Returns:
        True when the path ``home_dir + '/.done'`` exists, otherwise False.
    """
    marker_path = home_dir+'/.done'
    return os.path.exists(marker_path)
    
def format_filename(name):
    """Turn an arbitrary label into a conservative file name.

    Every character that is not an ASCII letter, an ASCII digit or one of
    ``-_.()`` / space is dropped; the remaining spaces become underscores.

    Args:
        name: Text to sanitise.

    Returns:
        The sanitised string (possibly empty).
    """
    allowed = "-_.() " + string.ascii_letters + string.digits
    kept = ''.join(filter(allowed.__contains__, name))
    return kept.replace(' ','_')

In [4]:
def get_bs4(browser):
    """Parse the page currently held by ``browser`` into a BeautifulSoup tree.

    Args:
        browser: Object exposing the rendered HTML as ``page_source``.

    Returns:
        A BeautifulSoup document built with the ``html.parser`` backend.
    """
    html = browser.page_source
    return BeautifulSoup(html,'html.parser')

def get_bs4_requests(link):
    """Download ``link`` with ``requests`` and return it parsed by BeautifulSoup.

    Args:
        link: Absolute URL, or a path relative to
            ``https://www.moneycontrol.com``.

    Returns:
        The parsed document, or None when ``link`` is not a string, when the
        server answers with too many redirects, or when all 7 attempts fail.
        Every failed attempt is reported on stderr.
    """
    from urllib.parse import urljoin

    if not isinstance(link,str):
        print("Invalid link {0!r}".format(link),file=sys.stderr)
        return None

    # urljoin keeps absolute URLs as they are and resolves relative paths
    # against the site root. Plain concatenation glued the site prefix in
    # front of every link that did not start with exactly
    # "https://www.moneycontrol.com" (http://, other sub-domains, ...).
    url = urljoin("https://www.moneycontrol.com/",link)
    headers = {'user-agent':'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}

    for tries in range(7):
        try:
            source = requests.get(url,headers=headers,timeout=20).content
            return BeautifulSoup(source,'html.parser')
        except TooManyRedirects:
            # Retrying cannot help with a redirect loop; give up immediately.
            break
        except Exception as e:
            print("Tries {0} of 7".format(tries+1),e,file=sys.stderr)
    return None

In [5]:
def get_browser(link):
    """Open ``link`` in a Chrome WebDriver session and return the driver.

    The driver binary is expected at ``<current working dir>/chromedriver``.
    Loading is attempted up to 7 times; only timeouts are retried.

    Args:
        link: Absolute URL, or a path that gets prefixed with
            ``https://www.moneycontrol.com`` when it does not contain
            ``moneycontrol.com``.

    Returns:
        The live driver with the page loaded, or None when the page could not
        be loaded. In the None case the browser session has been shut down.
    """
    if not "moneycontrol.com" in link:
        link="https://www.moneycontrol.com"+link
    options = webdriver.ChromeOptions()
    # NOTE(review): these prefs presumably turn off image loading and Flash to
    # speed up page loads - confirm against the Chrome preference reference.
    options.add_experimental_option('prefs',{"profile.managed_default_content_settings.images": 2,"profile.default_content_settings.state.flash": 0})
    driver = webdriver.Chrome(executable_path=os.getcwd()+'/chromedriver',chrome_options=options)
    # Listing pages ("all-companies") get a longer page-load budget.
    driver.set_page_load_timeout(60 if "all-companies" in link else 20)
    for tries in range(7):
        try:
            driver.get(link)
            return driver
        except TimeoutException:
            print("Tries {0} of 7 Exception : Timeout Occured".format(tries+1),file=sys.stderr)
        except Exception as e:
            # Anything other than a timeout is treated as permanent.
            print("Unable to load Website Exception",e,file=sys.stderr)
            break
    # quit() ends the whole WebDriver session; close() would only close the
    # current window and leave the driver process behind.
    driver.quit()
    return None

In [6]:
class News:
    """Collects the news archive of one company and stores every article.

    Articles are handed to ``write_to_file`` with ``<parent_dir>/News`` as the
    target folder. ``check_home_folder`` and ``write_to_file`` are helpers
    defined outside this section (presumably they create the folder and write
    one text file per article - confirm against their definitions).
    """

    def __init__(self,link,parent_dir,name):
        """Remember the archive link and prepare the ``News`` output folder.

        Args:
            link: URL (or site-relative path) of the company's news page.
            parent_dir: Folder of the company; ``/News`` is appended to it.
            name: Company name, used in progress messages only.
        """
        self.ct = 0  # number of articles that could not be fetched or parsed
        self.name = name
        self.link = link
        self.home_dir = parent_dir+'/News'
        check_home_folder(self.home_dir)

    def get_detailed_news(self,links):
        """Download every article in ``links`` and save its text.

        An article that cannot be downloaded or does not match the expected
        page layout is skipped and counted in ``self.ct``.
        """
        for link in links:
            soup = get_bs4_requests(link)
            if soup is None:
                # Download failed after all retries.
                self.ct+=1
                continue
            try:
                for script in soup(['script','style']):
                    script.decompose()
                article_box = soup.find('div',attrs = {'class':'article_box'})
                date = article_box.find('div',attrs = {'class':'arttidate'}).text
                title = article_box.find('h1',attrs = {'class':'artTitle'}).text
                article_box = soup.find('article',attrs = {'class':'article_box'})
                article_flow = article_box.find('div',attrs={'class':'arti-flow'})
                data = article_flow.text
                # Table rows are appended once more with "*\t*" between the
                # cells so that column boundaries survive in the text file.
                for row in article_flow.find_all('tr'):
                    data += "*\t*".join([td.text for td in row.find_all('td')])
                    data += "\n"
                if len(data.strip())>0:
                    write_to_file(self.home_dir,title,date+"\n\n"+data)
            except Exception:
                # Best effort: a page with an unexpected layout must not stop
                # the remaining articles.
                self.ct+=1

    def get_news_from_page(self,page):
        """Return the article links found on one archive listing page.

        Returns an empty list when the page cannot be downloaded. Anchors
        without an ``href`` are ignored instead of discarding the whole page.
        """
        soup = get_bs4_requests(page)
        if soup is None:
            return []
        anchors = soup.find_all('a',attrs = {'class':'g_14bl'})
        return [anchor['href'] for anchor in anchors if anchor.get('href')]

    def get_news_link(self,years):
        """Collect the article links of every yearly archive in ``years``.

        For each year the pagination links plus the year page itself are
        visited. A year without a pagination block is treated as a single
        page rather than being dropped.
        """
        links = []
        for year in years:
            soup = get_bs4_requests(year)
            if soup is None:
                print("Failed to load page",file=sys.stderr)
                continue
            try:
                pagination = soup.find('div',attrs={'class':'pages MR10 MT15'})
                if pagination is None:
                    pages = []
                else:
                    pages = [anchor['href'] for anchor in pagination.find_all('a') if anchor.get('href')]
                pages.append(year)
                for page in pages:
                    links.extend(self.get_news_from_page(page))
            except Exception as e:
                print("Failed to load news",e,file=sys.stderr)
        return links

    def start(self):
        """Scrape the whole archive unless the folder is already marked done.

        The ``.done`` marker is only written after the archive has been
        walked; when the page cannot be loaded or exposes no yearly archive
        links nothing is marked, so a later run tries again.
        """
        if task_completion_status(self.home_dir):
            return
        print("News Scraping Started for {}".format(self.name))
        soup = get_bs4_requests(self.link)
        if soup is None:
            print("Connection Timed Out",file=sys.stderr)
            return
        years = self.get_years(soup)
        if not years:
            print("No news archive found for {}".format(self.name),file=sys.stderr)
            return
        news_link = self.get_news_link(years)
        self.get_detailed_news(news_link)
        if self.ct:
            print("{0} article(s) could not be saved for {1}".format(self.ct,self.name),file=sys.stderr)
        task_completion(self.home_dir)

    def get_years(self,soup):
        """Return the links of the yearly archives listed on the news page.

        Returns an empty list when the ``FR yrs`` container is missing, so a
        page without it no longer raises.
        """
        container = soup.find('div',attrs={'class':'FR yrs'})
        if container is None:
            return []
        return [anchor['href'] for anchor in container.find_all('a') if anchor.get('href')]

In [7]:
class Company:
    """One company taken from a listing page; starts its news scrape.

    Output goes below ``<parent_dir>/<sanitised company name>``.
    ``check_home_folder`` is defined outside this section (presumably it
    creates the folder - confirm against its definition).
    """

    def __init__(self,link,name,parent_dir):
        """Store the company page link and prepare the company folder.

        Args:
            link: URL (or site-relative path) of the company home page.
            name: Company name; sanitised with ``format_filename`` for the
                folder name.
            parent_dir: Folder that holds all company folders.
        """
        self.link = link
        self.name = name
        self.home_dir = (parent_dir+'/'+format_filename(name))
        check_home_folder(self.home_dir)

    def start(self):
        """Open the company page and run ``News`` for each ``NEWS`` nav link.

        Does nothing when the company folder is already marked done. A page
        that cannot be loaded or has no left navigation bar is reported on
        stderr and skipped instead of raising.
        """
        if task_completion_status(self.home_dir):
            return
        soup = get_bs4_requests(self.link)
        if soup is None:
            print("Could not open Homepage for company {0}".format(self.name),file=sys.stderr)
            return
        left_nav_bar = soup.find('div',attrs = {'class':'FL leftNav'})
        if left_nav_bar is None:
            print("Could not find navigation bar for company {0}".format(self.name),file=sys.stderr)
            return
        for anchor in left_nav_bar.find_all('a'):
            # Only the entry labelled exactly 'NEWS' is followed.
            if anchor.text=='NEWS' and anchor.get('href'):
                News(anchor['href'],self.home_dir,self.name).start()

In [8]:
class WebPages:
    """A market-stats listing page whose companies are scraped one by one.

    ``check_home_folder`` is defined outside this section (presumably it
    creates the folder - confirm against its definition).
    """

    def __init__(self,link,home_dir):
        """Open ``link`` in a browser.

        Args:
            link: URL of the listing page.
            home_dir: Folder that receives one sub-folder per company.

        Raises:
            Exception: when the browser could not load the page.
        """
        self.link = link
        self.home_dir=home_dir
        check_home_folder(home_dir)
        self.browser = get_browser(link)
        if not self.browser:
            raise Exception('Could not open Webpage {}'.format(link))

    def begin(self):
        """Scrape every company listed in the page's table.

        The browser is shut down as soon as its page source has been parsed,
        including when parsing fails. A company that raises is reported on
        stderr and the remaining companies are still processed.
        """
        try:
            soup = get_bs4(self.browser)
        finally:
            # Only the page source is needed from here on. quit() ends the
            # whole WebDriver session; close() would only close the window.
            self.browser.quit()
        div = soup.find('div',attrs ={'class':'bsr_table hist_tbl_hm'})
        tbody = div.find('tbody') if div is not None else None
        if tbody is None:
            print("Could not find company table on {}".format(self.link),file=sys.stderr)
            return
        for row in tbody.find_all('td',attrs = {'class':'PR'}):
            anchor = row.find('a')
            if anchor is None or not anchor.get('href'):
                continue
            try:
                Company(anchor['href'],anchor.text,self.home_dir).start()
            except Exception as e:
                print("Failed to scrape {0}".format(anchor.text),e,file=sys.stderr)

In [None]:
# Folder that receives everything that gets scraped.
home = '/home/akash/Desktop/ScrapedData'
# Listing pages to crawl: BSE gainers and BSE losers.
links = [
    'https://www.moneycontrol.com/stocks/marketstats/bse-gainer/all-companies_-1/more/',
    'https://www.moneycontrol.com/stocks/marketstats/bse-loser/all-companies_-1/more/',
]
threads = []
for link in links:
    # The gainer listing is skipped; every other listing gets its own thread.
    if 'gainer' not in link:
        try:
            # WebPages(...) is built here in the main thread (this is where
            # the browser is opened); only begin() runs in the worker thread.
            webpage = Thread(target=WebPages(link,home).begin)
            webpage.start()
            threads.append(webpage)
        except Exception as e:
            print("Failed to open {0}".format(link),e,file=sys.stderr)
# Wait until every crawl thread has finished.
for webpage in threads:
    webpage.join()

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHA

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


News Scraping Started for ONGC


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHA

News Scraping Started for Wendt
News Scraping Started for Fedders Elec


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


News Scraping Started for Sterlite Techno


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Failed to load news


News Scraping Started for AB Money


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


News Scraping Started for Take Solutions


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


News Scraping Started for V2 Retail


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


News Scraping Started for Jindal Drilling


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


News Scraping Started for Patels Airtemp
News Scraping Started for Indo Borax
News Scraping Started for Super Sales
News Scraping Started for NBCC (India)


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Tries 1 of 7 HTTPSConnectionPool(host='www.moneycontrol.com', port=443): Read timed out. (read timeout=20)


News Scraping Started for Ambition Mica


Tries 1 of 7 HTTPSConnectionPool(host='www.moneycontrol.com', port=443): Read timed out. (read timeout=20)
Tries 2 of 7 HTTPSConnectionPool(host='www.moneycontrol.com', port=443): Read timed out. (read timeout=20)


News Scraping Started for Bharat Seats
News Scraping Started for Hercules Hoists


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


News Scraping Started for RCI Industries
News Scraping Started for Super Crop Safe


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


News Scraping Started for Granules India


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHA

News Scraping Started for MAS Financial S


Tries 1 of 7 HTTPSConnectionPool(host='www.moneycontrol.com', port=443): Read timed out. (read timeout=20)
Tries 2 of 7 HTTPSConnectionPool(host='www.moneycontrol.com', port=443): Read timed out. (read timeout=20)
Tries 1 of 7 HTTPSConnectionPool(host='www.moneycontrol.com', port=443): Read timed out. (read timeout=20)
Tries 2 of 7 HTTPSConnectionPool(host='www.moneycontrol.com', port=443): Read timed out. (read timeout=20)
Tries 3 of 7 HTTPSConnectionPool(host='www.moneycontrol.com', port=443): Read timed out. (read timeout=20)
Tries 4 of 7 HTTPSConnectionPool(host='www.moneycontrol.com', port=443): Read timed out. (read timeout=20)


News Scraping Started for Pricol


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


News Scraping Started for Guj Sidhee Cem


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


News Scraping Started for Bharat Elec


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Tries 1 of 7 HTTPSConnectionPool(host='www.moneycontrol.com', port=443): Read timed out. (read timeout=20)
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Tries 1 of 7 HTTPSConnectionPool(host='www.moneycontrol.com', port=443): Read timed out. (read timeout=20)
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be d

News Scraping Started for Unichem Labs


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


News Scraping Started for Tata Comm


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Tries 1 of 7 HTTPSConnectionPool(host='www.moneycontrol.com', port=443): Rea