In [None]:
# Import required packages.
import time
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm_notebook

In [None]:
# Base URL of the quarterly-results pages; the scraping loop appends the
# company code and the page number to it.
seed_url = "https://www.moneycontrol.com/financials/hindustanunilever/results/quarterly-results/"

# Page numbers to visit, consumed as range(table_view_start, table_view_end),
# so the end value itself is not requested.
table_view_start, table_view_end = 2, 8

# Accumulator filled by process_row: first-cell text -> list of cell texts.
company_performance_dict = dict()

Define a function 'process_row' that takes a table row as its argument. It uses the text of the first column as the key and collects the text of the next five columns into a list of values. Finally, it adds the key-value pair to the global dictionary.

It also checks whether the key already exists: if it is present, the function extends the existing list of values; otherwise, it creates a new key-value pair.

In [None]:
def process_row(row):
    """Append one table row's cell values to ``company_performance_dict``.

    The text of the first ``<td>`` cell becomes the dictionary key and the
    text of the next five cells becomes the values. If the key is already
    present, the five values are appended to its existing list; otherwise a
    new entry is created.

    Rows with fewer than six ``<td>`` cells are skipped without touching the
    dictionary, so callers do not need to rely on an ``IndexError`` to filter
    them out.

    Args:
        row: An object exposing ``find_all('td')`` whose items have a
            ``.text`` attribute (e.g. a BeautifulSoup ``<tr>`` tag).

    Returns:
        None. The module-level ``company_performance_dict`` is updated in
        place.
    """
    cells = row.find_all('td')

    # One key cell plus five value cells are required; anything shorter is
    # not a data row.
    if len(cells) < 6:
        return

    key = cells[0].text
    values = [cell.text for cell in cells[1:6]]

    # Create the list on first sight of the key, then extend it either way.
    company_performance_dict.setdefault(key, []).extend(values)

Define a method 'write_dictionary_to_file' which takes a file name as a parameter and writes the content of the global dictionary to a CSV file on the mounted drive.

In [None]:
# Company codes that the scraping loop inserts into the request URL,
# one series of page requests per code.
company_codes = [
    "CPI", "BI", "CC10", "DI", "E06",
    "GI22", "GS", "GCP", "HAP", "HFI",
    "HU", "ITC", "JF04", "M13", "NI",
    "PGH", "TBE", "TT", "UB02", "US",
]
len(company_codes)

20

In [None]:
# Spot-check: display the codes at list positions 4, 5 and 6.
company_codes[4:7]

['E06', 'GI22', 'GS']

In [None]:
# Scrape the quarterly-results tables for every company code and combine the
# per-company frames into a single DataFrame `df`.
company_frames = []  # one DataFrame per company; concatenated once after the loop

for company_code in tqdm_notebook(company_codes):
    print("Extracting data for company: ",company_code)
    time.sleep(5)  # pause before each company (presumably to go easy on the site)

    # process_row() writes into this module-level dictionary, so it is reset
    # for every company.
    company_performance_dict = {}

    # Iterate through each page
    for table_view in range(table_view_start, table_view_end):

        # Customize URL to make it company specific
        full_url = seed_url + company_code + "/" + str(table_view) + "#" + company_code

        # Fetch the page; the timeout keeps a stalled connection from hanging
        # the whole run.
        response = requests.get(full_url, timeout=30)
        page = BeautifulSoup(response.content, "html.parser")

        # The data of interest lives in the table with class "mctable1".
        # `attrs` is a dict mapping attribute name -> required value.
        table = page.find("table", attrs={"class": "mctable1"})
        if table is None:
            # Report the missing table instead of failing on None.find_all.
            print("No results table found, skipping page: ", full_url)
            continue

        # Feed every row of the table to process_row.
        for row in table.find_all('tr'):
            try:
                process_row(row)
            except IndexError:
                # Rows with fewer than six <td> cells carry no data columns.
                continue

    if not company_performance_dict:
        # Nothing was collected, so there is no frame to build for this code.
        print("No data extracted for company: ", company_code)
        continue

    company_df = pd.DataFrame.from_dict(company_performance_dict)
    company_df.insert(1, "Ticker", company_code)
    # The first column holds the values of the first row processed; it is
    # labelled "Quarters" in the combined data.
    company_df = company_df.rename(columns={company_df.columns[0]: "Quarters"})
    company_frames.append(company_df)

# Single concat instead of growing the frame inside the loop.
df = pd.concat(company_frames, axis=0) if company_frames else pd.DataFrame()
    

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=20.0), HTML(value='')))

Extracting data for company:  CPI
Extracting data for company:  BI
Extracting data for company:  CC10
Extracting data for company:  DI
Extracting data for company:  E06
Extracting data for company:  GI22
Extracting data for company:  GS
Extracting data for company:  GCP
Extracting data for company:  HAP
Extracting data for company:  HFI
Extracting data for company:  HU
Extracting data for company:  ITC
Extracting data for company:  JF04
Extracting data for company:  M13
Extracting data for company:  NI
Extracting data for company:  PGH
Extracting data for company:  TBE
Extracting data for company:  TT
Extracting data for company:  UB02
Extracting data for company:  US



In [None]:
# Preview the first rows of the combined frame.
df.head()

Unnamed: 0,Quarters,Ticker,Net Sales/Income from operations,Other Operating Income,Total Income From Operations,EXPENDITURE,Consumption of Raw Materials,Purchase of Traded Goods,Increase/Decrease in Stocks,Power & Fuel,...,Share Holding (%),Promoters and Promoter Group Shareholding,a) Pledged/Encumbered,- Number of shares (Crores),- Per. of shares (as a % of the total sh. of prom. and promoter group),- Per. of shares (as a % of the total Share Cap. of the company),b) Non-encumbered,- Number of shares (Crores).,- Per. of shares (as a % of the total sh. of prom. and promoter group).,- Per. of shares (as a % of the total Share Cap. of the company).
0,Mar '21,CPI,1275.01,8.18,1283.19,,324.31,72.28,18.26,--,...,--,,,--,--,--,,--,--,--
1,Dec '20,CPI,1224.21,7.72,1231.93,,317.31,64.82,-9.6,--,...,--,,,--,--,--,,--,--,--
2,Sep '20,CPI,1277.66,7.82,1285.48,,365.71,103.83,-59.91,--,...,--,,,--,--,--,,--,--,--
3,Jun '20,CPI,1033.6,7.02,1040.62,,242.25,82.66,27.45,--,...,--,,,--,--,--,,--,--,--
4,Mar '20,CPI,1062.35,8.91,1071.26,,321.12,58.84,-1.34,--,...,--,,,--,--,--,,--,--,--


In [None]:
# (rows, columns) of the combined frame.
df.shape

(600, 51)

In [None]:
# Number of rows collected per company code.
df["Ticker"].value_counts()

UB02    30
CC10    30
HFI     30
BI      30
TBE     30
GCP     30
GS      30
US      30
NI      30
E06     30
ITC     30
PGH     30
GI22    30
TT      30
HAP     30
HU      30
M13     30
DI      30
CPI     30
JF04    30
Name: Ticker, dtype: int64

In [None]:
# Number of rows collected per quarter label.
df["Quarters"].value_counts()

Jun '17    22
Jun '14    21
Jun '18    21
Jun '16    20
Mar '20    20
Sep '16    20
Dec '20    20
Mar '16    20
Jun '15    20
Mar '18    20
Dec '15    20
Sep '18    20
Sep '19    20
Sep '17    20
Mar '19    20
Dec '18    20
Dec '16    20
Dec '17    20
Jun '20    20
Sep '20    20
Mar '17    20
Jun '19    20
Dec '14    20
Sep '14    20
Dec '19    20
Mar '15    20
Mar '21    20
Sep '15    20
Mar '14    18
Dec '13    17
Sep '13     1
Name: Quarters, dtype: int64

In [None]:
# Write the combined frame to CSV without the DataFrame index.
# NOTE(review): absolute, machine-specific path — confirm or adjust before running elsewhere.
output_csv_path = r'G:\ISB AMPBA\15. Supervised Learning 1\FMCG_combined_company_data.csv'
df.to_csv(output_csv_path, index=False)

The raw data captured here will be cleaned and processed for use in the ML algorithm.
## END