Import all necessary libraries:

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from pandas import ExcelWriter

Set up the crawler:

In [3]:
# Important: this must be the company's ticker symbol exactly as Yahoo Finance spells it.
ticker = "BAS.DE"

# Name of the Excel workbook the crawled statements are written to.
file_name = "{}.xlsx".format(ticker)

base_url = "https://finance.yahoo.com/quote/"

# URL path fragments of the three statement pages, in crawl order.
vfe = ["/financials?p=", "/balance-sheet?p=", "/cash-flow?p="]

# Collects one DataFrame per crawled statement.
df_list = []

# Full page URL for each statement: <base_url><ticker><fragment><ticker>.
urls = ["".join((base_url, ticker, statement, ticker)) for statement in vfe]
        
def convert_to_numeric(column):
    """Convert a column of formatted number strings to numeric values.

    Thousands separators (",") are removed because pd.to_numeric() cannot
    parse them. A cell that is empty or consists only of "-" is treated as
    a missing value. A leading "-" is a minus sign and is kept, so negative
    figures (e.g. '-2,173,000') stay negative.

    Args:
        column: iterable of cell values. Strings are cleaned as described;
            non-string values (e.g. None from a short row) are passed
            through to pd.to_numeric() unchanged.

    Returns:
        The result of pd.to_numeric() applied to the cleaned values.
    """
    cleaned = []
    for value in column:
        if not isinstance(value, str):
            cleaned.append(value)
            continue
        text = value.replace(',', '').strip()
        # Only a lone dash marks "no value"; never strip a sign from a number.
        cleaned.append(None if text in ('', '-') else text)
    return pd.to_numeric(cleaned)


for url in urls:  # loop to get all financial data (BS, P&L, CF)
    # The timeout keeps the script from hanging on an unresponsive server;
    # raise_for_status() stops early on an HTTP error instead of parsing an
    # error page.
    page = requests.get(url, timeout=30)
    page.raise_for_status()
    doc = BeautifulSoup(page.text, "html.parser")

    # 'div' with class 'D(tbr)' is the overarching element of each table row;
    # features[0] is the header row, the remaining entries are data rows.
    features = doc.find_all('div', class_='D(tbr)')  # returns a list
    if not features:
        raise IndexError("no 'D(tbr)' table rows found at " + url)

    headers = [item.text for item in features[0].find_all('div', class_='D(ib)')]
    print(headers)

    # 'final' has a lists-in-list structure (one inner list per table row),
    # which converts directly to a pd.DataFrame.
    final = []
    for row in features[1:]:  # skip the header row handled above
        labels = [item.text for item in row.find_all('div', class_='D(ib)')]  # label of the row
        values = [item.text for item in row.find_all('div', class_='Ta(c)')]  # data in the row
        final.append(labels + values)
    print(final[:3])  # to check if the data from each statement is crawled correctly

    df = pd.DataFrame(final)  # convert list 'final' to df
    df.columns = headers  # specify column labels

    # Every column except the first (the row-label column) holds figures.
    for column in headers[1:]:
        df[column] = convert_to_numeric(df[column])

    df_list.append(df)
    
# Write every crawled statement to its own worksheet ('sheet1', 'sheet2', ...).
# The context manager saves and closes the workbook on exit, so no explicit
# writer.save() call is needed.
with pd.ExcelWriter(file_name) as writer:
    for n, df in enumerate(df_list, start=1):
        # Use n from the loop for the sheet name, otherwise each sheet would
        # overwrite the previous one.
        df.to_excel(writer, sheet_name='sheet%s' % n, index=False)
            

['Breakdown', 'ttm', '12/31/2019', '12/31/2018', '12/31/2017', '12/31/2016']
[['Total Revenue', '60,473,000', '59,316,000', '62,675,000', '64,475,000', '57,550,000'], ['Cost of Revenue', '44,360,000', '43,061,000', '44,319,000', '43,929,000', '39,265,000'], ['Gross Profit', '16,113,000', '16,255,000', '18,356,000', '20,546,000', '18,285,000']]
['Breakdown', '12/31/2019', '12/31/2018', '12/31/2017', '12/31/2016']
[['Total Assets', '86,950,000', '86,556,000', '78,768,000', '76,496,000'], ['Total Liabilities Net Minority Interest', '44,600,000', '50,447,000', '44,012,000', '43,928,000'], ['Total Equity Gross Minority Interest', '42,350,000', '36,109,000', '34,756,000', '32,568,000']]
['Breakdown', 'ttm', '12/31/2019', '12/31/2018', '12/31/2017', '12/31/2016']
[['Operating Cash Flow', '6,071,000', '7,474,000', '7,939,000', '8,785,000', '7,717,000'], ['Investing Cash Flow', '-2,173,000', '-1,190,000', '-11,804,000', '-3,958,000', '-6,490,000'], ['Financing Cash Flow', '-2,731,000', '-6,405,