In [44]:
%matplotlib widget
import requests
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import math
import os

In [51]:
class YahooCompanyScraper:
    """Scrape a company's financial statements from fr.finance.yahoo.com.

    Constructing an instance immediately downloads three pages
    ("financials", "balance-sheet" and "cash-flow") for ``company_symbol``
    and stores each one as a pandas DataFrame in ``df_financials``,
    ``df_balance_sheet`` and ``df_cash_flow``.

    The parsing relies on CSS class names copied from the page markup; if the
    site layout changes the scrape returns no table and an ``IndexError`` is
    raised.
    """

    # Browser-like headers sent with every request.
    REQUEST_HEADERS = {
        'Connection': 'keep-alive',
        'Expires': '-1',
        'Upgrade-Insecure-Requests': '1',
        # Built with implicit concatenation so no indentation whitespace
        # leaks into the header value.
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; WOW64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/54.0.2840.99 Safari/537.36'
        ),
    }
    # Seconds to wait for the server before giving up; without a timeout a
    # stalled connection would block the notebook indefinitely.
    REQUEST_TIMEOUT = 30

    def __init__(self, company_symbol, output_folder="") -> None:
        """Store the configuration and fetch all three statements.

        Args:
            company_symbol: Ticker symbol used in the quote URL (e.g. 'OR.PA').
            output_folder: Directory used by ``export_all_financial_statements``.
                Falls back to ``./<company_symbol>`` when empty.
        """
        self.company_symbol = company_symbol
        self.base_url = "https://fr.finance.yahoo.com/quote"
        self.output_folder = output_folder or f"./{self.company_symbol}"
        self.refetch_data()

    def _fetch_soup(self, url):
        """Download ``url`` and return its content parsed with html.parser."""
        # Issue the request through the session so the context manager
        # actually owns (and closes) the connection it was opened for.
        with requests.Session() as session:
            page = session.get(
                url,
                headers=self.REQUEST_HEADERS,
                timeout=self.REQUEST_TIMEOUT,
            )
        return BeautifulSoup(page.content, 'html.parser')

    @staticmethod
    def _parse_first_table(raw_financial_statement):
        """Re-parse the first scraped table element as its own document.

        Raises:
            IndexError: If the scrape returned no table at all.
        """
        if not raw_financial_statement:
            raise IndexError(
                "No financial statement table found in the page "
                "(unknown symbol or changed page layout?)"
            )
        return BeautifulSoup(str(raw_financial_statement[0]), "html.parser")

    def get_financial_statement(self, financial_statement):
        """Return the table container(s) found on one statement page.

        Args:
            financial_statement: URL path segment of the page, e.g.
                "financials", "balance-sheet" or "cash-flow".

        Returns:
            The result of ``find_all`` for the table container div; empty
            when the page holds no such element.
        """
        url = f"{self.base_url}/{self.company_symbol}/{financial_statement}?p={self.company_symbol}"
        soup = self._fetch_soup(url)
        return soup.find_all('div', class_="M(0) Whs(n) BdEnd Bdc($seperatorColor) D(itb)")

    def get_headers(self, raw_financial_statement):
        """Extract the column headers of a scraped statement table.

        The first header cell is dropped (presumably the label column's
        caption -- verify against the page). The result is also stored in
        ``self.financials_headers``, which is therefore overwritten by every
        statement that gets parsed.

        Raises:
            IndexError: If ``raw_financial_statement`` is empty.
        """
        soup = self._parse_first_table(raw_financial_statement)
        raw_table_headers = soup.find("div", class_="D(tbhg)").find_all("span")

        headers = [span.text for span in raw_table_headers][1:]
        self.financials_headers = headers
        # Hand back a separate list so callers cannot mutate the attribute.
        return list(headers)

    def isStringNumber(self, string):
        """Return True when ``int(string)`` succeeds, False otherwise."""
        try:
            int(string)
        except (TypeError, ValueError, OverflowError):
            # Only conversion failures mean "not a number"; anything else
            # (e.g. KeyboardInterrupt) must propagate.
            return False
        return True

    def get_financial_statement_data(self, raw_financial_statement, financial_statement_headers):
        """Group the table cells into ``{row label: [values...]}``.

        The cells are read as one flat sequence and cut into rows of
        ``len(financial_statement_headers) + 1`` cells: the first cell of a
        row is its label, the remaining ones are the values. Narrow
        no-break spaces (U+202F) are stripped from every cell.

        Raises:
            IndexError: If ``raw_financial_statement`` is empty.
        """
        soup = self._parse_first_table(raw_financial_statement)
        row_width = len(financial_statement_headers) + 1
        cells = [
            cell.text.replace('\u202f', '')
            for cell in soup.find_all("div", class_="D(tbc)")
        ]
        return {
            cells[start]: cells[start + 1:start + row_width]
            for start in range(0, len(cells), row_width)
        }

    def _build_statement_frame(self, financial_statement):
        """Fetch one statement page and return it as a DataFrame.

        Rows are indexed by the row label (index name 'Details'), columns are
        the scraped headers, and empty cells are converted to NaN.
        """
        raw_statement = self.get_financial_statement(financial_statement)
        headers = self.get_headers(raw_statement)
        data = self.get_financial_statement_data(raw_statement, headers)
        df = pd.DataFrame(data).transpose()
        df.columns = headers
        df = df.rename_axis('Details')
        # DataFrame.replace returns a new frame; the result must be kept.
        return df.replace({'': math.nan})

    def get_financials(self):
        """Return the "financials" page as a DataFrame."""
        return self._build_statement_frame("financials")

    def get_balance_sheet(self):
        """Return the "balance-sheet" page as a DataFrame."""
        return self._build_statement_frame("balance-sheet")

    def get_cash_flow(self):
        """Return the "cash-flow" page as a DataFrame."""
        return self._build_statement_frame("cash-flow")

    def export_all_financial_statements(self):
        """Write the three statement frames as CSV files into ``output_folder``."""
        self.create_output_folder(self.output_folder)
        self.df_financials.to_csv(f"{self.output_folder}/{self.company_symbol}_financials.csv")
        self.df_balance_sheet.to_csv(f"{self.output_folder}/{self.company_symbol}_balance_sheet.csv")
        self.df_cash_flow.to_csv(f"{self.output_folder}/{self.company_symbol}_cash_flow.csv")

    def create_output_folder(self, path):
        """Create ``path`` (and missing parents) if it does not exist yet."""
        # exist_ok avoids the check-then-create race of a separate exists().
        os.makedirs(path, exist_ok=True)

    def refetch_data(self):
        """Download the three statements again and replace the stored frames."""
        self.df_financials = self.get_financials()
        self.df_balance_sheet = self.get_balance_sheet()
        self.df_cash_flow = self.get_cash_flow()

    def get_best_cac40_company_symbols(self):
        """Return the symbols listed on Yahoo's ^FCHI components page.

        Returns:
            A list with the text of the first matching cell of every
            component row found on the page.
        """
        url = "https://fr.finance.yahoo.com/quote/%5EFCHI/components?p=%5EFCHI"
        soup = self._fetch_soup(url)
        rows = soup.find_all('tr', class_="BdT Bdc($seperatorColor) Ta(end) Fz(s)")
        return [
            row.find_all('td', class_="Py(10px) Ta(start) Pend(10px)")[0].text
            for row in rows
        ]
            

In [52]:
# Build the scraper for symbol 'OR.PA'. The constructor downloads the
# financials, balance-sheet and cash-flow pages right away, so this cell
# performs network requests.
Loreal = YahooCompanyScraper('OR.PA')


In [53]:
# Write the three statement frames as CSV files into the scraper's output
# folder (./OR.PA here, since no output_folder was passed).
Loreal.export_all_financial_statements()

In [31]:
# Additionally save the "financials" frame alone as Loreal.csv, relative to
# the current working directory.
Loreal.df_financials.to_csv('Loreal.csv')

In [39]:
# Scrape and export the statements of every symbol returned by
# get_best_cac40_company_symbols(). One failing symbol must not stop the
# batch, so errors are reported and the loop continues.
cac40_symbols = Loreal.get_best_cac40_company_symbols()
for symbol in cac40_symbols:
    try:
        company = YahooCompanyScraper(symbol)
        company.export_all_financial_statements()
    except Exception as error:
        # `Exception` (not a bare except) lets KeyboardInterrupt stop the
        # loop; the error is printed so failures can be diagnosed.
        print(f"Error while fetching {symbol}: {error!r}")

Error while fetching UG.PA
Error while fetching FP.PA
