# Industries

In [35]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from fake_useragent import UserAgent
import random
import sqlite3

In [29]:
# Function to get headers with a random user agent
def get_headers():
    ua = UserAgent()
    headers = {
        'User-Agent': ua.random,
        'Accept-Language': 'en-US,en;q=0.9',
        'Accept-Encoding': 'gzip, deflate, br',
        'Connection': 'keep-alive',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'DNT': '1',  # Do Not Track Request Header
        'Upgrade-Insecure-Requests': '1'
    }
    return headers

In [30]:
# Base URL for stock analysis by industry
url = 'https://stockanalysis.com/stocks/industry/'
headers = get_headers()

# Send request to the URL
r = requests.get(url, headers=headers)
soup = BeautifulSoup(r.text, 'lxml')

# Extract sector titles
sectors = soup.find_all('div', {'class':'title-group'})
sector_titles = [title.text.split(':')[1].strip() for title in sectors]

# Extract industry tables
sector_div = soup.find('div', {'class':'space-y-8'})
tables = sector_div.find_all('table', {'class':'svelte-qmv8b3'})

# Extract entries from tables
entries = []
for table in tables:
    industries = table.find_all('tr', {'class':'svelte-qmv8b3'})
    industries = industries[1:]  # Skip the header row
    entries.append(industries)

# Verify lengths of sector titles and entries
len(sector_titles), len(entries)

(11, 11)

In [None]:
# List of sector titles and their corresponding entries
industry_zip = list(zip(sector_titles, entries))
final_df = pd.DataFrame()

# Iterate over each sector and its entries
for sector_name, sector_entry in industry_zip:
    
    all_df = pd.DataFrame()
    
    for entry in sector_entry:
        # Extract link, day & year change for each entry 
        tds = entry.find_all('td')
        ind_url = tds[0].find('a')['href']
        day_change = tds[-2].text.strip().replace('%', '')
        y_change = tds[-1].text.strip().replace('%', '')
        
        link = f'https://stockanalysis.com{ind_url}'
        # Get headers with a random user agent
        headers = get_headers()  
        
        # Send request to the industry details page
        r = requests.get(link, headers=headers)
        soup = BeautifulSoup(r.text, 'lxml')
        name = soup.find('h1').text.strip()    
        
        # Extract stock number
        stock_no = soup.find('div', {'class':'title-group'}).text.strip()
        stock_no = int(stock_no.split()[0])
        
        # Extract financial metrics
        metrics = soup.find_all('div', {'class':'mt-0.5 text-lg font-semibold bp:text-xl sm:mt-1.5 sm:text-2xl'})

        # Market cap in millions
        cap_text = metrics[0].text.replace(',', '')
        if 'B' in cap_text:
            cap = float(cap_text.replace('B', '')) * 1000
        else:
            cap = float(cap_text.replace('M', ''))

        # Revenue in millions
        revenue_text = metrics[1].text.replace(',', '')
        if 'B' in revenue_text:
            revenue = float(revenue_text.replace('B', '')) * 1000
        else:
            revenue = float(revenue_text.replace('M', ''))

        # Profits in millions
        profits_text = metrics[2].text.replace(',', '')
        if profits_text == 'n/a':
            profits = None
        elif 'B' in profits_text:
            profits = float(profits_text.replace('B', '')) * 1000
        else:
            profits = float(profits_text.replace('M', ''))
        
        # PE Ratio
        pe_ratio = metrics[3].text.strip().replace(',', '')
        pe_ratio = None if pe_ratio == 'n/a' else float(pe_ratio)
        
        # Profit Margin
        pf_margin = metrics[4].text.strip().replace('%', '')
        pf_margin = None if pf_margin == 'n/a' else float(pf_margin)
        
        # Dividend Yield
        div_yield = metrics[-1].text.replace('%', '').strip()
        div_yield = None if div_yield == 'n/a' else float(div_yield)
        
        # Create dictionary for entry
        entry_dict = {
            'Industry': [name],
            'Stock Number': [stock_no],
            'Market Cap($M)': [cap],
            'Revenue($M)': [revenue],
            'Profits($M)': [profits],
            'Dividend Yield(%)': [div_yield],
            'PE Ratio': [pe_ratio],
            'Profit Margin(%)': [pf_margin],
            'Day Change(%)': [day_change],
            'Year Change(%)': [y_change],
            'Link': [link]
        }

        # Create DataFrame for the entry and concatenate it to all_df
        df = pd.DataFrame(entry_dict)
        all_df = pd.concat([all_df, df], ignore_index=True)
        all_df['Sector'] = sector_name
    
    # Concatenate all_df to final_df and add timestamp
    final_df = pd.concat([final_df, all_df], ignore_index=True)
    final_df['Time(As of)'] = pd.Timestamp.now().strftime('%Y-%m-%d %H:%M')

In [34]:
# Display DataFrame information
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145 entries, 0 to 144
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Industry           145 non-null    object 
 1   Stock Number       145 non-null    int64  
 2   Market Cap($M)     145 non-null    float64
 3   Revenue($M)        145 non-null    float64
 4   Profits($M)        145 non-null    float64
 5   Dividend Yield(%)  143 non-null    float64
 6   PE Ratio           132 non-null    float64
 7   Profit Margin(%)   145 non-null    float64
 8   Day Change(%)      145 non-null    object 
 9   Year Change(%)     145 non-null    object 
 10  Link               145 non-null    object 
 11  Sector             145 non-null    object 
 12  Time(As of)        145 non-null    object 
dtypes: float64(6), int64(1), object(6)
memory usage: 14.9+ KB


In [38]:
# Save to SQLite Database
conn = sqlite3.connect('data/Marketcap.db')
cursor = conn.cursor()

# Insert to table
final_df.to_sql('Industries', conn, if_exists='append', index=False)

conn.commit()

print('Data successfully added to database table')

# Close the connection
conn.close()

Data successfully added to database table


In [82]:
# Aggregate data by sector
sect_df = final_df.groupby('Sector').agg({
    'Market Cap($M)': 'sum',
    'Revenue($M)': 'sum',
    'Profits($M)': 'sum',
    'Profit Margin(%)': 'mean',
    'Day Change(%)': 'mean',
    'Year Change(%)': 'mean',
    'Stock Number': 'sum'
}).reset_index()

# Round metrics to 2 decimal places
sect_df = sect_df.round({
    'Market Cap($M)': 1,
    'Revenue($M)': 1,
    'Profits($M)': 2,
    'Profit Margin(%)': 2,
    'Day Change(%)': 2,
    'Year Change(%)': 2
})

sect_df['Time(As of)'] = pd.Timestamp.now().strftime('%Y-%m-%d %H:%M')

# Save to SQLite Database
conn = sqlite3.connect('data/Marketcap.db')
cursor = conn.cursor()

# Insert to table
sect_df.to_sql('Sectors', conn, if_exists='append', index=False)

conn.commit()

print('Data successfully added to database table')

# Close the connection
conn.close()

Data successfully added to database table


In [79]:
# Display available metrics
metrics = ['Market Cap($M)', 'Revenue($M)', 'Profits($M)', 'Profit Margin(%)', 'Day Change(%)', 'Year Change(%)']
print("Available metrics to sort by:")
i = 1
for metric in metrics:
    print(f"{i}. {metric}")
    i = i + 1

# Ask user for their choice
user_choice = int(input("Enter the metric you want to sort by: "))
if user_choice > 6:
    print(f"\nInvalid choice")

else:
    choice = metrics[user_choice-1]
    # Sort aggregated data by the chosen metric
    sorted_df = sect_df.sort_values(by=choice, ascending=False)
    sorted_df.reset_index(drop=True, inplace=True)

    # Display the sorted DataFrame
    print(f"\nThese are the top sectors today by {choice}:")
    
# Display the sorted DataFrame
sorted_df

Available metrics to sort by:
1. Market Cap($M)
2. Revenue($M)
3. Profits($M)
4. Profit Margin(%)
5. Day Change(%)
6. Year Change(%)

These are the top sectors today by Revenue($M):


Unnamed: 0,Sector,Market Cap($M),Revenue($M),Profits($M),Profit Margin(%),Day Change(%),Year Change(%),Stock Number,Time(As of)
0,Consumer Discretionary,7427120.0,4457870.0,271900.85,5.085217,0.891739,9.917391,578,2024-07-31 18:54
1,Healthcare,8475000.0,4002200.0,130411.28,0.919091,0.559091,10.981818,1180,2024-07-31 18:54
2,Financials,10461760.0,3864660.0,677316.53,13.790667,0.633333,26.722667,951,2024-07-31 18:54
3,Energy,3739660.0,3479590.0,333229.84,11.0525,1.27375,11.04875,250,2024-07-31 18:54
4,Technology,19441280.0,3003740.0,408212.15,9.776667,2.578333,29.823333,787,2024-07-31 18:54
5,Consumer Staples,3962070.0,2900970.0,146179.09,7.329167,-0.256364,6.373333,251,2024-07-31 18:54
6,Industrials,5791200.0,2825170.0,215505.06,7.5476,0.956,24.2408,652,2024-07-31 18:54
7,Communication Services,5690790.0,1905870.0,196353.67,3.894286,0.755714,12.64,250,2024-07-31 18:54
8,Materials,2135030.0,1168828.05,77880.53,4.715714,1.661429,10.476429,271,2024-07-31 18:54
9,Utilities,1504890.0,676210.0,62840.0,9.735,2.6,26.748333,109,2024-07-31 18:54


# Company profiles

### **Key metrics**


#### Income Statement
1. **Revenue**: - Indicates the company's total income from its operations.
2. **Operating Expense**: - Reflects the costs required to run the company’s core business.
3. **Net Income**: - Represents the company's profit after all expenses have been deducted from revenues.
4. **Net Profit Margin**: - Shows the percentage of revenue that remains as profit after all expenses.
5. **Earnings Per Share (EPS)**: - Measures the profitability on a per-share basis.
6. **EBITDA**: - Earnings before interest, taxes, depreciation, and amortization, indicating operational profitability.
7. **Effective Tax Rate**:  - The average rate at which the company’s pre-tax profits are taxed.

#### Balance Sheet
1. **Cash and Short-term Investments**: - Indicates the liquidity and available capital for operations and investments.
2. **Total Assets**: - Represents the total resources owned by the company.
3. **Total Liabilities**:- Reflects the company's total obligations.
4. **Total Equity**: - Indicates the net worth of the company.
5. **Shares Outstanding**: - Represents the total number of shares currently held by all shareholders.
6. **Price to Book (P/B) Ratio**: - Compares the company's market value to its book value.
7. **Return on Assets (ROA)**: - Measures how efficiently the company is using its assets to generate profit.
8. **Return on Capital (ROC)**: - Indicates the return the company is generating on its invested capital.

#### Cash Flow Statement
1. **Cash from Operations**: - Represents the cash generated from the company's core business operations.
2. **Cash from Investing**: - Indicates cash spent on investments like equipment, acquisitions, etc.
3. **Cash from Financing**: - Reflects cash flows related to borrowing and repaying debt, issuing stock, etc.
4. **Net Change in Cash**:  - Shows the overall change in the company's cash position.
5. **Free Cash Flow (FCF)**: - Indicates the cash available after capital expenditures for the company to expand, pay dividends, etc.


### Data Extraction

In [179]:
base = "https://www.google.com/finance"
index = "NASDAQ"
symbol = "AMZN"
lang = "en"
url = f"{base}/quote/{symbol}:{index}?hl={lang}"

page = requests.get(url)

# Use an HTML parser to grab the content from "page"
soup = BeautifulSoup(page.content, "html.parser")

# Extract stock info
stock_description = {}
stock_description['Name'] = soup.find('div', {'class':'zzDege'}).text.strip()
stock_description['Symbol'] = symbol
about = soup.find('div', {'class':'bLLb2d'}).text.strip()
stock_description['About'] = about

items = soup.find_all("div", {"class": "gyFHrc"})
for item in items:
    item_description = item.find("div", {"class": "mfs7Fc"}).text
    item_value = item.find("div", {"class": "P6K39c"}).text
    stock_description[item_description] = item_value

# Get data from company financial sheets
tables = soup.find_all('table', {'class':'slpEwd'})

# Company's income sheet
income = tables[0]

fin_entries = income.find_all('tr')
fin_entries = fin_entries[1:]

income_info = {}
for item in fin_entries:
    item_title = item.find("div", {"class": "rsPbEe"}).text
    item_description = item.find("div", {"class": "EY8ABd-OWXEXe-TAWMXe"}).text
    income_info[item_title] = item_description 
    income_info[f'{item_title} value'] = item.find('td', {'class':'QXDnM'}).text.strip()
    change = item.find('td', {'class':'gEUVJe'}).text.replace('%', '') 
    change = None if change == '—' else float(change)
    income_info[f'{item_title} Y/Y change(%)'] = change
    
stock_description['Income'] = income_info

# Company's balance sheet
bal_sheet = tables[1]
bal_entries = bal_sheet.find_all('tr')
bal_entries = bal_entries[1:]

balance_info = {}
for item in bal_entries:
    item_title = item.find("div", {"class": "rsPbEe"}).text
    item_description = item.find("div", {"class": "EY8ABd-OWXEXe-TAWMXe"}).text
    balance_info[item_title] = item_description 
    balance_info[f'{item_title} value'] = item.find('td', {'class':'QXDnM'}).text.strip()
    change = item.find('td', {'class':'gEUVJe'}).text.replace('%', '')
    change = None if change == '—' else float(change)  
    balance_info[f'{item_title} Y/Y change(%)'] = change
    
stock_description['Balance Sheet'] = balance_info

# Company's cash flow
cash_sheet = tables[2]
cash_entries = cash_sheet.find_all('tr')
cash_entries = cash_entries[1:]

cash_info = {}
for item in cash_entries:
    item_title = item.find("div", {"class": "rsPbEe"}).text
    item_description = item.find("div", {"class": "EY8ABd-OWXEXe-TAWMXe"}).text
    cash_info[item_title] = item_description 
    cash_info[f'{item_title} value'] = item.find('td', {'class':'QXDnM'}).text.strip()
    change = item.find('td', {'class':'gEUVJe'}).text.replace('%', '')
    change = None if change == '—' else float(change)  
    cash_info[f'{item_title} Y/Y change(%)'] = change
    
stock_description['Cash Flow'] = cash_info


{'Name': 'Amazon.com Inc', 'Symbol': 'AMZN', 'About': 'Amazon.com, Inc., doing business as Amazon, is an American multinational technology company, engaged in e-commerce, cloud computing, online advertising, digital streaming, and artificial intelligence. It is considered one of the Big Five American technology companies; the other four are Alphabet, Apple, Meta, and Microsoft.\nAmazon was founded on July 5, 1994, by Jeff Bezos in Bellevue, Washington. The company originally started as an online marketplace for books but gradually expanded its offerings to include a wide range of product categories. This diversification led to it being referred to as "The Everything Store".\nThe company has multiple subsidiaries, including Amazon Web Services, providing cloud computing, Zoox, a self-driving car division, Kuiper Systems, a satellite Internet provider, and Amazon Lab126, a computer hardware R&D provider. Other subsidiaries include Ring, Twitch, IMDb, and Whole Foods Market. Its acquisiti

In [180]:

stock_description

{'Name': 'Amazon.com Inc',
 'Symbol': 'AMZN',
 'About': 'Amazon.com, Inc., doing business as Amazon, is an American multinational technology company, engaged in e-commerce, cloud computing, online advertising, digital streaming, and artificial intelligence. It is considered one of the Big Five American technology companies; the other four are Alphabet, Apple, Meta, and Microsoft.\nAmazon was founded on July 5, 1994, by Jeff Bezos in Bellevue, Washington. The company originally started as an online marketplace for books but gradually expanded its offerings to include a wide range of product categories. This diversification led to it being referred to as "The Everything Store".\nThe company has multiple subsidiaries, including Amazon Web Services, providing cloud computing, Zoox, a self-driving car division, Kuiper Systems, a satellite Internet provider, and Amazon Lab126, a computer hardware R&D provider. Other subsidiaries include Ring, Twitch, IMDb, and Whole Foods Market. Its acquisi