# Industries

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import random
import sqlite3
import warnings
import os, sys
sys.path.append(os.path.abspath('../functions'))
from common_functions import *

#### Stock data extraction

In [4]:
# Scrape the industry index page: one title per sector and, for each sector,
# the list of <tr> rows (one per industry) from its table.
url = 'https://stockanalysis.com/stocks/industry/'
headers = get_headers()

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status surfaces HTTP errors here instead of as a confusing
# parsing failure further down.
r = requests.get(url, headers=headers, timeout=30)
r.raise_for_status()
soup = BeautifulSoup(r.text, 'lxml')

# Extract sector titles: keep the text after the first colon of each heading.
# A heading without a colon is kept whole rather than raising IndexError.
sectors = soup.find_all('div', {'class':'title-group'})
sector_titles = []
for title in sectors:
    parts = title.text.split(':')
    sector_titles.append((parts[1] if len(parts) > 1 else parts[0]).strip())

# Extract industry tables.
# NOTE(review): the 'svelte-qmv8b3' class looks build-generated and may change
# when the site is redeployed - confirm if the tables suddenly come back empty.
sector_div = soup.find('div', {'class':'space-y-8'})
if sector_div is None:
    raise RuntimeError(f"Could not find the sector container on {url}; the page layout may have changed")
tables = sector_div.find_all('table', {'class':'svelte-qmv8b3'})

# One list of industry rows per table, skipping each table's header row.
entries = [table.find_all('tr', {'class':'svelte-qmv8b3'})[1:] for table in tables]

# Titles and tables are paired positionally later on, so a length mismatch
# would silently attach industries to the wrong sector.
assert len(sector_titles) == len(entries), (
    f"Found {len(sector_titles)} sector titles but {len(entries)} industry tables"
)

# Verify lengths of sector titles and entries
len(sector_titles), len(entries)

(11, 11)

In [None]:
# Pair each sector title with the industry rows scraped from its table.
industry_items = list(zip(sector_titles, entries))

# Turn off FutureWarnings (process-wide filter, kept from the original cell).
warnings.simplefilter(action='ignore', category=FutureWarning)

# How many sectors to scrape, counted from the start of the index page.
# Set to None to scrape every sector.
MAX_SECTORS = 2
# Seconds to wait for each HTTP response before giving up.
REQUEST_TIMEOUT = 30
# Cell texts treated as "no value".
MISSING_VALUES = {'', '-', 'n/a'}

INDUSTRY_COLUMNS = [
    'Industry', 'Stock Number', 'Market Cap($M)', 'Revenue($M)', 'Profits($M)',
    'Dividend Yield(%)', 'PE Ratio', 'Profit Margin(%)', 'Day Change(%)',
    'Year Change(%)', 'Link', 'Sector',
]
STOCK_COLUMNS = [
    'Company Name', 'Symbol', 'Market Cap($M)', 'Change(%)', 'Volume',
    'Revenue($M)', 'Industry', 'Sector',
]


def _clean_number_text(text):
    """Strip whitespace, thousands separators and '%'; return None for placeholders."""
    cleaned = text.strip().replace(',', '').replace('%', '')
    return None if cleaned in MISSING_VALUES else cleaned


def _to_float(text):
    """Parse a plain or percentage figure to float, or None when the cell is a placeholder."""
    cleaned = _clean_number_text(text)
    return None if cleaned is None else float(cleaned)


def _to_int(text):
    """Parse an integer figure (e.g. share volume), or None when the cell is a placeholder."""
    cleaned = _clean_number_text(text)
    return None if cleaned is None else int(cleaned)


def _to_millions(text):
    """Convert a figure with a T/B/M/K suffix to millions; None for placeholders.

    A value without a recognised suffix is returned unscaled, as most of the
    original parsing branches did.
    """
    cleaned = _clean_number_text(text)
    if cleaned is None:
        return None
    suffix, digits = cleaned[-1], cleaned[:-1]
    if suffix == 'T':
        return float(digits) * 1_000_000
    if suffix == 'B':
        return float(digits) * 1000
    if suffix == 'M':
        return float(digits)
    if suffix == 'K':
        return float(digits) / 1000
    return float(cleaned)


def _parse_stock_count(soup):
    """Return the leading integer of the page's 'title-group' div, or None if absent."""
    title = soup.find('div', {'class': 'title-group'})
    if title is None:
        return None
    words = title.text.split()
    first = words[0].replace(',', '') if words else ''
    return int(first) if first.isdigit() else None


def _parse_industry_metrics(soup):
    """Read the headline metric tiles of an industry page into a dict.

    Tiles are read positionally (0 cap, 1 revenue, 2 profits, 3 PE, 4 profit
    margin, last tile dividend yield). If fewer than five tiles are found,
    every metric is None instead of raising IndexError.
    """
    tiles = soup.find_all('div', {'class': 'mt-0.5 text-lg font-semibold bp:text-xl sm:mt-1.5 sm:text-2xl'})
    if len(tiles) < 5:
        return {
            'Market Cap($M)': None,
            'Revenue($M)': None,
            'Profits($M)': None,
            'Dividend Yield(%)': None,
            'PE Ratio': None,
            'Profit Margin(%)': None,
        }
    return {
        'Market Cap($M)': _to_millions(tiles[0].text),
        'Revenue($M)': _to_millions(tiles[1].text),
        'Profits($M)': _to_millions(tiles[2].text),
        'Dividend Yield(%)': _to_float(tiles[-1].text),
        'PE Ratio': _to_float(tiles[3].text),
        'Profit Margin(%)': _to_float(tiles[4].text),
    }


def _parse_stock_rows(soup):
    """Return one dict per stock row of the page's symbol table ([] if the table is missing)."""
    table = soup.find('table', {'class': 'symbol-table svelte-eurwtr'})
    if table is None:
        return []
    rows = []
    for row in table.find_all('tr')[1:]:  # Skip header row
        tds = row.find_all('td')
        if len(tds) < 7:
            # Not a full data row (columns 1-6 are read below); skip it.
            continue
        rows.append({
            'Company Name': tds[2].text.strip(),
            'Symbol': tds[1].text.strip(),
            'Market Cap($M)': _to_millions(tds[3].text),
            'Change(%)': _to_float(tds[4].text),
            'Volume': _to_int(tds[5].text),
            'Revenue($M)': _to_millions(tds[6].text),
        })
    return rows


# Collect plain records and build each DataFrame once at the end; growing a
# frame with pd.concat inside the loop is quadratic in the number of rows.
industry_records = []
stock_records = []

# Iterate over each sector and its industry rows
for sector_name, sector_entry in industry_items[:MAX_SECTORS]:
    for entry in sector_entry:
        # The index row holds the industry link plus its day and year change
        tds = entry.find_all('td')
        ind_url = tds[0].find('a')['href']
        link = f'https://stockanalysis.com{ind_url}'

        # Fetch the industry details page with fresh headers from get_headers()
        headers = get_headers()
        r = requests.get(link, headers=headers, timeout=REQUEST_TIMEOUT)
        r.raise_for_status()
        soup = BeautifulSoup(r.text, 'lxml')

        # Extract industry name
        name = soup.find('h1').text.strip()

        industry_records.append({
            'Industry': name,
            'Stock Number': _parse_stock_count(soup),
            **_parse_industry_metrics(soup),
            # Parsed to float here so values with thousands separators are
            # not lost when the column is later coerced to numeric.
            'Day Change(%)': _to_float(tds[-2].text),
            'Year Change(%)': _to_float(tds[-1].text),
            'Link': link,
            'Sector': sector_name,
        })

        for stock in _parse_stock_rows(soup):
            stock_records.append({**stock, 'Industry': name, 'Sector': sector_name})

# Industry dataframe (explicit columns keep the schema stable even when empty)
final_df = pd.DataFrame(industry_records, columns=INDUSTRY_COLUMNS)

# Stock dataframe
final_stock = pd.DataFrame(stock_records, columns=STOCK_COLUMNS)
    

In [41]:
# Add timestamps. The date is computed once so both frames always carry the
# same "as of" value, even if the cell runs across midnight.
as_of_date = pd.Timestamp.now().strftime('%Y-%m-%d')
final_df['Time(As of)'] = as_of_date
final_stock['Time(As of)'] = as_of_date

In [45]:
# Preview the last five industry rows to sanity-check the scrape.
final_df.tail(n=5)

Unnamed: 0,Industry,Stock Number,Market Cap($M),Revenue($M),Profits($M),Dividend Yield(%),PE Ratio,Profit Margin(%),Day Change(%),Year Change(%),Link,Sector,Time(As of)
25,Utilities - Renewable,25,219660.0,85370.0,4650.0,1.47,47.25,5.45,0.88,52.71,https://stockanalysis.com/stocks/industry/util...,Utilities,2024-10-11
26,Utilities - Regulated Gas,16,79400.0,42400.0,5110.0,3.45,15.53,12.06,1.1,18.46,https://stockanalysis.com/stocks/industry/util...,Utilities,2024-10-11
27,Utilities - Regulated Water,14,60330.0,14430.0,2730.0,2.33,22.07,18.95,1.13,14.59,https://stockanalysis.com/stocks/industry/util...,Utilities,2024-10-11
28,Utilities - Diversified,11,108290.0,65940.0,6370.0,3.44,17.0,9.66,0.39,16.52,https://stockanalysis.com/stocks/industry/util...,Utilities,2024-10-11
29,Utilities - Independent Power Producers,6,77780.0,50440.0,4240.0,1.37,18.34,8.41,0.02,190.37,https://stockanalysis.com/stocks/industry/util...,Utilities,2024-10-11


In [46]:
# List the distinct sectors present in the scraped industry table.
final_df.loc[:, 'Sector'].unique()

array(['Real Estate', 'Technology', 'Utilities'], dtype=object)

#### Save data to database

In [47]:
# Save industry data to SQLite Database.
# Rows are appended, so re-running this cell inserts the same snapshot again.
conn = sqlite3.connect('data/Marketcap.db')
try:
    # Insert to table
    final_df.to_sql('Industries', conn, if_exists='append', index=False)
    conn.commit()
    print('Data successfully added to database table')
finally:
    # Close the connection even if the insert fails, so the database file
    # is not left held open by the kernel.
    conn.close()

Data successfully added to database table


In [48]:
# Aggregate data by sector

# Convert relevant columns to numeric, forcing errors to NaN
for column in ('Profit Margin(%)', 'Day Change(%)', 'Year Change(%)'):
    final_df[column] = pd.to_numeric(final_df[column], errors='coerce')

# Totals for size metrics; simple (unweighted) means for the ratio metrics.
sect_df = final_df.groupby('Sector').agg({
    'Market Cap($M)': 'sum',
    'Revenue($M)': 'sum',
    'Profits($M)': 'sum',
    'Profit Margin(%)': 'mean',
    'Day Change(%)': 'mean',
    'Year Change(%)': 'mean',
    'Stock Number': 'sum'
}).reset_index()

# Round metrics: 1 decimal place for market cap and revenue, 2 for the rest
sect_df = sect_df.round({
    'Market Cap($M)': 1,
    'Revenue($M)': 1,
    'Profits($M)': 2,
    'Profit Margin(%)': 2,
    'Day Change(%)': 2,
    'Year Change(%)': 2
})

sect_df['Time(As of)'] = pd.Timestamp.now().strftime('%Y-%m-%d')

# Save to SQLite Database (rows are appended on every run).
conn = sqlite3.connect('data/Marketcap.db')
try:
    # Insert to table
    sect_df.to_sql('Sectors', conn, if_exists='append', index=False)
    conn.commit()
    print('Data successfully added to database table')
finally:
    # Close the connection even if the insert fails.
    conn.close()

Data successfully added to database table


In [49]:
# Save company stock data.
# Rows are appended, so re-running this cell inserts the same snapshot again.
conn = sqlite3.connect('data/Marketcap.db')
try:
    # Insert to table
    final_stock.to_sql('Companies', conn, if_exists='append', index=False)
    conn.commit()
    print('Data successfully added to database table')
finally:
    # Close the connection even if the insert fails, so the database file
    # is not left held open by the kernel.
    conn.close()

Data successfully added to database table


#### Explore trends

In [33]:
# Display available metrics
metrics = ['Market Cap($M)', 'Revenue($M)', 'Profits($M)', 'Profit Margin(%)', 'Day Change(%)', 'Year Change(%)']
print("Available metrics to sort by:")
for position, metric in enumerate(metrics, start=1):
    print(f"{position}. {metric}")

# Reset the result first so an invalid choice shows nothing, instead of
# raising NameError or re-displaying a table left over from an earlier run.
sorted_df = None

# Ask user for their choice; anything that is not a whole number is invalid.
try:
    user_choice = int(input("Enter the metric you want to sort by: "))
except ValueError:
    user_choice = None

# Valid choices are 1..len(metrics). Zero and negative numbers are rejected
# explicitly because they would otherwise index from the end of the list.
if user_choice is None or not 1 <= user_choice <= len(metrics):
    print("\nInvalid choice")
else:
    choice = metrics[user_choice - 1]
    # Sort aggregated data by the chosen metric, largest first
    sorted_df = sect_df.sort_values(by=choice, ascending=False).reset_index(drop=True)
    print(f"\nThese are the top sectors today by {choice}:")

# Display the sorted DataFrame (no output when the choice was invalid)
sorted_df

Available metrics to sort by:
1. Market Cap($M)
2. Revenue($M)
3. Profits($M)
4. Profit Margin(%)
5. Day Change(%)
6. Year Change(%)

These are the top sectors today by Profit Margin(%):


Unnamed: 0,Sector,Market Cap($M),Revenue($M),Profits($M),Profit Margin(%),Day Change(%),Year Change(%),Stock Number,Time(As of)
0,Financials,9724100.0,3635120.0,673716.84,14.17,1.42,21.68,945,2024-08-09 14:44
1,Technology,18321890.0,3026140.0,433429.65,10.74,3.31,25.78,783,2024-08-09 14:44
2,Utilities,1499020.0,709950.0,71850.0,10.56,2.03,29.5,110,2024-08-09 14:44
3,Energy,3557410.0,3406980.0,301215.8,10.27,2.18,3.84,249,2024-08-09 14:44
4,Real Estate,1604180.0,342040.0,31156.0,8.72,1.24,4.56,259,2024-08-09 14:44
5,Industrials,5543060.0,2844570.0,201483.43,7.66,2.16,17.97,651,2024-08-09 14:44
6,Consumer Staples,3990800.0,2886650.0,141820.0,7.01,0.89,6.84,248,2024-08-09 14:44
7,Consumer Discretionary,6949180.0,4468350.0,279601.89,5.92,2.52,8.45,576,2024-08-09 14:44
8,Communication Services,5570680.0,1928520.0,197906.39,5.51,1.33,12.09,249,2024-08-09 14:44
9,Materials,2022520.0,1189097.5,76657.42,3.74,2.14,6.46,272,2024-08-09 14:44
