# Industries

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from fake_useragent import UserAgent
import random
import sqlite3
import warnings

In [40]:
def get_headers():
    """Build browser-like HTTP request headers with a random User-Agent.

    A new ``UserAgent`` is created on every call and its ``random`` attribute
    supplies the 'User-Agent' value; all other header values are fixed.

    Returns:
        dict: header names mapped to header values, ready to pass as the
        ``headers`` argument of ``requests.get``.
    """
    agent_source = UserAgent()
    return {
        'User-Agent': agent_source.random,
        'Accept-Language': 'en-US,en;q=0.9',
        'Accept-Encoding': 'gzip, deflate, br',
        'Connection': 'keep-alive',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        # Do Not Track request header
        'DNT': '1',
        'Upgrade-Insecure-Requests': '1',
    }

#### Stock data extraction

In [14]:
# Fetch the industry index page and collect, for every sector, its title and
# the table rows describing its industries.

# Base URL for stock analysis by industry
url = 'https://stockanalysis.com/stocks/industry/'
headers = get_headers()

# Send request to the URL. The timeout keeps the cell from hanging forever on
# a stalled connection, and raise_for_status() turns an HTTP error page into
# an immediate, explicit failure instead of an AttributeError further down.
r = requests.get(url, headers=headers, timeout=30)
r.raise_for_status()
soup = BeautifulSoup(r.text, 'lxml')

# Extract sector titles (the text after the first ':' in each title group)
sectors = soup.find_all('div', {'class':'title-group'})
sector_titles = [title.text.split(':')[1].strip() for title in sectors]

# Extract industry tables
sector_div = soup.find('div', {'class':'space-y-8'})
tables = sector_div.find_all('table', {'class':'svelte-qmv8b3'})

# Extract entries from tables: one list of rows per table, header row skipped
entries = [
    table.find_all('tr', {'class':'svelte-qmv8b3'})[1:]
    for table in tables
]

# Verify lengths of sector titles and entries. The two lists are later paired
# positionally with zip(), which would silently truncate and mislabel sectors
# if the page layout changed and the counts drifted apart.
assert len(sector_titles) == len(entries), (
    f'Found {len(sector_titles)} sector titles but {len(entries)} industry tables'
)
len(sector_titles), len(entries)

(11, 11)

In [30]:
# Visit every industry page listed on the sector index, and build two frames:
#   final_df    - one row per industry (headline metrics)
#   final_stock - one row per company listed on an industry page

# List of sector titles and their corresponding entries
industry_items = list(zip(sector_titles, entries))

# Turn off FutureWarnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Seconds to wait for an industry page before failing instead of hanging
REQUEST_TIMEOUT = 30

# Placeholder strings treated as "no value available"
MISSING_VALUES = {'', '-', 'n/a'}

# Multipliers converting a suffixed amount into millions ('K' is handled by
# division in parse_millions)
UNIT_MULTIPLIERS = {'T': 1_000_000, 'B': 1000, 'M': 1}

INDUSTRY_COLUMNS = [
    'Industry', 'Stock Number', 'Market Cap($M)', 'Revenue($M)', 'Profits($M)',
    'Dividend Yield(%)', 'PE Ratio', 'Profit Margin(%)', 'Day Change(%)',
    'Year Change(%)', 'Link', 'Sector',
]
STOCK_COLUMNS = [
    'Company Name', 'Symbol', 'Market Cap($M)', 'Change(%)', 'Volume',
    'Revenue($M)', 'Industry', 'Sector',
]


def parse_number(text):
    """Convert a displayed figure such as '12.5%' or '1,234.5' to a float.

    Surrounding whitespace, thousands separators and a percent sign are
    removed first. Returns None when what remains is in MISSING_VALUES.
    """
    cleaned = text.strip().replace(',', '').replace('%', '')
    if cleaned.lower() in MISSING_VALUES:
        return None
    return float(cleaned)


def parse_millions(text):
    """Convert an amount with a K/M/B/T suffix (e.g. '1,234.5B') to millions.

    An amount without a suffix is taken to be in millions already. Returns
    None when the text is in MISSING_VALUES.
    """
    cleaned = text.strip().replace(',', '')
    if cleaned.lower() in MISSING_VALUES:
        return None
    suffix = cleaned[-1].upper()
    if suffix == 'K':
        return float(cleaned[:-1]) / 1000
    if suffix in UNIT_MULTIPLIERS:
        return float(cleaned[:-1]) * UNIT_MULTIPLIERS[suffix]
    return float(cleaned)


# Rows are collected as plain dicts and turned into DataFrames once at the
# end; concatenating a frame per row inside the loop is quadratic.
industry_records = []
stock_records = []

# Iterate over each sector and its entries
for sector_name, sector_entry in industry_items:

    # Iterate over each entry in the current sector
    for entry in sector_entry:
        # Extract link, day & year change for each entry.
        # The two change values are kept as text with the '%' removed.
        row_cells = entry.find_all('td')
        ind_url = row_cells[0].find('a')['href']
        day_change = row_cells[-2].text.strip().replace('%', '')
        y_change = row_cells[-1].text.strip().replace('%', '')

        # Construct the full link to the industry details page
        link = f'https://stockanalysis.com{ind_url}'

        # Request the industry details page with a fresh random user agent.
        # Distinct names are used so the index page's `r` / `soup` / `headers`
        # from the earlier cell are not overwritten.
        page = requests.get(link, headers=get_headers(), timeout=REQUEST_TIMEOUT)
        page.raise_for_status()
        page_soup = BeautifulSoup(page.text, 'lxml')

        # Extract industry name
        name = page_soup.find('h1').text.strip()

        # Extract stock number: the first whitespace-separated token of the
        # title group text
        stock_no_text = page_soup.find('div', {'class':'title-group'}).text.strip()
        stock_no = int(stock_no_text.split()[0].replace(',', ''))

        # Extract financial metrics; read by position:
        # 0 market cap, 1 revenue, 2 profits, 3 PE ratio, 4 profit margin,
        # last dividend yield
        metric_divs = page_soup.find_all('div', {'class':'mt-0.5 text-lg font-semibold bp:text-xl sm:mt-1.5 sm:text-2xl'})

        industry_records.append({
            'Industry': name,
            'Stock Number': stock_no,
            'Market Cap($M)': parse_millions(metric_divs[0].text),
            'Revenue($M)': parse_millions(metric_divs[1].text),
            'Profits($M)': parse_millions(metric_divs[2].text),
            'Dividend Yield(%)': parse_number(metric_divs[-1].text),
            'PE Ratio': parse_number(metric_divs[3].text),
            'Profit Margin(%)': parse_number(metric_divs[4].text),
            'Day Change(%)': day_change,
            'Year Change(%)': y_change,
            'Link': link,
            'Sector': sector_name,
        })

        # Extract stock data (header row skipped)
        stock_table = page_soup.find('table', {'class':'symbol-table svelte-eurwtr'})
        for st_item in stock_table.find_all('tr')[1:]:
            stock_cells = st_item.find_all('td')

            # Volume stays an integer count; only the placeholder maps to None
            volume_text = stock_cells[5].text.strip().replace(',', '')
            volume = None if volume_text in MISSING_VALUES else int(volume_text)

            stock_records.append({
                'Company Name': stock_cells[2].text.strip(),
                'Symbol': stock_cells[1].text.strip(),
                'Market Cap($M)': parse_millions(stock_cells[3].text),
                'Change(%)': parse_number(stock_cells[4].text),
                'Volume': volume,
                'Revenue($M)': parse_millions(stock_cells[6].text),
                'Industry': name,
                'Sector': sector_name,
            })

# Build the final frames once and stamp both with the same collection time
scraped_at = pd.Timestamp.now().strftime('%Y-%m-%d %H:%M')

# Industry dataframe
final_df = pd.DataFrame(industry_records, columns=INDUSTRY_COLUMNS)
final_df['Time(As of)'] = scraped_at

# Stock dataframe
final_stock = pd.DataFrame(stock_records, columns=STOCK_COLUMNS)
final_stock['Time(As of)'] = scraped_at

#### Save data to database

In [33]:
# Save industry data to SQLite Database.
# Appends the rows of final_df to the 'Industries' table of data/Marketcap.db.
conn = sqlite3.connect('data/Marketcap.db')
try:
    # Insert to table
    final_df.to_sql('Industries', conn, if_exists='append', index=False)
    conn.commit()
    print('Data successfully added to database table')
finally:
    # Close the connection even when the insert fails, so the database file
    # handle is not left open in the kernel
    conn.close()

Data successfully added to database table


In [35]:
# Aggregate data by sector and append the result to the 'Sectors' table.

# How each industry-level column is combined into a sector-level figure
sector_aggregations = {
    'Market Cap($M)': 'sum',
    'Revenue($M)': 'sum',
    'Profits($M)': 'sum',
    'Profit Margin(%)': 'mean',
    'Day Change(%)': 'mean',
    'Year Change(%)': 'mean',
    'Stock Number': 'sum'
}

# Convert every aggregated column to numeric, forcing errors to NaN.
# The change columns are scraped as text, and columns holding missing values
# may have ended up with object dtype; already-numeric columns are unaffected.
# Note: this updates final_df in place.
for column in sector_aggregations:
    final_df[column] = pd.to_numeric(final_df[column], errors='coerce')

sect_df = final_df.groupby('Sector').agg(sector_aggregations).reset_index()

# Round metrics: totals for market cap and revenue to 1 decimal place,
# the remaining metrics to 2 decimal places
sect_df = sect_df.round({
    'Market Cap($M)': 1,
    'Revenue($M)': 1,
    'Profits($M)': 2,
    'Profit Margin(%)': 2,
    'Day Change(%)': 2,
    'Year Change(%)': 2
})

sect_df['Time(As of)'] = pd.Timestamp.now().strftime('%Y-%m-%d %H:%M')

# Save to SQLite Database
conn = sqlite3.connect('data/Marketcap.db')
try:
    # Insert to table
    sect_df.to_sql('Sectors', conn, if_exists='append', index=False)
    conn.commit()
    print('Data successfully added to database table')
finally:
    # Close the connection even when the insert fails
    conn.close()

Data successfully added to database table


In [37]:
# Save company stock data.
# Replaces the 'Companies' table of data/Marketcap.db with the rows of
# final_stock (unlike the other tables, which are appended to).
conn = sqlite3.connect('data/Marketcap.db')
try:
    # Insert to table
    final_stock.to_sql('Companies', conn, if_exists='replace', index=False)
    conn.commit()
    print('Data successfully added to database table')
finally:
    # Close the connection even when the write fails
    conn.close()

Data successfully added to database table


#### Explore trends

In [36]:
# Let the user pick one sector-level metric and show the sectors ranked by it.

# Display available metrics
metrics = ['Market Cap($M)', 'Revenue($M)', 'Profits($M)', 'Profit Margin(%)', 'Day Change(%)', 'Year Change(%)']
print("Available metrics to sort by:")
for position, metric in enumerate(metrics, start=1):
    print(f"{position}. {metric}")

# Ask user for their choice
raw_choice = input("Enter the metric you want to sort by: ")
try:
    user_choice = int(raw_choice)
except ValueError:
    # Non-numeric input is reported as an invalid choice rather than crashing
    user_choice = 0

# Only 1..len(metrics) is valid. Checking the lower bound matters: 0 or a
# negative number would otherwise index the list from the end and silently
# pick an unintended metric.
if 1 <= user_choice <= len(metrics):
    choice = metrics[user_choice - 1]
    # Sort aggregated data by the chosen metric
    sorted_df = sect_df.sort_values(by=choice, ascending=False).reset_index(drop=True)

    print(f"\nThese are the top sectors today by {choice}:")
else:
    # Reset so that a stale table from an earlier run is not displayed
    sorted_df = None
    print("\nInvalid choice")

# Display the sorted DataFrame (nothing is shown after an invalid choice)
sorted_df

Available metrics to sort by:
1. Market Cap($M)
2. Revenue($M)
3. Profits($M)
4. Profit Margin(%)
5. Day Change(%)
6. Year Change(%)

These are the top sectors today by Revenue($M):


Unnamed: 0,Sector,Market Cap($M),Revenue($M),Profits($M),Profit Margin(%),Day Change(%),Year Change(%),Stock Number,Time(As of)
0,Consumer Discretionary,7408710.0,4458030.0,271980.85,5.09,1.28,10.55,578,2024-08-01 00:07
1,Healthcare,8443980.0,4002200.0,130391.24,0.92,0.37,11.06,1181,2024-08-01 00:07
2,Financials,10427150.0,3864600.0,677296.53,13.79,0.59,26.6,951,2024-08-01 00:07
3,Energy,3730050.0,3479590.0,333229.84,11.05,1.96,11.75,250,2024-08-01 00:07
4,Technology,19456590.0,3003710.0,408212.15,9.78,3.61,30.92,787,2024-08-01 00:07
5,Consumer Staples,3961450.0,2900970.0,146179.09,7.33,-0.11,6.24,251,2024-08-01 00:07
6,Industrials,5790270.0,2825170.0,215505.06,7.55,1.18,24.24,652,2024-08-01 00:07
7,Communication Services,5688450.0,1906420.0,196383.67,3.89,0.84,12.66,250,2024-08-01 00:07
8,Materials,2134690.0,1168828.0,77880.63,4.72,2.41,11.36,271,2024-08-01 00:07
9,Utilities,1496830.0,665540.0,61580.0,9.71,3.06,27.85,107,2024-08-01 00:07
