### Company profiles

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import random
import sqlite3
from sklearn.preprocessing import MinMaxScaler
import warnings
import re
import datetime
import json 
from pymongo import MongoClient
import os, sys
sys.path.append(os.path.abspath('../functions'))
from common_functions import *

In [2]:
# Query the SQLite database for the companies belonging to a user-selected sector.
# The path uses forward slashes throughout: the original contained a lone backslash
# ("\D"), which is an invalid escape sequence and triggers a SyntaxWarning.
conn = sqlite3.connect('C:/Users/user/Desktop/Repositories/Finance-Analytics/data/Marketcap.db')
table_name = 'Companies'

try:
    cursor = conn.cursor()

    # Load the full table so the available sectors can be listed
    cursor.execute(f"SELECT * FROM {table_name}")
    result = cursor.fetchall()

    # Convert the result to a pandas DataFrame
    df = pd.DataFrame(result, columns=[description[0] for description in cursor.description])

    # Display available sectors as a 1-based menu
    print("Available sectors:")
    sectors = df['Sector'].unique()
    for i, sector in enumerate(sectors, start=1):
        print(f"{i}. {sector}")

    # Read the menu choice; reject out-of-range numbers explicitly, because
    # 0 or a negative value would otherwise silently index from the end of the list
    user_input = int(input("Enter the sector you want to extract: "))
    if not 1 <= user_input <= len(sectors):
        raise ValueError(
            f"Sector choice must be between 1 and {len(sectors)}, got {user_input}."
        )
    company_sector = sectors[user_input-1]

    # Fetch only the rows of the chosen sector (value bound as a query parameter)
    query = f"SELECT * FROM {table_name} WHERE Sector = ?"
    cursor.execute(query, (company_sector,))
    result = cursor.fetchall()
finally:
    # Close the connection even if the prompt or a query raised
    conn.close()

# Silence DeprecationWarning for the rest of the session
warnings.simplefilter(action='ignore', category=DeprecationWarning)

# Rebuild the sector rows as a DataFrame, ordered alphabetically by company name
column_names = [col[0] for col in cursor.description]
return_df = pd.DataFrame(result, columns=column_names).sort_values(by='Company Name', ascending=True)

# Min-max normalise market cap and revenue into two helper columns
scaler = MinMaxScaler()
normalized = scaler.fit_transform(return_df[['Market Cap($M)', 'Revenue($M)']])
return_df[['Market Cap(nm)', 'Revenue(nm)']] = normalized

# Combined score = normalised market cap + normalised revenue
return_df['Combined Score'] = return_df['Market Cap(nm)'] + return_df['Revenue(nm)']

# Keep the 10 highest-scoring companies of every industry, remove duplicate rows,
# then discard the helper columns that were only needed for ranking
top_combined = (
    return_df
    .groupby('Industry')
    .apply(lambda group: group.nlargest(10, 'Combined Score'))
    .reset_index(drop=True)
    .drop_duplicates()
    .reset_index(drop=True)
    .drop(['Market Cap(nm)', 'Revenue(nm)', 'Combined Score'], axis=1)
)

# Scraping order: largest revenue first, with a fresh 0..n-1 index
df_scrape = (
    top_combined
    .copy()
    .sort_values(by='Revenue($M)', ascending=False)
    .reset_index(drop=True)
)

# Pull out the three columns needed per stock and pair them up row by row
tickers, company_names, industries = (
    df_scrape[column].tolist() for column in ('Symbol', 'Company Name', 'Industry')
)
stock_items = list(zip(tickers, company_names, industries))

print(f"\nGenerated {len(stock_items)} stock items for the {company_sector} sector.")

  conn = sqlite3.connect('C:/Users/user\Desktop/Repositories/Finance-Analytics/data/Marketcap.db')


Available sectors:
1. Communication Services
2. Consumer Discretionary
3. Consumer Staples
4. Energy
5. Financials
6. Healthcare
7. Industrials
8. Materials
9. Real Estate
10. Technology
11. Utilities

Generated 68 stock items for the Communication Services sector.


In [3]:
# Preview the first ten (ticker, company name, industry) tuples
stock_items[:10]

[('GOOGL', 'Alphabet Inc.', 'Internet Content & Information'),
 ('META', 'Meta Platforms, Inc.', 'Internet Content & Information'),
 ('VZ', 'Verizon Communications Inc.', 'Telecom Services'),
 ('T', 'AT&T Inc.', 'Telecom Services'),
 ('CMCSA', 'Comcast Corporation', 'Telecom Services'),
 ('DIS', 'The Walt Disney Company', 'Entertainment'),
 ('TMUS', 'T-Mobile US, Inc.', 'Telecom Services'),
 ('CHTR', 'Charter Communications, Inc.', 'Telecom Services'),
 ('ORAN', 'Orange S.A.', 'Telecom Services'),
 ('AMX', 'AmÃ©rica MÃ³vil, S.A.B. de C.V.', 'Telecom Services')]

In [5]:
# Google Finance root URL and the UI language passed as the `hl` query parameter
base, lang = "https://www.google.com/finance", "en"

# MongoDB client for the local server and a handle on the 'Finance' database
client = MongoClient('mongodb://localhost:27017/')
db = client['Finance']


def get_collection_by_sector(company_sector):
    """Return the collection of the 'Finance' database named after ``company_sector``."""
    sector_collection = db[company_sector]
    return sector_collection

# --- Scraping configuration ---------------------------------------------------
# Exchanges tried, in order, when looking up a ticker's quote page
EXCHANGES = ("NASDAQ", "NYSE", "NYSEAMERICAN")
# Placeholders the page shows instead of a value
MISSING_MARKERS = ('—', '-')
# Seconds to wait for an HTTP response before giving up on a request
REQUEST_TIMEOUT = 30


def _to_float(text):
    """Parse text such as '1,234.5', '$12.30' or '4.2%' into a float.

    Returns None when the text is one of the missing-value placeholders.
    """
    text = text.strip()
    if text in MISSING_MARKERS:
        return None
    return float(text.replace('$', '').replace('%', '').replace(',', ''))


def _to_scaled(text):
    """Pass ``text`` through ``convert_values``; None for a missing-value placeholder.

    NOTE(review): ``convert_values`` comes from ``common_functions``; judging by the
    key suffixes used below it presumably returns the amount in billions - confirm.
    """
    text = text.strip()
    return None if text in MISSING_MARKERS else convert_values(text)


# Row title -> (suffix appended to the title to form the stored key, converter).
# 'EBITDA' was previously misspelled 'EBIDTA', so that row was never converted.
INCOME_FIELDS = {
    'Revenue': (' ($B)', _to_scaled),
    'Net income': (' ($B)', _to_scaled),
    'Operating expense': (' ($B)', _to_scaled),
    'EBITDA': (' ($B)', _to_scaled),
    'Net profit margin': (' (%)', _to_float),
    'Earnings per share': (' ($)', _to_float),
    'Effective tax rate': (' (%)', _to_float),
}
BALANCE_FIELDS = {
    'Cash and short-term investments': (' ($B)', _to_scaled),
    'Total assets': (' ($B)', _to_scaled),
    'Total liabilities': (' ($B)', _to_scaled),
    'Total equity': (' ($B)', _to_scaled),
    'Shares outstanding': (' (B)', _to_scaled),
    # Empty suffix: the numeric value replaces the description stored under the title
    'Price to book': ('', _to_float),
    'Return on assets': (' (%)', _to_float),
    'Return on capital': (' (%)', _to_float),
}
CASH_FLOW_FIELDS = {
    'Cash from operations': (' ($B)', _to_scaled),
    'Cash from investing': (' ($B)', _to_scaled),
    'Cash from financing': (' ($B)', _to_scaled),
    'Net change in cash': (' ($B)', _to_scaled),
    'Free cash flow': (' ($B)', _to_scaled),
}


def _fetch_quote_soup(ticker):
    """Return the parsed quote page of ``ticker`` from the first exchange that has one.

    Each exchange in ``EXCHANGES`` is tried in order; a page counts as found when it
    contains the 'zzDege' header div. Returns None if no exchange yields such a page.
    """
    for exchange in EXCHANGES:
        url = f"{base}/quote/{ticker}:{exchange}?hl={lang}"
        try:
            page = requests.get(url, headers=get_headers(), timeout=REQUEST_TIMEOUT)
        except requests.RequestException as exc:
            # A network failure on one exchange should not abort the whole run
            print(f"Request failed for {ticker} on {exchange}: {exc}")
            continue
        soup = BeautifulSoup(page.text, 'lxml')
        if soup.find('div', {'class': 'zzDege'}) is not None:
            return soup
    return None


def _parse_summary(soup):
    """Collect the label/value pairs of the 'gyFHrc' summary items into a dict.

    Known labels are renamed and/or converted to numbers; all other labels are
    stored with their raw text value.
    """
    summary = {}
    for item in soup.find_all("div", {"class": "gyFHrc"}):
        label_div = item.find("div", {"class": "mfs7Fc"})
        value_div = item.find("div", {"class": "P6K39c"})
        if label_div is None or value_div is None:
            continue  # malformed entry: nothing usable to store
        label, value = label_div.text, value_div.text

        if label == 'Market cap':
            label, value = 'Market Cap($B)', _to_scaled(value)
        elif label == 'Avg Volume':
            # Converted value times 1000, stored under the '(M)' key
            scaled = _to_scaled(value)
            label = 'Avg Volume(M)'
            value = None if scaled is None else round(scaled * 1000, 2)
        elif label == 'P/E ratio':
            value = _to_float(value)
        elif label == 'Dividend yield':
            label, value = 'Dividend yield (%)', _to_float(value)
        elif label == 'Employees':
            value = None if value.strip() in MISSING_MARKERS else int(value.replace(',', ''))
        elif label == 'Previous close':
            value = _to_float(value)

        summary[label] = value
    return summary


def _parse_financial_table(table, field_specs, header_rows=1):
    """Turn one financial table into a dict.

    For every row after the first ``header_rows`` rows this stores, in order:
    the row's description under its title, the converted value under
    ``title + suffix`` when the title appears in ``field_specs``, and the
    year-over-year change under ``'<title> Y/Y change(%)'``.
    """
    info = {}
    for row in table.find_all('tr')[header_rows:]:
        title_div = row.find("div", {"class": "rsPbEe"})
        value_cell = row.find('td', {'class': 'QXDnM'})
        if title_div is None or value_cell is None:
            continue  # not a data row
        title = title_div.text
        description_div = row.find("div", {"class": "EY8ABd-OWXEXe-TAWMXe"})
        info[title] = description_div.text if description_div is not None else None
        raw_value = value_cell.text.strip()

        spec = field_specs.get(title)
        if spec is not None:
            suffix, convert = spec
            if title == 'Earnings per share' and 'K' in raw_value:
                # Values containing a 'K' are kept verbatim under a separate key
                info[f'{title} value'] = raw_value
            else:
                info[f'{title}{suffix}'] = convert(raw_value)

        # Year-over-year change
        change_cell = row.find('td', {'class': 'gEUVJe'})
        info[f'{title} Y/Y change(%)'] = _to_float(change_cell.text) if change_cell is not None else None
    return info


# Loop through the stock items and extract the data
# NOTE(review): the slice skips the first three items - presumably they were stored
# in an earlier run; confirm before re-running.
for ticker, company_name, industry in stock_items[3:30]:
    soup = _fetch_quote_soup(ticker)
    if soup is None:
        print(f"Could not find stock info for {company_name} ({ticker}).\n")
        continue

    # Extract stock info
    stock_description = {
        'Name': soup.find('div', {'class': 'zzDege'}).text.strip(),
        'Symbol': ticker,
        'Industry': industry,
    }
    about_txt = soup.find('div', {'class': 'bLLb2d'})
    stock_description['About'] = about_txt.text.strip().replace(' Wikipedia', '') if about_txt else None

    # Stock summaries (market cap, volume, P/E ratio, ...)
    stock_description.update(_parse_summary(soup))

    # Financial sheets: income statement, balance sheet and cash flow, in that order.
    # find_all() returns an empty result rather than None, so a length check suffices.
    tables = soup.find_all('table', {'class': 'slpEwd'})
    if len(tables) < 3:
        print(f"Could not find financial data for {company_name} ({ticker}).\n")
        continue

    income_table, balance_table, cash_table = tables[:3]
    stock_description['Income'] = _parse_financial_table(income_table, INCOME_FIELDS)
    stock_description['Balance Sheet'] = _parse_financial_table(balance_table, BALANCE_FIELDS)
    # The cash-flow table skips two leading rows instead of one
    stock_description['Cash Flow'] = _parse_financial_table(cash_table, CASH_FLOW_FIELDS, header_rows=2)

    # Get top news leads (at most five), keyed by headline
    # NOTE(review): headlines become MongoDB field names; verify that the target
    # server accepts dots in field names.
    news_leads = {}
    for n_item in soup.find_all('div', {'class': 'yY3Lee'})[:5]:
        title_div = n_item.find('div', {'class': 'Yfwt5'})
        link = n_item.find('a')
        if title_div is None or link is None or not link.get('href'):
            continue  # incomplete news card
        news_leads[title_div.text.strip()] = link['href']
    stock_description['News Leads'] = news_leads

    # Add current time as 'As of' value
    now = datetime.datetime.now()
    stock_description['Time(As of)'] = now.strftime("%Y-%m-%d %H:%M:%S")

    # Insert the stock description into the appropriate collection
    collection = get_collection_by_sector(company_sector)
    collection.insert_one(stock_description)

Could not find stock info for Orange S.A. (ORAN).

Could not find financial data for NetEase, Inc. (NTES).

Could not find financial data for Sirius XM Holdings Inc. (SIRI).



### Key metrics

#### Income Statement
1. **Revenue**: Indicates the company's total income from its operations.
2. **Operating Expense**: Reflects the costs required to run the company’s core business.
3. **Net Income**: Represents the company's profit after all expenses have been deducted from revenues.
4. **Net Profit Margin**: Shows the percentage of revenue that remains as profit after all expenses.
5. **Earnings Per Share (EPS)**: Measures the profitability on a per-share basis.
6. **EBITDA**: Earnings before interest, taxes, depreciation, and amortization, indicating operational profitability.
7. **Effective Tax Rate**: The average rate at which the company’s pre-tax profits are taxed.

#### Balance Sheet
1. **Cash and Short-term Investments**: Indicates the liquidity and available capital for operations and investments.
2. **Total Assets**: Represents the total resources owned by the company.
3. **Total Liabilities**: Reflects the company's total obligations.
4. **Total Equity**: Indicates the net worth of the company (total assets minus total liabilities).
5. **Shares Outstanding**: Represents the total number of shares currently held by all shareholders.
6. **Price to Book (P/B) Ratio**: Compares the company's market value to its book value.
7. **Return on Assets (ROA)**: Measures how efficiently the company is using its assets to generate profit.
8. **Return on Capital (ROC)**: Indicates the return the company is generating on its invested capital.

#### Cash Flow Statement
1. **Cash from Operations**: Represents the cash generated from the company's core business operations.
2. **Cash from Investing**: Indicates cash spent on (or received from) investments such as equipment and acquisitions.
3. **Cash from Financing**: Reflects cash flows related to borrowing and repaying debt, issuing stock, etc.
4. **Net Change in Cash**: Shows the overall change in the company's cash position.
5. **Free Cash Flow (FCF)**: Indicates the cash available after capital expenditures for the company to expand, pay dividends, etc.
