### Company profiles

In [None]:
import random
import sqlite3
import warnings

import pandas as pd
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from pymongo import MongoClient

In [None]:
# Function to get headers with a random user agent
def get_headers():
    """Build browser-like HTTP request headers with a random User-Agent.

    A new ``UserAgent`` is created on every call and its ``random``
    attribute supplies the ``User-Agent`` value; all other headers are
    fixed.

    Returns:
        dict: Header names mapped to header values, ready to pass as the
        ``headers`` argument of ``requests.get``.
    """
    random_agent = UserAgent().random
    return {
        'User-Agent': random_agent,
        'Accept-Language': 'en-US,en;q=0.9',
        'Accept-Encoding': 'gzip, deflate, br',
        'Connection': 'keep-alive',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'DNT': '1',  # Do Not Track request header
        'Upgrade-Insecure-Requests': '1',
    }
 

# Function to convert values
def convert_values(value):
    """Convert a magnitude-suffixed amount to a float expressed in billions.

    Args:
        value: Either a number (``int`` or ``float``), which is returned as a
            float without scaling, or a string holding a number followed by
            one of the suffixes ``'M'`` (millions), ``'B'`` (billions) or
            ``'T'`` (trillions), e.g. ``"1.5T"``. Surrounding whitespace and
            thousands separators (``","``) are ignored.

    Returns:
        float: The amount in billions.

    Raises:
        ValueError: If the string is empty, has no recognised suffix, or its
            numeric part cannot be parsed.
    """
    if isinstance(value, (int, float)):
        return float(value)  # Already numeric: nothing to scale

    # Normalise: trim whitespace and drop thousands separators ("1,200.5M")
    str_value = value.strip().replace(',', '')

    # Only the trailing character is a magnitude marker; slicing keeps the
    # empty string safe (suffix == '' falls through to the error below).
    suffix, number = str_value[-1:], str_value[:-1].strip()

    if suffix == 'B':
        return float(number)
    if suffix == 'T':
        return float(number) * 1000  # Convert trillions to billions
    if suffix == 'M':
        return float(number) / 1000  # Convert millions to billions
    raise ValueError(f"Unknown format: {value!r}")


: 

In [None]:
# Query the SQLite database for the companies of one user-selected sector,
# then keep the top 20 companies per industry ranked by a combined
# (normalized market cap + normalized revenue) score.
table_name = 'Companies'

conn = sqlite3.connect('data/Marketcap.db')
try:
    cursor = conn.cursor()

    # Load the whole table once to discover which sectors exist
    cursor.execute(f"SELECT * FROM {table_name}")
    result = cursor.fetchall()
    df = pd.DataFrame(result, columns=[description[0] for description in cursor.description])

    # Display available sectors as a 1-based menu
    print("Available sectors:")
    sectors = df['Sector'].unique()
    for i, sector in enumerate(sectors, start=1):
        print(f"{i}. {sector}")

    # Read the menu number and validate it: without the range check, 0 or a
    # negative number would silently index the array from the end.
    user_input = int(input("Enter the sector you want to extract: "))
    if not 1 <= user_input <= len(sectors):
        raise ValueError(
            f"Sector number must be between 1 and {len(sectors)}, got {user_input}"
        )
    choice = sectors[user_input-1]

    # Extract the rows whose 'Sector' column matches the chosen sector
    query = f"SELECT * FROM {table_name} WHERE Sector = ?"
    cursor.execute(query, (choice,))
    result = cursor.fetchall()

    # Capture the column names while the connection is still open
    columns = [description[0] for description in cursor.description]
finally:
    # Close the connection even if a query or the input step failed
    conn.close()

# Turn off warnings
warnings.simplefilter(action='ignore', category=DeprecationWarning)

# Convert the result to a pandas DataFrame and sort dataframe alphabetically
df = pd.DataFrame(result, columns=columns)
df = df.sort_values(by='Company Name', ascending=True)

# Normalize the 'Market Cap($M)' and 'Revenue($M)' columns
# NOTE(review): MinMaxScaler is not imported in the visible cells --
# presumably sklearn.preprocessing.MinMaxScaler; confirm it is imported
# before this cell runs.
scaler = MinMaxScaler()
df[['Market Cap(nm)', 'Revenue(nm)']] = scaler.fit_transform(df[['Market Cap($M)', 'Revenue($M)']])

# Create a combined score based on normalized market cap and revenue
df['Combined Score'] = df['Market Cap(nm)'] + df['Revenue(nm)']

# Get the top 20 companies per industry based on combined relative score
top_combined = df.groupby('Industry').apply(lambda x: x.nlargest(20, 'Combined Score')).reset_index(drop=True)
top_combined = top_combined.drop_duplicates().reset_index(drop=True)

# Drop the helper columns that were only needed for ranking
top_combined = top_combined.drop(['Market Cap(nm)', 'Revenue(nm)', 'Combined Score'], axis=1)

# Tickers to scrape, largest revenue first
df_scrape = top_combined.copy()
df_scrape = df_scrape.sort_values(by='Revenue($M)', ascending=False)
tickers = df_scrape['Symbol'].tolist()
len(tickers)

In [None]:
# Scrape a Google Finance profile (summary fields plus income statement,
# balance sheet and cash flow tables) for every symbol in `tickers` and
# collect one dict per company in `all_stock_descriptions`.
base = "https://www.google.com/finance"
lang = "en"

# Exchanges to try, in order, when resolving a ticker to a quote page
exchanges = ("NASDAQ", "NYSE")


def _fetch_quote_soup(symbol):
    """Return the parsed quote page for `symbol`, or None if none is found.

    Each exchange in `exchanges` is tried in turn; a page counts as found
    when it contains the company-name element (div class 'zzDege').
    """
    for index in exchanges:
        url = f"{base}/quote/{symbol}:{index}?hl={lang}"
        try:
            # A timeout keeps one stalled request from hanging the whole run
            page = requests.get(url, headers=get_headers(), timeout=30)
        except requests.RequestException as err:
            print(f"Request for {symbol}:{index} failed: {err}")
            continue
        soup = BeautifulSoup(page.text, 'lxml')
        if soup.find('div', {'class': 'zzDege'}):
            return soup
    return None


def _parse_statement(table):
    """Turn one financial-statement table into a flat dict.

    For every data row three keys are written: the row title (mapped to its
    description text), '<title> value' and '<title> Y/Y change(%)'. The
    change is a float, or None when the cell shows an em dash.
    """
    info = {}
    for row in table.find_all('tr')[1:]:  # The first row is the table header
        title_tag = row.find("div", {"class": "rsPbEe"})
        description_tag = row.find("div", {"class": "EY8ABd-OWXEXe-TAWMXe"})
        value_tag = row.find('td', {'class': 'QXDnM'})
        change_tag = row.find('td', {'class': 'gEUVJe'})
        if None in (title_tag, description_tag, value_tag, change_tag):
            continue  # Row does not have the expected layout; skip it

        item_title = title_tag.text
        info[item_title] = description_tag.text
        info[f'{item_title} value'] = value_tag.text.strip()
        change = change_tag.text.replace('%', '').replace(',', '').strip()
        info[f'{item_title} Y/Y change(%)'] = None if change == '—' else float(change)
    return info


# List to store all stock descriptions
all_stock_descriptions = []

for symbol in tickers:
    soup = _fetch_quote_soup(symbol)
    if soup is None:
        # Previously an unlisted ticker raised AttributeError and aborted the run
        print(f"Skipping {symbol}: no quote page found on {', '.join(exchanges)}")
        continue

    # Extract stock info
    stock_description = {}
    stock_description['Name'] = soup.find('div', {'class': 'zzDege'}).text.strip()
    stock_description['Symbol'] = symbol
    about_txt = soup.find('div', {'class': 'bLLb2d'})
    stock_description['About'] = about_txt.text.strip() if about_txt else None

    # Summary label/value pairs (the conversions below read 'Previous close',
    # 'Market cap' and 'Employees' from these)
    for item in soup.find_all("div", {"class": "gyFHrc"}):
        label_tag = item.find("div", {"class": "mfs7Fc"})
        value_tag = item.find("div", {"class": "P6K39c"})
        if label_tag is None or value_tag is None:
            continue
        stock_description[label_tag.text] = value_tag.text

    # Get data from company financial sheets; the code relies on the page
    # listing them in the order income, balance sheet, cash flow
    tables = soup.find_all('table', {'class': 'slpEwd'})
    if len(tables) < 3:
        print(f"Skipping {symbol}: expected 3 financial tables, found {len(tables)}")
        continue

    stock_description['Income'] = _parse_statement(tables[0])
    stock_description['Balance Sheet'] = _parse_statement(tables[1])
    stock_description['Cash Flow'] = _parse_statement(tables[2])

    # Convert values to appropriate data types; a field missing from the
    # page is left out instead of raising KeyError
    if 'Employees' in stock_description:
        stock_description['Employees'] = int(stock_description['Employees'].replace(',', ''))
    if 'Previous close' in stock_description:
        # Strip the currency sign and thousands separators ("$1,234.56")
        stock_description['Previous close'] = float(
            stock_description['Previous close'].replace('$', '').replace(',', '')
        )
    if 'Market cap' in stock_description:
        stock_description['Market cap'] = convert_values(
            stock_description['Market cap'].replace('USD', '').strip()
        )

    # Append the stock description to the list
    all_stock_descriptions.append(stock_description)

### Key metrics

#### Income Statement
1. **Revenue**: - Indicates the company's total income from its operations.
2. **Operating Expense**: - Reflects the costs required to run the company’s core business.
3. **Net Income**: - Represents the company's profit after all expenses have been deducted from revenues.
4. **Net Profit Margin**: - Shows the percentage of revenue that remains as profit after all expenses.
5. **Earnings Per Share (EPS)**: - Measures the profitability on a per-share basis.
6. **EBITDA**: - Earnings before interest, taxes, depreciation, and amortization, indicating operational profitability.
7. **Effective Tax Rate**: - The average rate at which the company’s pre-tax profits are taxed.

#### Balance Sheet
1. **Cash and Short-term Investments**: - Indicates the liquidity and available capital for operations and investments.
2. **Total Assets**: - Represents the total resources owned by the company.
3. **Total Liabilities**: - Reflects the company's total obligations.
4. **Total Equity**: - Indicates the net worth of the company.
5. **Shares Outstanding**: - Represents the total number of shares currently held by all shareholders.
6. **Price to Book (P/B) Ratio**: - Compares the company's market value to its book value.
7. **Return on Assets (ROA)**: - Measures how efficiently the company is using its assets to generate profit.
8. **Return on Capital (ROC)**: - Indicates the return the company is generating on its invested capital.

#### Cash Flow Statement
1. **Cash from Operations**: - Represents the cash generated from the company's core business operations.
2. **Cash from Investing**: - Indicates the net cash flow from investing activities, such as buying or selling equipment, acquisitions, and securities.
3. **Cash from Financing**: - Reflects cash flows related to borrowing and repaying debt, issuing stock, etc.
4. **Net Change in Cash**: - Shows the overall change in the company's cash position.
5. **Free Cash Flow (FCF)**: - Indicates the cash available after capital expenditures for the company to expand, pay dividends, etc.
