## TASK-3

In [None]:
import csv
import time
import pandas as pd
import re
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException


In [None]:
# --- Function to extract data for 4th structure from a page ---
def extract_company_info(driver):
    data = {}
    
    def safe_find_text(by, selector):
        try:
            return driver.find_element(by, selector).text.strip()
        except NoSuchElementException:
            return ""
    
    # Extract all key-value blocks inside the container
    # Blocks with label and value classes
    
    blocks = driver.find_elements(By.CSS_SELECTOR, "div.block-QCJM7wcY")
    for block in blocks:
        try:
            label = block.find_element(By.CSS_SELECTOR, "div.label-QCJM7wcY").text.strip()
            # The value could be inside an <a> or <div> with class value-QCJM7wcY
            try:
                value = block.find_element(By.CSS_SELECTOR, "a .value-QCJM7wcY").text.strip()
            except NoSuchElementException:
                value = block.find_element(By.CSS_SELECTOR, "div.value-QCJM7wcY").text.strip()
            
            data[label] = value
        except NoSuchElementException:
            continue
    
    # Extract the about/description paragraph (last div with content-H16icEW0)
    try:
        description = driver.find_element(By.CSS_SELECTOR, "div.container-H16icEW0 div.content-H16icEW0 span").text.strip()
        data['Description'] = description
    except NoSuchElementException:
        data['Description'] = ""
    
    return data

# --- Main scraping logic ---
def main():
    # Chrome options to run headless (optional)
    options = Options()
    # options.add_argument("--headless")  # Uncomment if you want no browser UI
    
    driver = webdriver.Chrome(options=options)

    # Read links from CSV file (symbol_link.csv)
    links = []
    with open("symbol_link.csv", "r", newline="", encoding="utf-8") as file:
        reader = csv.reader(file)
        # Assuming the CSV has links in first column; skip header if any
        for row in reader:
            if row and "http" in row[0]:
                links.append(row[0])

    all_data = []
    
    for link in links:
        print(f"Processing: {link}")
        driver.get(link)
        time.sleep(3)  # Wait for page load; adjust as needed

        company_data = extract_company_info(driver)
        company_data['URL'] = link  # Save source link for reference
        all_data.append(company_data)

    driver.quit()

    # Save all extracted data to CSV
    df = pd.DataFrame(all_data)
    df.to_csv("extracted_company_info.csv", index=False, encoding="utf-8")
    print("Scraping done and data saved to extracted_company_info.csv")

if __name__ == "__main__":
    main()


Scraping done and data saved to extracted_company_info.csv


In [5]:
symbol_links_df = pd.read_csv("symbol_link.csv")
exchange_list = symbol_links_df["Exchange"].tolist()
link_list = symbol_links_df["Symbol"].tolist()

print(exchange_list)
print(link_list)

['NYSE', 'NYSE', 'NASDAQ', 'NYSE', 'NASDAQ', 'NYSE', 'NASDAQ', 'NYSE', 'NASDAQ', 'NASDAQ', 'NYSE', 'NASDAQ', 'NASDAQ', 'NYSE', 'NASDAQ', 'NYSE', 'NYSE', 'NASDAQ', 'NASDAQ', 'NYSE', 'NASDAQ', 'NASDAQ', 'NYSE', 'NASDAQ', 'NASDAQ', 'NASDAQ', 'NASDAQ', 'NYSE', 'NASDAQ', 'NASDAQ', 'NASDAQ', 'NASDAQ', 'NYSE', 'NYSE', 'NASDAQ', 'NYSE', 'AMEX', 'NASDAQ', 'NYSE', 'NASDAQ', 'NASDAQ', 'NASDAQ', 'NYSE', 'NASDAQ', 'NYSE', 'NASDAQ', 'NASDAQ', 'NASDAQ', 'NASDAQ', 'NASDAQ', 'NYSE', 'NASDAQ', 'NYSE', 'NASDAQ', 'NASDAQ', 'NASDAQ', 'NASDAQ', 'NASDAQ', 'NASDAQ', 'NASDAQ', 'NASDAQ', 'NASDAQ', 'AMEX', 'NYSE', 'NASDAQ', 'NASDAQ', 'NYSE', 'NASDAQ', 'NASDAQ', 'NASDAQ', 'NASDAQ', 'NASDAQ', 'NYSE', 'NASDAQ', 'NASDAQ', 'NYSE', 'NASDAQ', 'NASDAQ', 'NASDAQ', 'NASDAQ', 'NYSE', 'NASDAQ', 'NASDAQ', 'NASDAQ', 'NASDAQ', 'NASDAQ', 'NASDAQ', 'NYSE', 'NASDAQ', 'NASDAQ', 'NASDAQ', 'NASDAQ', 'NASDAQ', 'NASDAQ', 'NYSE', 'AMEX', 'NASDAQ', 'NYSE', 'NASDAQ', 'NYSE']
['A', 'AA', 'AACB', 'AACT', 'AAL', 'AAM', 'AAME', 'AAMI', 'AAOI

In [11]:
driver = webdriver.Chrome()


In [12]:


def sanitize_text(text):
    # Remove special unicode characters and extra whitespace
    cleaned = re.sub(r'[^\x00-\x7F]+', '', text)  # keep only ASCII
    return cleaned.strip()
company_info = []
for i in range(3):
    url = f"https://www.tradingview.com/symbols/{exchange_list[i]}-{link_list[i]}/"
    driver.get(url)
    print(url)
    time.sleep(3)

    records = driver.find_elements(By.CSS_SELECTOR, ".wrapper-QCJM7wcY")

    complete_single_record = []    
    for record in records:
        children = record.find_elements(By.CLASS_NAME, "apply-overflow-tooltip")

        for j in range(min(23, len(children))):
            raw_text = children[j].text
            cleaned_text = sanitize_text(raw_text)
            company_info.append(cleaned_text)
            complete_single_record.append(company_info)

    

print(complete_single_record)


https://www.tradingview.com/symbols/NYSE-A/
https://www.tradingview.com/symbols/NYSE-AA/
https://www.tradingview.com/symbols/NASDAQ-AACB/
[['In 4 days', 'Q2 2025', '1.26USD', '1.63BUSD', '30.94BUSD', '0.91%', '25.21', '4.37USD', '1.29BUSD', '6.51BUSD', '283.95M', '0.85', '17.9K', '200 1.10%', '363.69KUSD', '72.01KUSD', 'Health Technology', 'Health Technology', 'Medical Specialties', 'Medical Specialties', 'Padraig Mcdonnell', 'agilent.com', 'agilent.com', 'Santa Clara', '1999', 'BBG000C2V3D6', 'July 16', 'Q2 2025', '0.75USD', '2.96BUSD', '7.27BUSD', '1.41%', '8.28', '3.40USD', '60.00MUSD', '12.18BUSD', '256.77M', '1.86', '13.9K', '+300 +2.21%', '876.55KUSD', '4.32KUSD', 'Non-Energy Minerals', 'Non-Energy Minerals', 'Aluminum', 'Aluminum', 'William F. Oplinger', 'alcoa.com', 'alcoa.com', 'Pittsburgh', '1886', 'BBG00B3T3HD3', '295.18MUSD', '', '', '', '85.27KUSD', '0.00USD', '23.47M', '0.04', '1', '0.00USD', '85.27KUSD', 'Finance', 'Finance', 'Financial Conglomerates', 'Financial Conglom

In [None]:
len(complete_single_record[0])

21