In [3]:
import pandas as pd
import requests
import json
from bs4 import BeautifulSoup


In [6]:
!pip install pyarrow

Collecting pyarrow
  Downloading pyarrow-19.0.0-cp312-cp312-macosx_12_0_arm64.whl.metadata (3.3 kB)
Downloading pyarrow-19.0.0-cp312-cp312-macosx_12_0_arm64.whl (30.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: pyarrow
Successfully installed pyarrow-19.0.0


In [35]:
def download_scanner_columns():
    """
    Scrape the TradingView-Screener stock field table and cache it as Parquet.

    Downloads the field reference page, turns every three-cell table row into
    one record per column name, writes the records to
    '../data/scanner/cols.parquet' (zstd compressed) and returns them.

    Returns:
        list[dict]: One dict per column with the keys 'Column Name',
        'Display Name' and 'Type'.

    Raises:
        requests.HTTPError: If the page answers with an error status.
        requests.Timeout: If the server does not answer within the timeout.
        ValueError: If the page contains no <table> element.
    """
    # Local import: only needed to make sure the output directory exists.
    from pathlib import Path

    # URL of the webpage
    url = 'https://shner-elmo.github.io/TradingView-Screener/fields/stocks.html'
    output_path = Path('../data/scanner/cols.parquet')

    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    table = soup.find('table')
    if table is None:
        raise ValueError(f"No <table> element found at {url}")
    # The markup may omit <tbody>; in that case iterate the rows of the table itself.
    body = table.tbody if table.tbody is not None else table

    # Initialize a list to store extracted data
    columns = []
    for row in body.find_all('tr'):
        cells = row.find_all('td')
        if len(cells) != 3:
            print(f"Unexpected row format: {row}")
            continue

        # Fields shared by every column name listed in this row
        display_name = cells[1].get_text(strip=True)
        column_type = cells[2].get_text(strip=True)

        # A <details> element in the first cell groups several column names,
        # one per <li>; otherwise the cell text is the single column name.
        details = cells[0].find('details')
        if details:
            column_names = [li.get_text(strip=True) for li in details.find_all('li')]
        else:
            column_names = [cells[0].get_text(strip=True)]

        # Store extracted data
        for column_name in column_names:
            columns.append({
                'Column Name': column_name,
                'Display Name': display_name,
                'Type': column_type
            })

    # Create the target directory up front so the write cannot fail on a fresh checkout.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    pd.DataFrame(columns).to_parquet(output_path, compression='zstd')
    print("Saved scanner column info")
    return columns


In [36]:
def fetch_scanner_data(
    market: str,
    cols: list[str],
    exchanges: list[str] | tuple[str, ...] = ("NSE",),
    timeout: float = 30,
):
    """
    Request one set of columns for every symbol from the TradingView scanner.

    Sends a single POST to ``https://scanner.tradingview.com/{market}/scan``
    asking for ``cols``, restricted to the given exchanges and sorted by
    'market_cap_basic' in descending order.

    Args:
        market (str): Market segment placed in the URL path (e.g. 'india').
        cols (list[str]): Scanner column names to request.
        exchanges (list[str] | tuple[str, ...]): Exchanges to keep. Defaults to
            ("NSE",), the value that used to be hard-coded.
        timeout (float): Seconds to wait for the server before giving up.

    Returns:
        The value stored under the 'data' key of the JSON response.

    Raises:
        requests.HTTPError: If the scanner answers with an error status.
        requests.Timeout: If no answer arrives within ``timeout`` seconds.
        KeyError: If the response JSON has no 'data' key.
    """
    url = f"https://scanner.tradingview.com/{market}/scan"
    payload = {
        "columns": cols,
        # list(...) so a tuple default serialises exactly like the former literal list
        "filter": [{"left": "exchange", "operation": "in_range", "right": list(exchanges)}],
        "ignore_unknown_fields": False,
        "sort": {"sortBy": "market_cap_basic", "sortOrder": "desc"},
    }
    # The JSON body is deliberately sent with a text/plain content type, as before.
    headers = {'Content-Type': 'text/plain'}
    # A timeout prevents one stalled request from blocking the whole download loop.
    r = requests.post(url, headers=headers, data=json.dumps(payload), timeout=timeout)
    r.raise_for_status()
    return r.json()['data']

In [37]:
def download_scanner_data(market: str, columns: list[list[str]]):
    """
    Download scanner data chunk by chunk and join the chunks on 'ticker'.

    Each inner list of ``columns`` is fetched with one call to
    ``fetch_scanner_data``. Every response entry is expected to carry the
    symbol under 's' and the requested values, in column order, under 'd'.

    Args:
        market (str): Market segment passed through to ``fetch_scanner_data``.
        columns (list[list[str]]): Column names split into request-sized chunks.

    Returns:
        pd.DataFrame | None: One row per ticker with a 'ticker' column followed
        by all requested columns, or None when ``columns`` is empty. Chunks are
        inner-joined, so a ticker missing from any single response is dropped.
    """
    data: pd.DataFrame | None = None
    loaded = 0  # running total of columns fetched so far
    for cols in columns:
        symbols = fetch_scanner_data(market, cols)
        # Report the real column count instead of assuming 50 columns per chunk,
        # which overstated progress whenever a chunk was shorter.
        loaded += len(cols)
        print(f"Loaded {loaded}")
        rows = [[s['s']] + s['d'] for s in symbols]
        chunk_df = pd.DataFrame(rows, columns=['ticker'] + cols)
        if data is None:
            data = chunk_df
        else:
            data = pd.merge(data, chunk_df, on="ticker", how="inner")
    return data

In [38]:
def optimize_dataframe(df: pd.DataFrame, metadata: list[dict]) -> pd.DataFrame:
    """
    Optimize the dataframe column types based on provided metadata for efficient storage,
    ensuring compatibility with Parquet format and DuckDB.

    Args:
        df (pd.DataFrame): The input dataframe.
        metadata (list[dict]): List of column metadata with 'Column Name' and 'Type'.

    Returns:
        pd.DataFrame: Optimized dataframe.
    """
    type_mapping = {
        "text": "category",  # Use category for text as DuckDB treats it as enum
        "bool": "boolean",  # Optimized for boolean storage
        "fundamental_price": "float64",  # Higher precision for financial data
        "price": "float32",  # Optimized float for prices
        "number": "float64",  # Can store both int and float values efficiently
        "percent": "float32",  # Percentage values stored as floats
        "num_slice": "object",  # List of numbers, stored as object
        "time": "datetime64[ns]",  # Timestamp format
        "interface": "object",  # Keep JSON structures as object for DuckDB compatibility
        "time-yyyymmdd": "datetime64[ns]",  # Date format optimized
        "set": "object",  # Keep sets as object
        "map": "object"  # Keep dictionary key-value pairs as object
    }

    for column in metadata:
        col_name = column["Column Name"]
        col_type = column["Type"].lower()

        if col_name in df.columns:
            mapped_type = type_mapping.get(col_type, "object")

            if col_type == "text":
                df[col_name] = df[col_name].astype("category")  # Always use category for text
            elif col_type == "bool":
                df[col_name] = df[col_name].astype(mapped_type)
            elif col_type in ["fundamental_price", "number", "price", "percent"]:
                df[col_name] = pd.to_numeric(df[col_name], errors='coerce', downcast='float')
            elif col_type == "num_slice":
                df[col_name] = df[col_name].apply(lambda x: x if isinstance(x, list) else None)  # Use None instead of np.nan
            elif col_type in ["time", "time-yyyymmdd"]:
                date_format = '%Y%m%d' if col_type == "time-yyyymmdd" else None
                df[col_name] = pd.to_datetime(df[col_name], format=date_format, errors='coerce')
            elif col_type in ["interface", "map", "set"]:
                pass  # Keep as object, no serialization
            elif col_type == "integer":
                df[col_name] = df[col_name].astype(pd.Int64Dtype())  # Nullable integer type
            else:
                df[col_name] = df[col_name].astype(mapped_type)

    return df


In [39]:
# Scrape the scanner field metadata; each entry is a dict with the keys
# 'Column Name', 'Display Name' and 'Type'.
columns = download_scanner_columns()

Saved scanner column info


In [24]:
# Number of columns requested from the scanner per call.
chunk_size = 50
# Column names that are left out of the download.
exclude_columns = ['index', 'index_id']

# Keep every column name from the metadata that is not excluded.
filter_cols: list[str] = [
    name
    for name in (entry['Column Name'] for entry in columns)
    if name not in exclude_columns
]
print(f"Column filtered {len(filter_cols)}")

# Slice the filtered names into consecutive groups of at most chunk_size.
columns_chunks = [
    filter_cols[start:start + chunk_size]
    for start in range(0, len(filter_cols), chunk_size)
]
print(f"Column chunk {len(columns_chunks)}")

Column filtered 3342
Column chunk 67


In [42]:
# Fetch every column chunk for the 'india' market and join the results on ticker.
df: pd.DataFrame = download_scanner_data(market='india', columns=columns_chunks)

Loaded 50
Loaded 100
Loaded 150
Loaded 200
Loaded 250
Loaded 300
Loaded 350
Loaded 400
Loaded 450
Loaded 500
Loaded 550
Loaded 600
Loaded 650
Loaded 700
Loaded 750
Loaded 800
Loaded 850
Loaded 900
Loaded 950
Loaded 1000
Loaded 1050
Loaded 1100
Loaded 1150
Loaded 1200
Loaded 1250
Loaded 1300
Loaded 1350
Loaded 1400
Loaded 1450
Loaded 1500
Loaded 1550
Loaded 1600
Loaded 1650
Loaded 1700
Loaded 1750
Loaded 1800
Loaded 1850
Loaded 1900
Loaded 1950
Loaded 2000
Loaded 2050
Loaded 2100
Loaded 2150
Loaded 2200
Loaded 2250
Loaded 2300
Loaded 2350
Loaded 2400
Loaded 2450
Loaded 2500
Loaded 2550
Loaded 2600
Loaded 2650
Loaded 2700
Loaded 2750
Loaded 2800
Loaded 2850
Loaded 2900
Loaded 2950
Loaded 3000
Loaded 3050
Loaded 3100
Loaded 3150
Loaded 3200
Loaded 3250
Loaded 3300
Loaded 3350


In [43]:
# optimize_dataframe converts the columns of `df` in place and returns that same
# frame, so rebinding `df` here does not change which object later cells see.
df = optimize_dataframe(df, metadata=columns)
df.to_parquet('../data/scanner/symbols.parquet', compression='zstd')

In [44]:
# Summarise row count, per-dtype column counts and memory usage of the frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2773 entries, 0 to 2772
Columns: 3343 entries, ticker to yield_upcoming
dtypes: boolean(4), category(62), datetime64[ns](51), float32(1705), float64(1476), object(45)
memory usage: 51.8+ MB
