In [25]:
from imports import pd, time
import numpy as np
import yfinance as source

In [35]:
# URLs for the NASDAQ Trader Symbol Directory files.
# The list is updated every day, so results can differ between runs.
nasdaq_url = "ftp://ftp.nasdaqtrader.com/SymbolDirectory/nasdaqlisted.txt"
# other_url = "ftp://ftp.nasdaqtrader.com/SymbolDirectory/otherlisted.txt"

# Load NASDAQ-listed stocks (pipe-delimited text).
# pandas' default missing-value markers include strings such as "NA", "NULL"
# and "NaN", so a ticker spelled that way would be read as a missing value.
# Treat only empty fields as missing so that every symbol survives as text.
nasdaq_data = pd.read_csv(
    nasdaq_url,
    sep="|",
    keep_default_na=False,
    na_values=[""],
)
# Remove the last row (footer info)
nasdaq_data = nasdaq_data.iloc[:-1]

# # Load other-listed stocks (including NYSE, AMEX, etc.)
# other_data = pd.read_csv(other_url, sep="|")
# # Filter for NYSE stocks
# nyse_stocks = other_data[other_data["Exchange"] == "N"]

In [None]:
# Filter for companies that are not bankrupt, delisted, etc.: keep only the
# rows whose "Financial Status" is "N".
# Boolean indexing preserves the column dtypes and does not discard a row just
# because an unrelated column is empty (which `.where(...).dropna()` did).
normal_cmps = (
    nasdaq_data.loc[nasdaq_data["Financial Status"] == "N"]
    .dropna(subset=["Symbol"])  # a row without a ticker cannot be looked up later
    .reset_index(drop=True)     # fresh 0..n-1 index after filtering
)
normal_cmps

In [None]:
# Keep only the ticker and company name and tag every row with its exchange.
# `.assign` returns a new, independent frame, so no value is written into a
# slice of `normal_cmps` (the previous `.loc` assignment on a column slice
# triggers pandas' SettingWithCopyWarning).
nasdaq_tickers = normal_cmps[["Symbol", "Security Name"]].assign(Exchange="NASDAQ")
nasdaq_tickers

In [41]:
# Keys looked up in each ticker's info mapping. The order of this list is
# significant: it is reused as the column order when the rows are turned into
# a DataFrame.
data_points = [
    "symbol", "shortName", "currency",
    "previousClose", "open", "dayLow", "dayHigh",
    "regularMarketPreviousClose", "regularMarketOpen",
    "regularMarketDayLow", "regularMarketDayHigh",
    "dividendRate", "dividendYield", "fiveYearAvgDividendYield",
    "beta", "trailingPE", "forwardPE", "volume", "marketCap",
    "fiftyTwoWeekLow", "fiftyTwoWeekHigh",
    "priceToSalesTrailing12Months", "fiftyDayAverage", "twoHundredDayAverage",
    "profitMargins", "bookValue", "priceToBook", "earningsQuarterlyGrowth",
    "netIncomeToCommon", "trailingEps", "forwardEps",
    "enterpriseToRevenue", "enterpriseToEbitda", "52WeekChange",
    "ebitda", "totalDebt", "quickRatio", "currentRatio",
    "totalRevenue", "debtToEquity", "revenuePerShare", "returnOnAssets",
    "returnOnEquity", "grossProfits", "freeCashflow", "operatingCashflow",
    "earningsGrowth", "revenueGrowth", "grossMargins",
    "ebitdaMargins", "operatingMargins", "trailingPegRatio",
]

# Module-level accumulator: one dict per successfully fetched ticker.
data = []
def download_data(stocks):
    """Fetch the `data_points` fields for each symbol and append them to `data`.

    A single `source.Tickers` object is created from the space-joined symbols.
    For every symbol its `.info` mapping is read and reduced to the keys in
    `data_points` (a key that is absent is stored as None); the resulting dict
    is appended to the module-level `data` list.

    Failures are handled per ticker: any exception is reported with `print`
    and the loop moves on to the next symbol.

    Args:
        stocks: list of ticker symbol strings.

    Returns:
        None. Results are accumulated in `data`.
    """
    bundle = source.Tickers(" ".join(stocks))
    for ticker in stocks:
        try:
            info = bundle.tickers[ticker].info
            data.append({field: info.get(field) for field in data_points})
        except Exception as e:
            print(f"Error fetching data for {ticker}: {e}")


In [None]:
batch_size = 25
symbols = list(nasdaq_tickers["Symbol"])
m = len(symbols)

# Fetch data for every stock in the NASDAQ list, `batch_size` symbols at a
# time, pausing 5 seconds after each batch. Run this cell when fetching the
# data for the first time; each run appends to `data`.
for idx in range(0, m, batch_size):
    batch = symbols[idx:idx + batch_size]
    download_data(batch)
    time.sleep(5)
print(f"Done fetching data for {len(data)} stocks.")

In [None]:
# Keep only the entries of `data` that actually hold a row.
stocksData = [row for row in data if row is not None]
print(f"{len(stocksData)} stocks data fetched!")


***Now, we have to standardize the data for some of the columns and remove duplicates.***

In [None]:
# Assemble one row per ticker; the column order follows `data_points`.
stocksData_df = pd.DataFrame(stocksData, columns=data_points)

# Discard rows with no symbol or no currency *before* missing values are
# replaced by 0.0. The earlier currency filter ran after the replacement and
# compared against the string '0.0', which never equals the float 0.0 that had
# been filled in, so it removed nothing.
stocksData_df = stocksData_df.dropna(subset=["symbol", "currency"])
stocksData_df = stocksData_df.replace([np.nan], 0.0)

# 'marketCap' -> 'Marketcap', '52WeekChange' -> '52weekchange', ...
stocksData_df.columns = [name.capitalize() for name in stocksData_df.columns]

# Monetary totals are expressed in billions (the former divisor 10e8 is 1e9).
billion_cols = ["Marketcap", "Totalrevenue", "Grossprofits", "Freecashflow", "Operatingcashflow"]
for billion_col in billion_cols:
    stocksData_df[billion_col] = stocksData_df[billion_col] / 1e9

# Ratios reported as fractions are converted to percentages.
percent_cols = ["Returnonequity", "Earningsgrowth", "Revenuegrowth", "Profitmargins", "Ebitdamargins"]
for percent_col in percent_cols:
    stocksData_df[percent_col] = stocksData_df[percent_col] * 100

stocksData_df = stocksData_df.round(2)
# Drop duplicates based on the Symbol column, keeping the first occurrence
stocksData_df = stocksData_df.drop_duplicates(subset="Symbol", keep="first")
stocksData_df = stocksData_df.reset_index(drop=True)

stocksData_df.to_csv("stocksData.csv", index=False)
print("Saved data to csv file.")

In [None]:
# Reload the exported CSV and preview its first ten rows.
df = pd.read_csv("stocksData.csv")
df.head(10)

In [None]:
# For each numeric column, print the row label and ticker of its largest and
# of its smallest value. Text columns (symbol, name, currency) are skipped:
# idxmax/idxmin raise a TypeError on object-dtype columns, which stopped the
# loop at the very first column.
for col in df.select_dtypes(include="number").columns:
    max_idx = df[col].idxmax()
    min_idx = df[col].idxmin()
    print("Column:", col, max_idx, df.loc[max_idx, "Symbol"], min_idx, df.loc[min_idx, "Symbol"])

In [None]:
# Write the frame back to the same CSV file, without the index column.
df.to_csv(path_or_buf="stocksData.csv", index=False)
print("Saved data to csv file.")