In [None]:
from pathlib import Path
import pandas as pd

In [None]:
# Build a CSV of company listings from the AMEX / NYSE / NASDAQ history dataset.
# === 1) locate dataset root ===

base_root = Path.home() / "csc1171" / "data" / "raw"
dataset_dir_name = "amex_nyse_nasdaq_stock_histories"

# The dataset folder is matched case-insensitively; stop at the first hit
# instead of scanning the remaining entries.
dataset_dir = None
if base_root.exists():
    dataset_dir = next(
        (
            entry
            for entry in base_root.iterdir()
            if entry.is_dir() and entry.name.lower() == dataset_dir_name
        ),
        None,
    )
if dataset_dir is None:
    # fallback (may not exist on some systems; edit as needed)
    dataset_dir = base_root / dataset_dir_name

symbols_file = dataset_dir / "all_symbols.txt"
output_file = dataset_dir / "clean_company_listings.csv"

# === 2) load tickers ===
# Lines are stripped *before* the "#" test so that indented comment lines are
# skipped too; blank lines are ignored and tickers are normalised to lowercase.
with open(symbols_file, "r", encoding="utf-8") as f:
    symbols = [
        line.lower()
        for line in map(str.strip, f)
        if line and not line.startswith("#")
    ]

# === 3) scan once: index all csv files by symbol (case-insensitive) ===
# if multiple files exist for a symbol, prefer paths that include amex/nasdaq/nyse
def score_path_for_exchange(p: Path):
    """Return a sort key that ranks CSV paths; smaller keys sort first.

    Each of the names "amex", "nasdaq" and "nyse" that appears as a whole path
    component (compared case-insensitively) lowers the first key element by
    10. The second element is the number of path components, so among equally
    ranked paths the shallower one comes first.
    """
    components = {part.lower() for part in p.parts}
    hits = sum(1 for exchange in ("amex", "nasdaq", "nyse") if exchange in components)
    return (-10 * hits, len(p.parts))

# Map each lowercase file stem to the best-ranked CSV that carries that stem.
symbol_to_path = {}
if dataset_dir.exists():
    # sorted() is stable, so files with equal scores keep their discovery order.
    csv_files = sorted(dataset_dir.rglob("*.csv"), key=score_path_for_exchange)
    for csv_path in csv_files:
        # setdefault keeps the first (i.e. best-ranked) path seen for a stem.
        symbol_to_path.setdefault(csv_path.stem.lower(), csv_path)

# === 4) build a {symbol -> company name} map from likely metadata files ===
# Glob patterns for candidate metadata files, in priority order.
meta_patterns = [
    f"*{stem}*.csv"
    for stem in (
        "symbols_valid_meta",
        "nasdaq_screener",
        "companylist",
        "symbols",
        "companies",
        "metadata",
    )
]

# Lowercase (symbol column, name column) header pairs, in priority order:
# every "symbol" pairing is tried before any "ticker" pairing.
name_col_candidates = [
    (symbol_header, name_header)
    for symbol_header in ("symbol", "ticker")
    for name_header in ("name", "security name", "company name")
]

# Filled later: lowercase symbol -> company name.
symbol_to_name = dict()

def try_read_csv_any_sep(path: Path):
    """Read a delimited text file, trying ',', '|', tab and ';' as separators.

    The default comma parse is attempted first. A wrong delimiter usually does
    not make ``pd.read_csv`` raise; it yields a single column whose header
    still contains the real separator. So when the comma parse fails, or when
    it produces exactly one column whose header contains an alternative
    separator, the file is re-read with that separator.

    Args:
        path: File to read.

    Returns:
        The parsed DataFrame, or None when every attempt raised.
    """
    try:
        frame = pd.read_csv(path)
    except Exception:  # best-effort reader: any failure falls through to the alternatives
        frame = None

    # A multi-column comma parse is taken as-is.
    if frame is not None and frame.shape[1] != 1:
        return frame

    for sep in ("|", "\t", ";"):
        # With a usable single-column parse in hand, only retry separators
        # that actually occur in its header.
        if frame is not None and sep not in str(frame.columns[0]):
            continue
        try:
            alternative = pd.read_csv(path, sep=sep)
        except Exception:
            continue
        # If the comma parse failed outright, accept the first separator that
        # parses at all; otherwise require a genuine improvement.
        if frame is None or alternative.shape[1] > 1:
            return alternative

    return frame

# Fill symbol_to_name from every metadata file found under dataset_dir.
# Patterns are visited in priority order and the first name recorded for a
# symbol is kept.
seen_meta_paths = set()
for pattern in meta_patterns:
    for meta_path in dataset_dir.rglob(pattern):
        # Several patterns can match the same file (e.g. "*symbols*.csv" also
        # matches "symbols_valid_meta.csv"); parse each file only once.
        if meta_path in seen_meta_paths:
            continue
        seen_meta_paths.add(meta_path)

        dfm = try_read_csv_any_sep(meta_path)
        if dfm is None or dfm.empty:
            continue

        # Case-insensitive header lookup. str() guards against non-string
        # column labels, strip() against padded headers.
        cols_lc = {str(c).strip().lower(): c for c in dfm.columns}
        for sym_col, name_col in name_col_candidates:
            if sym_col in cols_lc and name_col in cols_lc:
                break
        else:
            continue  # no recognised (symbol, name) header pair in this file

        sc, nc = cols_lc[sym_col], cols_lc[name_col]
        sub = dfm[[sc, nc]].dropna()
        for raw_symbol, raw_name in zip(sub[sc], sub[nc]):
            s = str(raw_symbol).strip().lower()
            n = str(raw_name).strip()
            if s and n:
                # First writer wins: never overwrite an existing name.
                symbol_to_name.setdefault(s, n)

# === 5) helpers for exchange, earliest date, and name-in-file ===
def infer_exchange_from_path(p: Path) -> str:
    """Return the exchange named by a component of *p*, or "" if there is none.

    Path components are compared case-insensitively against "amex", "nasdaq"
    and "nyse", in that order, so when several of them occur in the path the
    earliest name in that list wins. The result is upper-cased for output.

    The previous extra check of ``p.parent.name`` was removed: the parent's
    name is itself one of ``p.parts``, so that branch could never produce a
    match the loop had not already returned.
    """
    parts = {part.lower() for part in p.parts}
    for exch in ("amex", "nasdaq", "nyse"):
        if exch in parts:
            return exch.upper()
    return ""

def earliest_date_from_csv(p: Path) -> str:
    """Return the earliest parseable date in a CSV as an ISO string (YYYY-MM-DD).

    The first column whose name is "date" or "timestamp" (case-insensitive)
    is used. The file is parsed once; earlier code read it twice.

    Args:
        p: Path of the CSV file to inspect.

    Returns:
        The minimum date as ``YYYY-MM-DD``, or "" when the file cannot be
        read, is empty, has no date-like column, or holds no parseable dates.
    """
    try:
        df = try_read_csv_any_sep(p)
        if df is None or df.empty:
            return ""
        # str() keeps a non-string column label from aborting the lookup.
        dcol = next(
            (c for c in df.columns if str(c).lower() in ("date", "timestamp")),
            None,
        )
        if dcol is None:
            return ""
        # Unparseable cells become NaT and are ignored by min().
        dt = pd.to_datetime(df[dcol], errors="coerce")
        return dt.min().date().isoformat() if dt.notna().any() else ""
    except Exception:
        # Deliberately best-effort: a bad file must not abort the whole export.
        return ""

def try_name_from_file(p: Path) -> str:
    """Return a company name stored inside the CSV at *p*, or "" if unavailable.

    Only the first column named "name", "company", "security name" or
    "company name" (case-insensitive) is consulted. Its first non-null value
    is returned stripped, or "" when that column has no values. Any error
    while reading or inspecting the file also yields "".
    """
    wanted = ("name", "company", "security name", "company name")
    try:
        frame = try_read_csv_any_sep(p)
        if frame is not None and not frame.empty:
            for col in frame.columns:
                if col.lower() not in wanted:
                    continue
                # Decide on the first matching column only; later ones are not checked.
                values = frame[col].dropna()
                if len(values):
                    return str(values.iloc[0]).strip()
                return ""
    except Exception:
        pass
    return ""

# === 6) build the output table ===
# One row per requested symbol. Fields that cannot be determined stay empty.
output_columns = ["company name", "symbol (acronym)", "date of listing", "stock offering"]

rows = []
for sym in symbols:
    company = symbol_to_name.get(sym, "")
    listed_on = ""
    exchange = ""

    p = symbol_to_path.get(sym)
    if p is not None:
        exchange = infer_exchange_from_path(p)
        listed_on = earliest_date_from_csv(p)
        if not company:
            # Fall back to a name column inside the price file itself.
            company = try_name_from_file(p) or company

    rows.append(
        {
            "company name": company,
            "symbol (acronym)": sym.upper(),  # symbols are written in uppercase
            "date of listing": listed_on,
            "stock offering": exchange,
        }
    )

df = pd.DataFrame(rows, columns=output_columns).sort_values(
    by=["symbol (acronym)", "date of listing"],
    ignore_index=True,
)
df.to_csv(output_file, index=False)

print(f"wrote: {output_file}")
print(df.head(20))