In [1]:
import numpy as np
import pandas as pd

# Let pandas render up to 120 columns before eliding wide frames.
pd.options.display.max_columns = 120

# Date window handed to the market-data download as its start/end arguments.
START, END = "2022-01-01", "2025-12-18"

In [2]:
# --- Yahoo Finance via yfinance: US tickers (real market data) ---
# What each ticker tracks:
#   SPY : S&P 500 index
#   QQQ : Nasdaq-100 index
#   TLT : U.S. Treasury bonds with 20+ year maturity
#   GLD : Physical gold prices
#   EEM : MSCI Emerging Markets index
tickers = ["SPY", "QQQ", "TLT", "GLD", "EEM"]

# yfinance is treated as optional: on import failure, report it and continue with yf = None.
try:
    import yfinance as yf
except Exception as import_err:
    yf = None
    print("Could not import yfinance:", type(import_err).__name__, str(import_err))

# Default to an empty frame; only a successful download replaces it.
yh_df = pd.DataFrame()
if yf is not None:
    try:
        yh_df = yf.download(tickers, start=START, end=END, auto_adjust=True, progress=False)
    except Exception as download_err:
        print("yfinance download failed:", type(download_err).__name__, str(download_err))

# Convert to long format: date, ticker, close, volume
if not isinstance(yh_df, pd.DataFrame) or len(yh_df) == 0:
    # Nothing usable was downloaded: fall back to an empty frame with the expected columns.
    us_mkt = pd.DataFrame(columns=["date","ticker","close","volume"])
else:
    if isinstance(yh_df.columns, pd.MultiIndex):
        # Selecting "Close"/"Volume" on MultiIndex columns returns the sub-frame under that key.
        close, vol = yh_df["Close"].copy(), yh_df["Volume"].copy()
    else:
        # Flat columns: label the single close/volume column with the first ticker.
        close = yh_df[["Close"]].rename(columns={"Close": tickers[0]})
        vol = yh_df[["Volume"]].rename(columns={"Volume": tickers[0]})

    # Name the index so reset_index() exposes it as a "date" column.
    close.index.name = "date"
    vol.index.name = "date"

    # One row per (date, ticker) for each field, then join the two fields on that key.
    us_close_long = pd.melt(close.reset_index(), id_vars="date", var_name="ticker", value_name="close")
    us_vol_long = pd.melt(vol.reset_index(), id_vars="date", var_name="ticker", value_name="volume")
    us_mkt = pd.merge(us_close_long, us_vol_long, on=["date","ticker"], how="inner").dropna(subset=["close"])

us_mkt.head(), us_mkt.shape

(        date ticker      close    volume
 0 2022-01-03    EEM  44.624962  27572700
 1 2022-01-04    EEM  44.470776  24579500
 2 2022-01-05    EEM  43.745171  46425100
 3 2022-01-06    EEM  43.944710  34288700
 4 2022-01-07    EEM  44.343788  32640900,
 (4970, 4))

### 3.2.2 From NumPy array to Series 

Using `us_mkt`:

1. Filter to `ticker == "SPY"`.
2. Take `close` as a NumPy array.
3. Create a Series indexed by `date` named `SPY_close_series`.
4. Compute the mean/min/max with Series methods.

### 3.2.3 From Dictionary to Series 

Using `us_mkt`:

1. Compute the **last available close** for each ticker in `tickers`.
2. Store it in a dict `{ticker: last_close}`.
3. Convert to a Series and sort descending.

In [13]:
#1
# Sort by ticker, then date, so the final row within each ticker group is its latest date.
df_sorted = us_mkt.sort_values(by=["ticker", "date"])
# Take the last close in each ticker group.
last_close_by_ticker = df_sorted.groupby("ticker")["close"].agg("last")
last_close_by_ticker

ticker
EEM     52.599998
GLD    399.290009
QQQ    599.637390
SPY    669.421936
TLT     87.459633
Name: close, dtype: float64

In [14]:
#2
# Series.to_dict() maps each index label (the ticker) to its value (the last close).
last_close_by_ticker_dict = last_close_by_ticker.to_dict()
last_close_by_ticker_dict

{'EEM': 52.599998474121094,
 'GLD': 399.2900085449219,
 'QQQ': 599.6373901367188,
 'SPY': 669.4219360351562,
 'TLT': 87.45963287353516}

In [18]:
#3
# Rebuild a Series from the {ticker: last_close} dict and rank it from highest to lowest close.
last_close_by_ticker_series = (
    pd.Series(last_close_by_ticker_dict)
    .sort_values(ascending=False)
)
last_close_by_ticker_series

SPY    669.421936
QQQ    599.637390
GLD    399.290009
TLT     87.459633
EEM     52.599998
dtype: float64

### 3.2.4 Series vs NumPy 

Goal: show why pandas alignment matters.

1. Create two Series indexed by date:
   - df mid-rate from `df`
   - SPY close from `us_mkt`
2. Combine them into a DataFrame (pandas aligns on dates).
3. Separately, build two NumPy arrays by truncating to the same length.
4. In markdown: explain why alignment is safer.


### 3.3.6 Dealing with Nulls 
Using `us_mkt`:

1. Copy `us_mkt` to `us_mkt_nan`.
2. Set 1% of `close` to NaN (fixed random seed).
3. Create:
   - `us_drop`: drop NaNs
   - `us_fill`: fill NaNs with ticker-specific median close
4. Compare shapes.

### 3.3.7 Duplicates 

1. Create `dup_df` by stacking the last 5 rows of `us_mkt` twice.
2. Detect duplicates using `.duplicated()`.
3. Remove them using `.drop_duplicates()`.

In [43]:
#1
# Stack the final five rows on top of themselves so every row appears exactly twice.
last5 = us_mkt.tail(5)
dup_df = pd.concat([last5] * 2, ignore_index=True)
dup_df

Unnamed: 0,date,ticker,close,volume
0,2025-12-11,TLT,87.848114,26778700
1,2025-12-12,TLT,87.001404,47030100
2,2025-12-15,TLT,87.06118,28611800
3,2025-12-16,TLT,87.539314,41018700
4,2025-12-17,TLT,87.459633,24668300
5,2025-12-11,TLT,87.848114,26778700
6,2025-12-12,TLT,87.001404,47030100
7,2025-12-15,TLT,87.06118,28611800
8,2025-12-16,TLT,87.539314,41018700
9,2025-12-17,TLT,87.459633,24668300


In [45]:
#2
# Boolean mask over rows: True marks a repeat of an earlier identical row (first occurrence stays False).
detected_duplicates = dup_df.duplicated(keep="first")
detected_duplicates

0    False
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
9     True
dtype: bool

In [48]:
#3
# Keep the first occurrence of each distinct row and discard the later repeats.
drop_dup_df = dup_df.drop_duplicates(keep="first")
drop_dup_df

Unnamed: 0,date,ticker,close,volume
0,2025-12-11,TLT,87.848114,26778700
1,2025-12-12,TLT,87.001404,47030100
2,2025-12-15,TLT,87.06118,28611800
3,2025-12-16,TLT,87.539314,41018700
4,2025-12-17,TLT,87.459633,24668300


### 3.3.8 Groupby 


Using `us_mkt`:

1. Group by `ticker` and compute:
   - mean close
   - median close
   - max volume
2. Rename columns clearly.
3. Sort by mean close descending.

In [52]:
#1
# Per-ticker summary: mean and median of close, and the maximum volume.
grp = us_mkt.groupby("ticker").agg(
    mean_close=pd.NamedAgg(column="close", aggfunc="mean"),
    median_close=pd.NamedAgg(column="close", aggfunc="median"),
    max_volume=pd.NamedAgg(column="volume", aggfunc="max"),
)
grp

Unnamed: 0_level_0,mean_close,median_close,max_volume
ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
EEM,40.46235,39.107407,134225700
GLD,220.130422,187.864998,62025000
QQQ,411.276967,400.214508,198685800
SPY,485.608585,460.740005,256611400
TLT,91.395622,88.549698,131353500


In [53]:
#2
# Swap the snake_case aggregate names for human-readable column headers.
grp_renamed = grp.rename(
    columns=dict(
        mean_close="mean close",
        median_close="median close",
        max_volume="highest volume",
    )
)
grp_renamed

Unnamed: 0_level_0,mean close,median close,highest volume
ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
EEM,40.46235,39.107407,134225700
GLD,220.130422,187.864998,62025000
QQQ,411.276967,400.214508,198685800
SPY,485.608585,460.740005,256611400
TLT,91.395622,88.549698,131353500


In [56]:
#3
# Order tickers from the highest mean close to the lowest.
grp_renamed_sorted = grp_renamed.sort_values(by="mean close", ascending=False)
grp_renamed_sorted

Unnamed: 0_level_0,mean close,median close,highest volume
ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SPY,485.608585,460.740005,256611400
QQQ,411.276967,400.214508,198685800
GLD,220.130422,187.864998,62025000
TLT,91.395622,88.549698,131353500
EEM,40.46235,39.107407,134225700


### 3.3.9 Reshape 

1. Create a 1-row wide DataFrame with last closes per ticker.
2. Convert it to long format with `melt()` into columns: `ticker`, `last_close`.
3. Pivot `us_mkt` into a wide table: index=`date`, columns=`ticker`, values=`close` (keep first 50 dates).

In [65]:
#1
# Re-display the per-ticker last closes (a Series indexed by ticker) before reshaping them.
last_close_by_ticker

ticker
EEM     52.599998
GLD    399.290009
QQQ    599.637390
SPY    669.421936
TLT     87.459633
Name: close, dtype: float64

In [66]:
#1
# Turn the Series into a one-column frame, then transpose it: one row, one column per ticker.
yh_dfFrame = last_close_by_ticker.to_frame().transpose()
yh_dfFrame

ticker,EEM,GLD,QQQ,SPY,TLT
close,52.599998,399.290009,599.63739,669.421936,87.459633


In [67]:
#2
# Unpivot the one-row wide frame back to long form: one (ticker, last_close) row per column.
yh_long = pd.melt(yh_dfFrame, var_name="ticker", value_name="last_close")
yh_long

Unnamed: 0,ticker,last_close
0,EEM,52.599998
1,GLD,399.290009
2,QQQ,599.63739
3,SPY,669.421936
4,TLT,87.459633


In [71]:
#3
# Restrict to the 50 earliest distinct dates, then spread tickers across columns (values = close).
first_50_dates = np.sort(us_mkt["date"].unique())[:50]
us_mkt_first_50 = us_mkt.loc[us_mkt["date"].isin(first_50_dates)]
us_mkt_wide = pd.pivot(us_mkt_first_50, index="date", columns="ticker", values="close")
us_mkt_wide

ticker,EEM,GLD,QQQ,SPY,TLT
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2022-01-03,44.624962,168.330002,391.679443,451.875183,125.29538
2022-01-04,44.470776,169.570007,386.599213,451.723755,124.774368
2022-01-05,43.745171,169.059998,374.722412,443.049652,124.097153
2022-01-06,43.94471,166.990005,374.459106,442.633545,124.418449
2022-01-07,44.343788,167.75,370.402679,440.883575,123.524033
2022-01-10,44.343788,168.259995,370.646515,440.334991,123.828003
2022-01-11,45.368721,170.289993,376.214355,444.345612,124.652832
2022-01-12,46.121536,170.740005,377.706238,445.546967,124.17524
2022-01-13,45.468487,170.160004,368.257568,439.407898,125.278008
2022-01-14,45.450348,169.669998,370.549011,439.587646,123.385086
