In [1]:
# Notebook-wide dependencies: pandas and numpy for the tabular data and the
# log-return calculations, yfinance for the market-data downloads.
# matplotlib.pyplot is imported but not used in the cells shown here.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import yfinance as yf

# Printed confirmation that every import above succeeded.
print("Libraries imported successfully")


Libraries imported successfully


In [2]:
# Project configuration: the study period and the folder where the downloaded CSV data is stored.
from pathlib import Path
# START/END: bounds of the project period, passed as start/end to the downloads below.
# NOTE(review): yfinance documents `end` as exclusive, so 2020-12-31 itself is
# presumably not downloaded — confirm this is intended.
START = "2015-01-01"
END   = "2020-12-31"
# Folder for the raw CSV exports, given relative to the current working directory.
RAW_DIR = Path("../datasets/raw_data")
# Create the folder (and any missing parents); no error if it already exists.
RAW_DIR.mkdir(parents=True, exist_ok=True)

print(f" file raw data : {RAW_DIR.resolve()}")


 file raw data : /files/Final Project DSAP/datasets/raw_data


In [5]:
#Download S&P500 and VIX daily
import numpy as np
import pandas as pd
import yfinance as yf

def download_series(ticker: str, start: str, end: str) -> pd.Series:
    """Download the daily "Adj Close" series of a single ticker via yfinance.

    The adjusted close is kept rather than the raw close. Both column layouts
    that ``yf.download`` may return are handled: flat columns, and MultiIndex
    columns with the price field on either level.

    Parameters
    ----------
    ticker : str
        Symbol to download, e.g. ``"^GSPC"``.
    start, end : str
        Date bounds forwarded unchanged to ``yf.download``.
        NOTE(review): yfinance documents ``end`` as exclusive — confirm that
        callers expecting the end date itself account for this.

    Returns
    -------
    pd.Series
        The "Adj Close" values indexed as returned by yfinance, with missing
        values dropped.

    Raises
    ------
    ValueError
        If the download returns an empty frame.
    KeyError
        If no "Adj Close" column can be found in the downloaded frame.
    """
    df = yf.download(
        ticker, start=start, end=end, interval="1d",
        group_by="column", auto_adjust=False, progress=False, threads=False
    )
    if df.empty:
        raise ValueError(f" Données vides pour {ticker}. Vérifie la connexion ou les dates.")

    field = "Adj Close"

    if isinstance(df.columns, pd.MultiIndex):
        if field in df.columns.get_level_values(0):
            # (field, ticker) layout: select the field, keep the first ticker column.
            # An explicit membership test replaces the former blanket
            # `except Exception`, so unrelated errors are no longer masked.
            s = df.xs(field, axis=1, level=0).iloc[:, 0]
        else:
            # Other layouts, e.g. (ticker, field): look for the field anywhere
            # in the column key.
            cols = [c for c in df.columns if field in str(c)]
            if not cols:
                raise KeyError(f"'Adj Close' introuvable. Colonnes: {list(df.columns)}")
            s = df[cols[0]]
    elif field in df.columns:
        # Flat columns: direct lookup.
        s = df[field]
    else:
        raise KeyError(f"'Adj Close' introuvable. Colonnes: {list(df.columns)}")

    return s.dropna()

# Fetch the two daily adjusted-close series over the project window and name
# each one after the column it will occupy in the combined table.
sp500_price, vix_price = (
    download_series(symbol, START, END).rename(label)
    for symbol, label in (("^GSPC", "SP500"), ("^VIX", "VIX"))
)

# Align both series on their common dates, then add the daily log changes
# log(P_t / P_{t-1}); the first row has no previous value and is dropped.
market_daily = (
    pd.concat([sp500_price, vix_price], axis=1)
    .dropna()
    .assign(
        SP500_return_d=lambda px: np.log(px["SP500"] / px["SP500"].shift(1)),
        VIX_change_d=lambda px: np.log(px["VIX"] / px["VIX"].shift(1)),
    )
    .dropna()
)

# Persist the daily table next to the other raw exports.
out_daily = RAW_DIR / "market_daily.csv"
market_daily.to_csv(out_daily)
print(f"✅ Enregistré : {out_daily} | lignes = {len(market_daily)}")

market_daily.head(3)


✅ Enregistré : ../datasets/raw_data/market_daily.csv | lignes = 1509


Unnamed: 0_level_0,SP500,VIX,SP500_return_d,VIX_change_d
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2015-01-05,2020.579956,19.92,-0.018447,0.113088
2015-01-06,2002.609985,21.120001,-0.008933,0.058496
2015-01-07,2025.900024,19.309999,0.011563,-0.089597


In [6]:
# Weekly S&P 500 and VIX on a week-ending-Friday ("W-FRI") grid, keeping the
# last available observation of each week.
# NOTE(review): the final bin is labelled with the following Friday even when
# the week is incomplete; the displayed tail contains a 2021-01-01 row, which
# is after END — confirm whether that partial week should be kept.

# S&P 500: weekly level, weekly log return log(P_t / P_{t-1}), and the
# 4-week rolling standard deviation of those weekly returns.
sp500_w = (
    sp500_price.resample("W-FRI").last().to_frame()
    .assign(
        SP500_return_w=lambda wk: np.log(wk["SP500"] / wk["SP500"].shift(1)),
        SP500_vol_w_4=lambda wk: wk["SP500_return_w"].rolling(4).std(),
    )
)

# VIX: weekly level and weekly log change.
vix_w = (
    vix_price.resample("W-FRI").last().to_frame()
    .assign(VIX_change_w=lambda wk: np.log(wk["VIX"] / wk["VIX"].shift(1)))
)

# Keep only the weeks present in both tables and drop the warm-up rows where
# the return or the rolling volatility is not yet defined.
vix_columns = ["VIX", "VIX_change_w"]
market_weekly = sp500_w.join(vix_w[vix_columns], how="inner").dropna()

out_weekly = RAW_DIR / "market_weekly_W-FRI.csv"
market_weekly.to_csv(out_weekly)
print(f" Saved : {out_weekly} | lignes = {len(market_weekly)}")

market_weekly.tail(3)


 Saved : ../datasets/raw_data/market_weekly_W-FRI.csv | lignes = 310


Unnamed: 0_level_0,SP500,SP500_return_w,SP500_vol_w_4,VIX,VIX_change_w
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-12-18,3709.409912,0.012465,0.014038,21.57,-0.077579
2020-12-25,3703.060059,-0.001713,0.012232,21.530001,-0.001856
2021-01-01,3732.040039,0.007795,0.009887,22.77,0.055997
