In [None]:
import os
import numpy as np
import pandas as pd

# === Input folder and ticker -> file-basename lookup ===
# Folder that holds the per-asset prediction CSVs. It is a relative path, so
# it resolves against the notebook's current working directory.
predicted_data_folder = "Pred_Data"

# Maps a ticker symbol to the basename used when building prediction file
# names. Insertion order is preserved, so assets are iterated in this order.
# NOTE(review): the saved cell outputs in this notebook list HDFC_BANK_LTD and
# do not list SUN_PHARMACEUTICAL_IND_L, so they appear to come from an earlier
# version of this mapping -- restart the kernel and re-run to refresh them.
ticker_to_basename = dict([
    # Index series
    ("^NSEI", "NIFTY_50"),
    ("^NSEBANK", "NIFTY_BANK"),
    ("^CNXIT", "NIFTY_IT"),
    ("^CNXPHARMA", "NIFTY_PHARMA"),
    ("^CNXFMCG", "NIFTY_FMCG"),
    ("^CNXAUTO", "NIFTY_AUTO"),
    ("^CNXMETAL", "NIFTY_METAL"),
    ("^CNXREALTY", "NIFTY_REALTY"),
    ("^CNXENERGY", "NIFTY_ENERGY"),
    ("NIFTY_FIN_SERVICE.NS", "NIFTY_FIN_SERVICE"),
    # Individual companies
    ("RELIANCE.NS", "RELIANCE_INDUSTRIES_LTD"),
    ("TCS.NS", "TATA_CONSULTANCY_SERV_LT"),
    ("SUNPHARMA.NS", "SUN_PHARMACEUTICAL_IND_L"),
    ("ICICIBANK.NS", "ICICI_BANK_LTD."),
    ("INFY.NS", "INFOSYS_LIMITED"),
    ("SBIN.NS", "STATE_BANK_OF_INDIA"),
    ("BHARTIARTL.NS", "BHARTI_AIRTEL_LIMITED"),
    ("ITC.NS", "ITC_LTD"),
    ("LT.NS", "LARSEN_&_TOUBRO_LTD."),
    ("HINDUNILVR.NS", "HINDUSTAN_UNILEVER_LTD."),
])

In [12]:
def simulate_investment(df, amt, start_date, end_date, plot=False):
    """Backtest a long-or-cash strategy driven by predicted price moves.

    For every row after the first one in the window, the position is held for
    that step when the predicted price rose versus the previous row
    (``PredictedPrice.pct_change() > 0``); the portfolio value is then
    multiplied by ``1 + ActualPrice.pct_change()``. Otherwise the value is
    carried forward unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``PredictedPrice`` and ``ActualPrice`` columns plus ``Datetime``
        as a column or index level. A copy is used; the caller's frame is not
        modified.
    amt : int or float
        Starting capital.
    start_date, end_date : str or datetime-like
        Requested window, parsed with ``pd.to_datetime``. The window actually
        used runs from the first row at or after ``start_date`` to the last
        row at or before ``end_date`` (both inclusive). A date-only
        ``end_date`` parses to midnight, so rows stamped later on that
        calendar day are excluded.
    plot : bool, default False
        Accepted for backward compatibility; currently ignored.

    Returns
    -------
    dict
        Keys: "Initial Date Given", "Actual date of investment",
        "Final Date Given", "Actual date of withdraw", "Initial Amount",
        "Final Amount" (rounded to 2 decimals) and "Return (%)" (rounded to
        2 decimals).

    Raises
    ------
    ValueError
        If ``Datetime`` is missing, if no row lies at/after ``start_date`` or
        at/before ``end_date``, or if the resulting window contains no rows
        (for example when ``start_date`` is later than ``end_date``).
    """
    df = df.copy()

    if 'Datetime' in df.index.names:
        df = df.reset_index()

    if 'Datetime' not in df.columns:
        raise ValueError("No 'Datetime' column found after reset_index.")

    df["Datetime"] = pd.to_datetime(df["Datetime"])
    df = df.sort_values("Datetime").reset_index(drop=True)

    # ---- Snap the requested window onto rows that actually exist ----
    on_or_after_start = df[df["Datetime"] >= pd.to_datetime(start_date)]
    if on_or_after_start.empty:
        raise ValueError("No data after the given start_date")
    start_row = on_or_after_start.iloc[0]

    on_or_before_end = df[df["Datetime"] <= pd.to_datetime(end_date)]
    if on_or_before_end.empty:
        raise ValueError("No data before the given end_date")
    end_row = on_or_before_end.iloc[-1]

    df = df[(df["Datetime"] >= start_row["Datetime"]) & (df["Datetime"] <= end_row["Datetime"])].reset_index(drop=True)
    if df.empty:
        # Happens when the snapped start lies after the snapped end; without
        # this guard the failure surfaced as an unrelated length-mismatch error.
        raise ValueError("No data between the given start_date and end_date")

    # ---- Step-over-step predicted and actual returns ----
    pred_return = df["PredictedPrice"].pct_change()
    actual_return = df["ActualPrice"].pct_change()

    # Growth factor per step: 1 + actual return when the model predicted a
    # rise, exactly 1.0 otherwise. `.gt(0)` is False for NaN, so the first row
    # (which has no previous row to compare with) never counts as a signal.
    hold = pred_return.gt(0).to_numpy()
    growth = np.where(hold, 1.0 + actual_return.to_numpy(dtype=float), 1.0)

    # Seed the running product with the starting capital so the cumulative
    # product reproduces `cash *= (1 + r)` step by step, in the same order.
    growth[0] = float(amt)
    values = np.cumprod(growth)

    final_value = float(values[-1])
    profit_pct = ((final_value - amt) / amt) * 100

    summary = {
        "Initial Date Given": start_date,
        "Actual date of investment": str(start_row["Datetime"].date()),
        "Final Date Given": end_date,
        "Actual date of withdraw": str(end_row["Datetime"].date()),
        "Initial Amount": amt,
        "Final Amount": round(final_value, 2),
        "Return (%)": round(profit_pct, 2)
    }

    return summary


In [21]:
# Per-asset inputs for the portfolio step, keyed by file basename.
# Both dicts are filled together, so they always describe the same assets.
actual_returns = {}
expected_returns = {}

start_date = "2025-10-01"
end_date = "2025-11-06"

# Leading rows discarded from every predictions file.
# NOTE(review): presumably a model warm-up period -- confirm why 300.
WARMUP_ROWS = 300
# Starting capital handed to simulate_investment; only the percentage return is used.
INITIAL_CAPITAL = 100000

for ticker, basename in ticker_to_basename.items():
    file_path = os.path.join(predicted_data_folder, f"{basename}_predictions_xgboost.csv")
    if not os.path.exists(file_path):
        print(f"⚠️ Missing: {basename}")
        continue

    df = pd.read_csv(file_path, parse_dates=["Datetime"])
    # Treat +/-inf as missing, then drop rows lacking either price.
    df = df.replace([-np.inf, np.inf], np.nan).dropna(subset=["ActualPrice", "PredictedPrice"])

    if len(df) <= WARMUP_ROWS:
        print(f"⚠️ Skipping {basename} — insufficient valid rows.")
        continue
    df = df.iloc[WARMUP_ROWS:].copy()

    df["Actual_Return"] = df["ActualPrice"].pct_change()

    # "Expected" return here is the simulated strategy return over the window.
    # Best-effort by design: one failing asset is reported and skipped rather
    # than aborting the whole loop.
    try:
        sim = simulate_investment(df, amt=INITIAL_CAPITAL, start_date=start_date, end_date=end_date, plot=False)
    except Exception as e:
        print(f"Simulation failed for {basename}: {e}")
        continue

    exp_ret = sim["Return (%)"] / 100.0
    expected_returns[basename] = exp_ret
    # Recorded only after a successful simulation so the covariance matrix
    # never contains an asset that has no expected return.
    actual_returns[basename] = df.set_index("Datetime")["Actual_Return"].dropna()
    print(f"{basename}: Expected Return {exp_ret:.3f}")

if not actual_returns:
    raise ValueError(
        "No asset was processed successfully; check predicted_data_folder and the date range."
    )

# Convert to DataFrames
# Keep only timestamps present for every asset before estimating the covariance.
actual_df = pd.concat(actual_returns, axis=1).dropna()
cov_matrix = actual_df.cov()
expected_return_series = pd.Series(expected_returns)


NIFTY_50: Expected Return 0.038
NIFTY_BANK: Expected Return 0.041
NIFTY_IT: Expected Return 0.057
NIFTY_PHARMA: Expected Return 0.018
NIFTY_FMCG: Expected Return 0.078
NIFTY_AUTO: Expected Return 0.008
NIFTY_METAL: Expected Return 0.051
NIFTY_REALTY: Expected Return 0.045
NIFTY_ENERGY: Expected Return 0.037
NIFTY_FIN_SERVICE: Expected Return 0.046
RELIANCE_INDUSTRIES_LTD: Expected Return 0.067
TATA_CONSULTANCY_SERV_LT: Expected Return 0.027
HDFC_BANK_LTD: Expected Return -0.003
ICICI_BANK_LTD.: Expected Return -0.002
INFOSYS_LIMITED: Expected Return 0.076
STATE_BANK_OF_INDIA: Expected Return 0.082
BHARTI_AIRTEL_LIMITED: Expected Return 0.092
ITC_LTD: Expected Return 0.064
LARSEN_&_TOUBRO_LTD.: Expected Return -0.002
HINDUSTAN_UNILEVER_LTD.: Expected Return 0.036


In [22]:
# Print a caption, then leave the rounded frame as the cell's last expression
# so the notebook renders it as a table.
print("\nCovariance Matrix (from Actual Prices):")
cov_matrix.round(decimals=6)


Covariance Matrix (from Actual Prices):


Unnamed: 0,NIFTY_50,NIFTY_BANK,NIFTY_IT,NIFTY_PHARMA,NIFTY_FMCG,NIFTY_AUTO,NIFTY_METAL,NIFTY_REALTY,NIFTY_ENERGY,NIFTY_FIN_SERVICE,RELIANCE_INDUSTRIES_LTD,TATA_CONSULTANCY_SERV_LT,HDFC_BANK_LTD,ICICI_BANK_LTD.,INFOSYS_LIMITED,STATE_BANK_OF_INDIA,BHARTI_AIRTEL_LIMITED,ITC_LTD,LARSEN_&_TOUBRO_LTD.,HINDUSTAN_UNILEVER_LTD.
NIFTY_50,9e-06,9e-06,9e-06,6e-06,5e-06,1e-05,1.2e-05,1.1e-05,1.2e-05,9e-06,1.1e-05,7e-06,7e-06,9e-06,9e-06,1.1e-05,8e-06,6e-06,1.2e-05,4e-06
NIFTY_BANK,9e-06,1.2e-05,6e-06,5e-06,4e-06,9e-06,1.1e-05,1.2e-05,1.1e-05,1.2e-05,9e-06,5e-06,1.1e-05,1.2e-05,6e-06,1.4e-05,7e-06,4e-06,1e-05,2e-06
NIFTY_IT,9e-06,6e-06,2.3e-05,6e-06,5e-06,9e-06,1.1e-05,9e-06,9e-06,7e-06,8e-06,1.9e-05,3e-06,7e-06,2.4e-05,7e-06,7e-06,5e-06,9e-06,3e-06
NIFTY_PHARMA,6e-06,5e-06,6e-06,1.4e-05,4e-06,8e-06,1.1e-05,9e-06,9e-06,5e-06,7e-06,4e-06,3e-06,4e-06,5e-06,6e-06,5e-06,3e-06,8e-06,3e-06
NIFTY_FMCG,5e-06,4e-06,5e-06,4e-06,1.1e-05,6e-06,5e-06,5e-06,5e-06,4e-06,6e-06,4e-06,3e-06,3e-06,5e-06,4e-06,4e-06,1.1e-05,4e-06,1.2e-05
NIFTY_AUTO,1e-05,9e-06,9e-06,8e-06,6e-06,2e-05,1.6e-05,1.5e-05,1.4e-05,9e-06,1.1e-05,7e-06,5e-06,8e-06,8e-06,1.1e-05,9e-06,6e-06,1.2e-05,5e-06
NIFTY_METAL,1.2e-05,1.1e-05,1.1e-05,1.1e-05,5e-06,1.6e-05,3.1e-05,2e-05,2.1e-05,1.1e-05,1.4e-05,7e-06,7e-06,9e-06,9e-06,1.7e-05,9e-06,5e-06,1.7e-05,2e-06
NIFTY_REALTY,1.1e-05,1.2e-05,9e-06,9e-06,5e-06,1.5e-05,2e-05,3.8e-05,1.9e-05,1.2e-05,1.3e-05,6e-06,1e-05,1.1e-05,7e-06,1.6e-05,9e-06,5e-06,1.7e-05,2e-06
NIFTY_ENERGY,1.2e-05,1.1e-05,9e-06,9e-06,5e-06,1.4e-05,2.1e-05,1.9e-05,2.4e-05,1.1e-05,1.6e-05,6e-06,6e-06,9e-06,7e-06,1.7e-05,8e-06,5e-06,1.8e-05,2e-06
NIFTY_FIN_SERVICE,9e-06,1.2e-05,7e-06,5e-06,4e-06,9e-06,1.1e-05,1.2e-05,1.1e-05,1.2e-05,9e-06,5e-06,1.2e-05,1.2e-05,6e-06,1.3e-05,7e-06,4e-06,1e-05,2e-06


## 🧭 **Efficient-Frontier Classical Mean–Variance**

**Goal:**
Maximize
$$
\text{Sharpe ratio} = \frac{E[R_p] - R_f}{\sigma_p}
$$
using covariance of returns and expected returns.

**Pros:**

* Simple, interpretable, mathematically elegant
* Works well with short horizons if data is stable

**Cons:**

* Covariance estimation is noisy intraday
* Assumes normally distributed returns (not always true)

✅ **Simple and statistically proven**



## ⚡ **Exponentially Weighted Covariance / EWMA**

Instead of using equal-weighted rolling windows (past *k* days), use **exponential weighting**:
$$
\sigma^2_t = \lambda \sigma^2_{t-1} + (1 - \lambda) r_t^2
$$

* More recent data → heavier weight
* Good for **intraday / high-frequency** data

**Implementation:** replace `.cov()` with exponentially weighted covariance.

✅ Reacts faster to new market volatility.



## 🔮 **Bayesian Portfolio Optimization (Black–Litterman Model)**

**Idea:** combine your model’s “views” (like XGBoost expected returns) with market equilibrium returns.

* **Priors** = market-implied returns
* **Posterior** = blend of priors + your predicted view
* Produces **more stable weights**

**Formula:**
$$
\mu_{BL} = \left[(\tau\Sigma)^{-1} + P^T \Omega^{-1} P\right]^{-1} \left[(\tau\Sigma)^{-1} \Pi + P^T \Omega^{-1} Q\right]
$$

**Tools:** `pyportfolioopt` or custom Bayesian posterior update.

✅ Handles estimation error much better — excellent for short-term noisy signals.



## 🤖 **Reinforcement Learning (Dynamic Portfolio Allocation)**

Treat it like an **agent** problem:

* **State:** recent price movements, volatility, indicators
* **Action:** allocate weights among assets
* **Reward:** portfolio return (risk-adjusted)

Common algorithms:

* Deep Q-Learning
* PPO (Proximal Policy Optimization)
* DDPG

**Libraries:** `FinRL`, `stable-baselines3`, `TensorTrade`

✅ Learns patterns automatically.
🚫 Needs heavy data + tuning, but great for intraday.



## 🧱 **Hierarchical Risk Parity (HRP)** — *Machine Learning–Based Portfolio Optimization*

**Idea:**
HRP applies **unsupervised machine learning (hierarchical clustering)** to group assets by correlation structure
and then allocates risk in a **bottom-up, balanced** way — without inverting the covariance matrix.

Instead of directly solving for optimal weights via matrix algebra,
HRP **learns the structure** of the market using cluster trees — a core concept from **machine learning**.



### 🧩 **Steps:**

1. Compute correlation matrix of asset returns.
2. Perform **unsupervised learning** — specifically a **hierarchical clustering algorithm** from ML (e.g., single-linkage, Ward, or average linkage).
3. Reorder covariance matrix based on cluster tree (dendrogram).
4. Allocate portfolio weights recursively to achieve **equal risk contribution** within clusters.

### ✅ **Pros:**

* Robust to noisy covariance estimates
* Naturally diversifies correlated assets
* Scales well to large portfolios
* Machine-learning interpretability (cluster explainability)

### ⚠️ **Cons:**

* No explicit Sharpe ratio optimization
* Cluster linkage choice affects results
* Computationally heavier for high-frequency updates


## 🧩 **Summary Table**

| Method                             | Uses                                                             | Pros                                        | Cons                                      |
| ---------------------------------- | ---------------------------------------------------------------- | ------------------------------------------- | ----------------------------------------- |
| **Mean–Variance (Classical)**      | Covariance-based optimization                                    | Simple, interpretable                       | Assumes normal returns, unstable intraday |
| **EWMA**                           | Dynamic covariance                                               | Captures volatility shifts                  | Needs λ tuning                            |
| **Black–Litterman**                | Bayesian mix of priors + views                                   | Stable, interpretable                       | Complex math                              |
| **RL Agent**                       | Adaptive allocation                                              | Learns nonlinear patterns                   | Data hungry, hard to tune                 |
| **Hierarchical Risk Parity (HRP)** | *Machine learning (unsupervised clustering)* for risk allocation | Robust to noise, diversified, interpretable | No explicit Sharpe optimization           |


# What is Value at Risk (VaR)?

**Value at Risk (VaR)** estimates how much you can **lose** with a certain **confidence level** over a specific **time period**.

> “At 95% confidence, the portfolio will not lose more than X% (or ₹X) in one period.”


## **Portfolio Return Formula**

For a portfolio with $n$ assets:

$$
R_p = \sum_{i=1}^{n} w_i R_i
$$

where:

* $R_p$ = portfolio return
* $w_i$ = weight of asset $i$ in the portfolio
* $R_i$ = return of asset $i$

Expected (mean) portfolio return:

$$
\mu_p = \mathbf{w}^T \boldsymbol{\mu}
$$

Portfolio variance:

$$
\sigma_p^2 = \mathbf{w}^T \Sigma \mathbf{w}
$$

where $\Sigma$ = covariance matrix of asset returns.

Portfolio standard deviation (volatility):

$$
\sigma_p = \sqrt{\mathbf{w}^T \Sigma \mathbf{w}}
$$



## **Parametric (Analytical) VaR Formula**

If portfolio returns are **normally distributed**,
VaR at confidence level $\alpha$ (e.g., 95% or 99%) is:

$$
VaR_{\alpha} = z_{\alpha} \, \sigma_p - \mu_p
$$

or equivalently (if you care only about losses):

$$
VaR_{\alpha} = -\left(\mu_p - z_{\alpha} \, \sigma_p\right)
$$

Where:

* $\mu_p$ = mean portfolio return
* $\sigma_p$ = portfolio standard deviation
* $z_{\alpha}$ = quantile from the standard normal distribution

  * For 95% → $z_{0.95} \approx 1.65$
  * For 99% → $z_{0.99} \approx 2.33$