# Efficient Frontier Project (Updated) — ECO 43000

**Student ID:** 23841777  
**Instructor:** John Droescher

This updated notebook adds a **legend** to the Efficient Frontier chart, with one entry for each plotted series:
- Monte Carlo portfolios
- Max Sharpe
- Regression frontier

It otherwise reproduces the full analysis.


In [None]:
# Imports & configuration
import os, math
from datetime import datetime, timedelta
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

# Seed NumPy's global random generator so repeated runs draw the same
# random numbers (the class below samples from np.random directly).
SEED = 42
np.random.seed(SEED)

# Directory that receives the saved chart images; created at import time if
# it does not exist yet.
# NOTE(review): this absolute path looks environment-specific -- confirm it
# is writable wherever the notebook is run.
OUTPUT_DIR = "/mnt/data/efficient_frontier_project"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Ticker symbols that make up the portfolio universe.
TICKERS = ["IBM", "JPM", "XOM", "WMT", "JNJ", "KO", "T"]
# Risk-free rate handed to the project as ``rf`` and subtracted from the
# portfolio return in the Sharpe ratio.
RISK_FREE_RATE = 0.04
# Price window: ends at the moment the cell runs and starts 735 calendar
# days (two 365-day years plus 5 days) earlier.
END_DATE = datetime.today()
START_DATE = END_DATE - timedelta(days=365*2 + 5)


In [None]:
# Updated class with legend in the frontier plot
class EfficientFrontierProject:
    """Monte Carlo efficient-frontier analysis for a list of tickers.

    The methods are meant to be called in pipeline order, each one storing
    its result on the instance for the next step:
    ``fetch_prices`` -> ``compute_statistics`` -> ``monte_carlo`` ->
    ``regression_frontier`` -> ``plot_correlation_matrix`` / ``plot_frontier``.

    Attributes:
        tickers: Ticker symbols; fixes the asset order used everywhere.
        start_date: First date of the price window.
        end_date: Last date of the price window.
        rf: Risk-free rate subtracted from returns in the Sharpe ratio.
        output_dir: Directory in which saved figures are written.
        prices: Price DataFrame set by ``fetch_prices``.
        returns: Daily log returns set by ``compute_statistics``.
        mu: Annualised mean log returns set by ``compute_statistics``.
        Sigma: Annualised covariance matrix set by ``compute_statistics``.
        corr: Correlation matrix set by ``compute_statistics``.
        mc_results: Simulated portfolios set by ``monte_carlo``.
        opt_weights: Weights of the max-Sharpe simulated portfolio.
        opt_stats: Return / Volatility / Sharpe of that portfolio.
    """

    # Assumptions used only when prices must be simulated. The per-asset
    # values are cycled when there are more tickers than entries.
    _SIM_ANNUAL_MU = (0.08, 0.10, 0.11, 0.07, 0.06, 0.065, 0.055)
    _SIM_ANNUAL_SIGMA = (0.22, 0.24, 0.27, 0.18, 0.16, 0.17, 0.20)
    _SIM_BASE_CORR = 0.25
    # Number of periods per year used to annualise daily statistics.
    _TRADING_DAYS = 252

    def __init__(self, tickers, start_date, end_date, rf=0.0, output_dir="."):
        self.tickers = tickers
        self.start_date = start_date
        self.end_date = end_date
        self.rf = rf
        self.output_dir = output_dir
        self.prices = None
        self.returns = None
        self.mu = None
        self.Sigma = None
        self.corr = None
        self.mc_results = None
        self.opt_weights = None
        self.opt_stats = None

    def _require(self, attr, producer):
        """Return ``self.<attr>`` or raise if the producing step has not run."""
        value = getattr(self, attr, None)
        if value is None:
            raise RuntimeError(
                f"'{attr}' is not available; call {producer}() first"
            )
        return value

    def fetch_prices(self):
        """Populate ``self.prices`` from yfinance, or simulate on failure.

        Any failure of the download path (yfinance not installed, network
        error, missing tickers, empty result) triggers the simulated
        fallback; a ``RuntimeWarning`` carrying the reason is emitted so the
        switch is not silent.

        Returns:
            ``"downloaded"`` or ``"simulated"``, naming the data source used.

        Raises:
            ValueError: If the fallback is needed but the date window holds
                no business days.
        """
        try:
            self.prices = self._download_prices()
        except Exception as exc:  # deliberate best-effort: fall back below
            import warnings

            warnings.warn(
                f"Price download failed ({exc!r}); using simulated prices.",
                RuntimeWarning,
                stacklevel=2,
            )
            self.prices = self._simulate_prices()
            return "simulated"
        return "downloaded"

    def _download_prices(self):
        """Download adjusted closes, one column per ticker in ticker order."""
        import yfinance as yf

        # auto_adjust is passed explicitly so the "Adj Close" column is
        # requested regardless of the installed yfinance default.
        # NOTE(review): confirm against the yfinance version in use.
        raw = yf.download(
            self.tickers,
            start=self.start_date,
            end=self.end_date,
            progress=False,
            auto_adjust=False,
        )
        df = raw["Adj Close"]
        if isinstance(df, pd.Series):
            df = df.to_frame()
            # A lone Series carries the field name, not the ticker symbol.
            if len(self.tickers) == 1:
                df.columns = list(self.tickers)
        missing = [t for t in self.tickers if t not in df.columns]
        if len(df) == 0 or missing:
            raise RuntimeError("Missing tickers or empty data")
        # Later steps pair values with self.tickers by position, so the
        # columns must follow that exact order.
        df = df.loc[:, list(self.tickers)].dropna()
        if df.empty:
            raise RuntimeError("Missing tickers or empty data")
        return df

    def _simulate_prices(self):
        """Simulate correlated geometric-Brownian-motion prices from 100."""
        dates = pd.bdate_range(self.start_date, self.end_date)
        n = len(dates)
        k = len(self.tickers)
        if n == 0:
            raise ValueError(
                "No business days between start_date and end_date"
            )
        # np.resize truncates or repeats the defaults to exactly k entries.
        annual_mu = np.resize(self._SIM_ANNUAL_MU, k)
        annual_sigma = np.resize(self._SIM_ANNUAL_SIGMA, k)
        corr_matrix = np.full((k, k), self._SIM_BASE_CORR)
        np.fill_diagonal(corr_matrix, 1.0)
        cov = np.outer(annual_sigma, annual_sigma) * corr_matrix
        dt = 1 / self._TRADING_DAYS
        chol = np.linalg.cholesky(cov * dt)
        # Constant per-step log drift, hoisted out of the loop.
        drift = (annual_mu - 0.5 * annual_sigma**2) * dt
        prices = np.zeros((n, k))
        prices[0, :] = 100.0
        for t in range(1, n):
            shock = chol @ np.random.normal(size=k)
            prices[t, :] = prices[t - 1, :] * np.exp(drift + shock)
        return pd.DataFrame(prices, index=dates, columns=self.tickers)

    def compute_statistics(self):
        """Compute annualised return statistics from ``self.prices``.

        Price columns are first selected in ``self.tickers`` order so that
        ``mu``, ``Sigma`` and ``corr`` line up with the positional ticker
        labels used by ``monte_carlo`` and the correlation plot.

        Returns:
            Tuple ``(mu, Sigma, corr)``: mean log return and covariance,
            both scaled by 252, and the correlation matrix.

        Raises:
            RuntimeError: If ``fetch_prices`` has not populated prices.
            KeyError: If a ticker has no column in ``self.prices``.
        """
        prices = self._require("prices", "fetch_prices")
        prices = prices.loc[:, list(self.tickers)]
        ret = np.log(prices / prices.shift(1)).dropna()
        self.returns = ret
        self.mu = ret.mean() * self._TRADING_DAYS
        self.Sigma = ret.cov() * self._TRADING_DAYS
        self.corr = ret.corr()
        return self.mu, self.Sigma, self.corr

    def monte_carlo(self, n_portfolios=50000):
        """Sample random long-only portfolios and pick the best Sharpe ratio.

        Weights are drawn from a flat Dirichlet distribution using NumPy's
        global random state, so each row is non-negative and sums to one.
        All portfolios are evaluated with array operations rather than a
        Python loop.

        Args:
            n_portfolios: Number of random portfolios to draw (at least 1).

        Returns:
            Tuple ``(mc_results, opt_weights, opt_stats)``: a DataFrame with
            ``Return``, ``Volatility``, ``Sharpe`` and one weight column per
            ticker; the weight array of the max-Sharpe row; and a dict with
            that row's ``Return``, ``Volatility`` and ``Sharpe``.

        Raises:
            RuntimeError: If ``compute_statistics`` has not been called.
            ValueError: If ``n_portfolios`` is less than 1.
        """
        mu = self._require("mu", "compute_statistics").values
        Sigma = self._require("Sigma", "compute_statistics").values
        if n_portfolios < 1:
            raise ValueError("n_portfolios must be at least 1")
        n = len(self.tickers)

        weights = np.random.dirichlet(np.ones(n), size=n_portfolios)
        port_mu = weights @ mu
        # Row-wise quadratic form w' Sigma w.
        port_var = ((weights @ Sigma) * weights).sum(axis=1)
        port_std = np.sqrt(port_var)
        # Portfolios without a positive volatility keep -inf so they can
        # never be selected as the optimum.
        sharpe = np.full(n_portfolios, -np.inf)
        np.divide(port_mu - self.rf, port_std, out=sharpe, where=port_std > 0)

        df = pd.DataFrame(
            {"Return": port_mu, "Volatility": port_std, "Sharpe": sharpe}
        )
        for i, t in enumerate(self.tickers):
            df[t] = weights[:, i]
        self.mc_results = df

        idx = df["Sharpe"].idxmax()
        self.opt_weights = df.loc[idx, self.tickers].values
        self.opt_stats = {
            "Return": df.loc[idx, "Return"],
            "Volatility": df.loc[idx, "Volatility"],
            "Sharpe": df.loc[idx, "Sharpe"],
        }
        return self.mc_results, self.opt_weights, self.opt_stats

    @staticmethod
    def _upper_envelope(vol, ret, vmin, vmax, bins):
        """Return bin centres and max return for each occupied volatility bin.

        Bins are half-open ``[edge_i, edge_i+1)`` except the last one, which
        also includes ``vmax`` so the most volatile portfolio is not dropped.
        """
        edges = np.linspace(vmin, vmax, bins + 1)
        centers = 0.5 * (edges[:-1] + edges[1:])
        bin_idx = np.searchsorted(edges, vol, side="right") - 1
        # vol == vmax lands one past the final bin; fold it back in.
        bin_idx = np.clip(bin_idx, 0, bins - 1)
        best = np.full(bins, -np.inf)
        np.maximum.at(best, bin_idx, ret)
        occupied = np.isfinite(best)
        return centers[occupied], best[occupied]

    def regression_frontier(self, bins=60, degree=2):
        """Fit a polynomial through the upper edge of the simulated cloud.

        The volatility range is split into ``bins`` equal-width bins, the
        highest return in each occupied bin is taken, and a polynomial of
        the given degree is regressed on those points.

        Args:
            bins: Number of equal-width volatility bins (at least 1).
            degree: Degree of the polynomial in volatility.

        Returns:
            Tuple ``(model, reg_data)``: the fitted ``LinearRegression`` and
            a DataFrame with 400 ``Volatility`` / ``Return`` curve points
            spanning the observed volatility range.

        Raises:
            RuntimeError: If ``monte_carlo`` has not been called.
            ValueError: If ``bins`` is less than 1.
        """
        results = self._require("mc_results", "monte_carlo")
        if bins < 1:
            raise ValueError("bins must be at least 1")
        vol = results["Volatility"].to_numpy(dtype=float)
        ret = results["Return"].to_numpy(dtype=float)
        vmin, vmax = vol.min(), vol.max()
        centers, max_ret = self._upper_envelope(vol, ret, vmin, vmax, bins)

        poly = PolynomialFeatures(degree=degree, include_bias=False)
        X_poly = poly.fit_transform(centers.reshape(-1, 1))
        model = LinearRegression().fit(X_poly, max_ret)

        vol_curve = np.linspace(vmin, vmax, 400).reshape(-1, 1)
        ret_curve = model.predict(poly.transform(vol_curve))
        reg_data = pd.DataFrame(
            {"Volatility": vol_curve.flatten(), "Return": ret_curve}
        )
        return model, reg_data

    def _save_figure(self, fig, save, filename):
        """Tighten the layout and optionally save; return the path or None."""
        fig.tight_layout()
        if not save:
            return None
        path = os.path.join(self.output_dir, filename)
        fig.savefig(path, dpi=200, bbox_inches="tight")
        return path

    def plot_correlation_matrix(self, save=True, filename="correlation_matrix_updated.png"):
        """Draw the correlation matrix as a heat map.

        Args:
            save: When true, write the figure into ``self.output_dir``.
            filename: File name used when saving.

        Returns:
            Path of the saved image, or ``None`` when ``save`` is false.

        Raises:
            RuntimeError: If ``compute_statistics`` has not been called.
        """
        corr = self._require("corr", "compute_statistics")
        fig, ax = plt.subplots(figsize=(6, 5))
        try:
            cax = ax.imshow(corr.values, interpolation='nearest')
            ax.set_xticks(range(len(self.tickers)))
            ax.set_yticks(range(len(self.tickers)))
            ax.set_xticklabels(self.tickers, rotation=45, ha="right")
            ax.set_yticklabels(self.tickers)
            fig.colorbar(cax)
            ax.set_title("Correlation Matrix")
            return self._save_figure(fig, save, filename)
        finally:
            # Close even when drawing or saving fails so figures never pile up.
            plt.close(fig)

    def plot_frontier(self, reg_data=None, save=True, filename="efficient_frontier_updated.png"):
        """Plot the simulated portfolios, the max-Sharpe point and a legend.

        Args:
            reg_data: Optional DataFrame with ``Volatility`` and ``Return``
                columns; drawn as the regression frontier line when given.
            save: When true, write the figure into ``self.output_dir``.
            filename: File name used when saving.

        Returns:
            Path of the saved image, or ``None`` when ``save`` is false.

        Raises:
            RuntimeError: If ``monte_carlo`` has not been called.
        """
        results = self._require("mc_results", "monte_carlo")
        best = self._require("opt_stats", "monte_carlo")
        fig, ax = plt.subplots(figsize=(7, 5))
        try:
            ax.scatter(results["Volatility"], results["Return"], s=3, alpha=0.4, label="Monte Carlo portfolios")
            ax.scatter([best["Volatility"]], [best["Return"]], marker="*", s=180, label="Max Sharpe")
            if reg_data is not None:
                ax.plot(reg_data["Volatility"], reg_data["Return"], linewidth=2, label="Regression frontier")
            ax.set_xlabel("σ (Volatility)")
            ax.set_ylabel("E(R) (Annual Return)")
            ax.set_title("Efficient Frontier (Monte Carlo + Regression)")
            ax.legend(loc="best")
            return self._save_figure(fig, save, filename)
        finally:
            # Close even when drawing or saving fails so figures never pile up.
            plt.close(fig)


In [None]:
# Execute the updated pipeline and regenerate the charts.
# The steps run in dependency order; both the simulated-price fallback and
# the Monte Carlo step draw from NumPy's global random state, so reordering
# them would change the random numbers each one receives.
proj = EfficientFrontierProject(TICKERS, START_DATE, END_DATE, rf=RISK_FREE_RATE, output_dir=OUTPUT_DIR)
# src is "downloaded" or "simulated", depending on which data source was used.
src = proj.fetch_prices()
mu, Sigma, corr = proj.compute_statistics()
mc_df, opt_w, opt_stats = proj.monte_carlo(n_portfolios=50000)
model, reg_data = proj.regression_frontier(bins=60, degree=2)

# Save both figures into OUTPUT_DIR; each call returns the written file path.
cm_path = proj.plot_correlation_matrix(save=True, filename="correlation_matrix_updated.png")
ef_path = proj.plot_frontier(reg_data=reg_data, save=True, filename="efficient_frontier_updated.png")

# Bare expression: presumably shown as the notebook cell's output.
cm_path, ef_path, src, opt_stats
