# Modern Portfolio Theory

### What is Modern Portfolio Theory?

Modern portfolio theory is a practical method for selecting investments in order to maximize their overall returns within an acceptable level of risk. 
It was pioneered by Harry Markowitz in 1952, and the main takeaway from his theory is that the risk and the reward of a portfolio are not necessarily linearly related. In fact, the "efficient frontier" described by Markowitz shows that it is possible to find an optimal "mix" of assets that maximizes returns while minimizing risk.

In [3]:
# These are all the dependencies we're going to need for this project.

import os
import requests
import zipfile
import io
import pandas as pd
import numpy as np
from datetime import datetime
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import cvxpy as cp
from concurrent.futures import ThreadPoolExecutor
import yfinance as yf

'''
    Of these "non common" ones:
        - yfinance: Used to grab stock data.
        - ThreadPoolExecutor: Used to download files from Binance, without the downloads take 10+ minutes instead of seconds.
        - Plotly: A better alternative to matplotlib.
        - cvxpy: The optimizer algorithms.
'''

'\n    Of these "non common" ones:\n        - yfinance: Used to grab stock data.\n        - ThreadPoolExecutor: Used to download files from Binance, without the downloads take 10+ minutes instead of seconds.\n        - Plotly: A better alternative to matplotlib.\n        - cvxpy: The optimizer algorithms.\n'

In [None]:
# NOTE: The following two download blocks assume the existence of the directory "data" in the same current working directory. Errors were not handled.

# GET ALL CRYPTO DATA - Using futures saves a lot of time (a form of multithreading).

# Binance spot trading pairs whose daily close prices are downloaded below.
# Every name ends in 'USDT'; that suffix is split off later when the assets are labelled.
PAIRS = ['BTCUSDT', 'ETHUSDT', 'LTCUSDT', 'ADAUSDT', 'SOLUSDT', 'BNBUSDT', 'MATICUSDT', 'XRPUSDT', 'LINKUSDT', 'DOTUSDT']

# Builds the URL for the file we want to download:
def get_url(pair, interval, month, year):
    """Return the Binance public-data URL of one monthly klines archive.

    The archive is named ``<pair>-<interval>-<year>-<month>.zip`` and lives
    under ``<pair>/<interval>/``; ``month`` is zero-padded to two digits.
    """
    root = 'https://data.binance.vision/data/spot/monthly/klines'
    archive = f'{pair}-{interval}-{year}-{month:02}.zip'
    return f'{root}/{pair}/{interval}/{archive}'

# Function to download and process a single URL:
def download_and_process(url):
    """Download one zipped klines CSV and return its dates and close prices.

    Args:
        url: Address of a ``.zip`` archive holding a header-less klines CSV.

    Returns:
        A DataFrame with a ``Date`` column (``'YYYY-MM-DD'`` strings) and a
        ``Close`` column, built from the first member of the archive, or
        ``None`` when the archive has no members.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within the timeout.
    """
    # Column names for the header-less CSV; only Date and Close are kept.
    HEADER = ['Date', 'open', 'high', 'low', 'Close', 'volume', 'close_time', 'quote_volume', '1', '2', '3', '4']
    # A timeout keeps one stalled connection from blocking its worker thread forever.
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    with zipfile.ZipFile(io.BytesIO(response.content)) as z:  # Extract file in memory.
        for filename in z.namelist():
            with z.open(filename) as f:
                # usecols yields a fresh two-column frame, so the assignment
                # below never writes into a slice of a larger frame.
                df = pd.read_csv(f, names=HEADER, usecols=['Date', 'Close'])
            # The Date field is parsed as a millisecond epoch timestamp.
            df['Date'] = pd.to_datetime(df['Date'], unit='ms').dt.strftime('%Y-%m-%d')
            return df
    return None

# Downloads all files from a provided list of URLs and saves the concatenated DFs of the file into a single .csv file: 
def download_all(urls):
    """Download every URL in parallel and save the combined rows as one CSV.

    The output name is the text before the first ``-`` in the last path
    segment of ``urls[0]``; the file is written to ``./data/<name>.csv``.
    The ``./data`` directory is created when it does not exist yet.

    Args:
        urls: Non-empty list of URLs accepted by ``download_and_process``.

    Raises:
        ValueError: If ``urls`` is empty or no download produced any rows.
    """
    if not urls:
        raise ValueError('download_all() needs at least one URL.')

    filename = (urls[0].split('/')[-1]).split('-')[0]
    saveDir = './data'
    os.makedirs(saveDir, exist_ok=True)
    file_path = os.path.join(saveDir, filename + '.csv')

    print(f'Downloading Data for {filename}')
    with ThreadPoolExecutor() as executor:  # Execute each download as a thread.
        results = list(executor.map(download_and_process, urls))

    dfs = [result for result in results if result is not None]
    if not dfs:
        raise ValueError(f'No data could be downloaded for {filename}.')

    # Concatenate, order by date and renumber the rows before saving.
    sorted_df = pd.concat(dfs).sort_values(by='Date').reset_index(drop=True)
    sorted_df.to_csv(file_path)
    print(f'Data for {filename} has been downloaded successfully.')

# Download all assets' daily data starting on starting date:
def get_data(assets, startDate=2021):
    """Download daily ('1d') monthly archives for every asset and save them.

    For each asset, one URL is built per month of every year in
    ``[startDate, startDate + 4)`` that is strictly earlier than the current
    calendar year, and the list is handed to ``download_all``.

    Args:
        assets: Iterable of pair names (e.g. ``'BTCUSDT'``).
        startDate: First calendar year to request.
    """
    interval = '1d'
    current_year = datetime.now().year
    # Up to four calendar years, restricted to years that are already over.
    # Assumption: every selected pair has data for all of those months.
    years = [year for year in range(startDate, startDate + 4) if year < current_year]
    for asset in assets:
        urls = [
            get_url(asset, interval, month, year)
            for year in years
            for month in range(1, 13)
        ]
        download_all(urls)

# Download the daily data for every pair in PAIRS, using the default start year:
get_data(PAIRS)

In [None]:
# Get all stock data to add uncorrelated assets to the portfolio:

tickers = ['AAPL', 'MSFT', 'META', 'GOOGL', 'NVDA', 'INTC'] 

# Define the date range
start_date = '2021-01-01'
end_date = '2023-12-31'
save_dir = './data' 

# Create the output directory up front so that to_csv cannot fail on a missing folder.
os.makedirs(save_dir, exist_ok=True)

# Fetch the data and save to CSV files:
for ticker in tickers:
    data = yf.download(ticker, start=start_date, end=end_date)
    if data.empty:
        # Nothing came back for this ticker; do not write an empty CSV.
        print(f'No data returned for {ticker}; skipping.')
        continue
    close = data['Close']
    # NOTE(review): some yfinance versions return MultiIndex columns even for a
    # single ticker, in which case data['Close'] is a one-column DataFrame.
    # Reducing it to a Series keeps the CSV layout at "Date,Close" either way.
    if isinstance(close, pd.DataFrame):
        close = close.iloc[:, 0]
    close_prices = close.to_frame(name='Close')
    file_path = os.path.join(save_dir, f'{ticker}.csv')
    close_prices.to_csv(file_path)
    print(f'Saved data for {ticker} to {file_path}')

print("Data download complete.")

In [4]:
# Plotting existing data to visualize market behavior.

DATA = './data' # Same location as "saveDir" in the download cells; it gets its own name here because this cell only reads from it.

# Inner joins the data so that we have a single DF with the close price of each asset:
def join_data(directory):
    """Build one DataFrame holding the close price of every asset in *directory*.

    Every ``*.csv`` file must provide ``Date`` and ``Close`` columns. The
    ``Close`` column is renamed after the file (text before the first ``.``,
    with everything from ``USDT`` onward removed), and the files are
    inner-joined on ``Date`` so that only dates present in every file remain.

    Args:
        directory: Folder containing one CSV per asset.

    Returns:
        A DataFrame whose first column is ``Date`` followed by one column per
        asset, sorted by date with a fresh 0..n-1 index.

    Raises:
        ValueError: If the directory contains no ``.csv`` file.
        KeyError: If a CSV lacks the ``Date`` or ``Close`` column.
    """
    dfs = []
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # column order (and everything derived from it) reproducible.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith('.csv'):
            continue
        column_name = filename.split('.')[0].split('USDT')[0].strip()
        df = pd.read_csv(os.path.join(directory, filename))
        missing = [name for name in ('Date', 'Close') if name not in df.columns]
        if missing:
            raise KeyError(f'{filename} is missing required column(s): {missing}')
        df['Date'] = pd.to_datetime(df['Date'])
        dfs.append(df[['Date', 'Close']].rename(columns={'Close': column_name}))

    if not dfs:
        raise ValueError(f'No .csv files found in {directory!r}.')

    merged_df = dfs[0]
    for df in dfs[1:]:
        merged_df = pd.merge(merged_df, df, on='Date', how='inner')
    return merged_df.sort_values(by='Date').reset_index(drop=True)

# Build the joined price table that the plots and computations below read:

joinedDF = join_data(DATA)

# Plot the data. Asset prices differ by orders of magnitude, so each asset gets
# its own subplot instead of sharing one congested, unreadable set of axes.

data = joinedDF

titles = list(data.keys())[1:]  # Every column except the leading 'Date'.

# Two subplots per row; the number of rows follows the number of assets
# (16 assets give the original 8 x 2 grid).
n_cols = 2
n_rows = max(1, (len(titles) + n_cols - 1) // n_cols)

# Initialize figure with subplots:
fig = make_subplots(rows=n_rows, cols=n_cols, subplot_titles=titles)

# Add one trace per asset, filling the grid row by row:
for i, col in enumerate(titles):
    row_index = i // n_cols + 1
    col_index = i % n_cols + 1
    fig.add_trace(go.Scatter(x=data['Date'], y=data[col], name=col), row=row_index, col=col_index)

# Update layout for dark mode and clean theme (200 px of height per row):
fig.update_layout(
    title='Selected Asset Prices',
    xaxis_title='Date',
    yaxis_title='Price in USDT',
    template='plotly_dark',
    height=200 * n_rows,
    width=2200,
    title_x=0.5
)

# Lastly we set the axis titles of every subplot (no row/col selector = all axes).
fig.update_xaxes(title_text='Date', title_font=dict(size=12))
fig.update_yaxes(title_text='Price in USDT', title_font=dict(size=12))

# Display the plots.
fig.show()

In [5]:
# Alternative view: a logarithmic y-axis brings prices of very different
# magnitudes close enough together to share a single plot.

data = joinedDF

# One line per asset; the first column ('Date') is the shared x-axis.
price_traces = [
    go.Scatter(x=data['Date'], y=data[asset], name=asset)
    for asset in data.columns[1:]
]
fig = go.Figure(data=price_traces)

log_plot_layout = dict(
    title='Price Trends for Cryptocurrencies - Logarithmic Scale',
    xaxis_title='Date',
    yaxis_title='Price',
    template='plotly_dark',
    height=1080,
    width=1920,
    title_x=0.5,
)
fig.update_layout(**log_plot_layout)

# Switch the y-axis to a logarithmic scale, then render everything on one plot:
fig.update_yaxes(type='log')
fig.show()


In [6]:
# We can also plot each asset alone by directly accesing its file for debugging or comparisons.

def plot_close_prices(file_path):
    """Plot the ``Close`` column of one saved CSV against its ``Date`` column.

    Args:
        file_path: Path to a CSV with ``Date`` and ``Close`` columns. The
            file name (up to its first ``.``) is used in the trace name, and
            the part before ``USDT`` is used in the figure title.
    """
    df = pd.read_csv(file_path)
    # basename() follows the platform's path rules instead of assuming '/'.
    asset_name = os.path.basename(file_path).split('.')[0]
    # Computed outside the f-string: reusing the same quote character inside
    # an f-string expression is a SyntaxError before Python 3.12.
    display_name = asset_name.split('USDT')[0]
    fig = go.Figure()
    trace = go.Scatter(x=df['Date'], y=df['Close'], mode='lines', name=f'{asset_name} Close')
    fig.add_trace(trace)
    fig.update_layout(
        title=f'{display_name} Close Prices Over Time',
        xaxis_title='Date',
        yaxis_title='Close Price',
        template='plotly_dark',
        height=1080,
        width=1920,
    )
    fig.show()

# Display the plot for a single asset (here the BTCUSDT file saved by the download step).
plot_close_prices('data/BTCUSDT.csv')

In [7]:
# Now we find the correlation of prices and plot a heat map (correlation matrix) of how everything correlates to one another.

prices = (data.copy()).drop(columns=['Date'])

# We compute the correlations with three different methods to compare how they behave.
pearson_corr = prices.corr(method='pearson')
kendall_corr = prices.corr(method='kendall')
spearman_corr = prices.corr(method='spearman')

# Create subplots:
fig = make_subplots(
    rows = 1, cols = 3,
    horizontal_spacing = 0.1
)

# One panel per method: (colorbar title, matrix, colorbar x position).
heatmap_panels = [
    ("Pearson", pearson_corr, 0.3),
    ("Kendall", kendall_corr, 0.65),
    ("Spearman", spearman_corr, 1.0),
]

for panel_col, (label, corr, colorbar_x) in enumerate(heatmap_panels, start=1):
    fig.add_trace(
        go.Heatmap(
            # Rows are flipped so the diagonal runs from top-left to
            # bottom-right. The y labels are flipped with them and the x
            # labels keep the column order, so every cell sits under the
            # labels of the two assets it actually describes.
            z = corr.values[::-1],
            x = corr.columns,
            y = corr.index[::-1],
            colorscale = 'Viridis',
            hoverongaps = False,
            colorbar = dict(title = label, x = colorbar_x, title_side = 'bottom')
        ),
        row = 1, col = panel_col
    )

# Update layout:
fig.update_layout(
    title = 'Correlation Heatmap of Cryptocurrency Prices',
    template = 'plotly_dark',
    height = 640,
    width = 1920,
    title_x = 0.5,
    font = dict(
        size = 10 
    ),
    xaxis = dict(side = 'top'),
    xaxis2 = dict(side = 'top'),
    xaxis3 = dict(side ='top')
)

fig.show()


In [8]:
# Now we find the covariances of the returns and plot a heat map similar to the correlations.

# Compute covariance matrix:
returns = prices.pct_change()
# Scaled by 312 = (252 + 365) / 2, the average of the stock and crypto trading-day counts.
# NOTE(review): the price table is inner-joined on Date, so its rows may only cover
# days present in every file - confirm 312 is the intended annualization factor.
covariance_matrix = returns.cov()*312

# Create a figure for heatmap:
fig = go.Figure(data=go.Heatmap(
    # Rows are flipped so the diagonal runs from top-left to bottom-right; the
    # y labels are flipped with them and the x labels keep the column order,
    # so every cell is labelled with the two assets it actually describes.
    z = covariance_matrix.values[::-1],
    x = covariance_matrix.columns,
    y = covariance_matrix.index[::-1],
    colorscale='Viridis',
    hoverongaps=False
))
fig.update_layout(
    title = 'Covariance Heatmap of Cryptocurrency Prices',
    template = 'plotly_dark',
    height = 800,
    width = 800,
    title_x = 0.5,
    xaxis = dict(
        side = 'top'
        ),
    font = dict(
        size = 10
    )
)
# Show plot:
fig.show()

In [9]:
# We can also show what the changes look like, specifically the daily returns. They are hard for a human to read, but this info may be useful for machine learning applications.

returns = prices.pct_change()

fig = go.Figure()

# Add one trace per asset. The dates are read straight from `data`, which shares
# its row index with `prices`; keeping 'Date' out of `returns` means no asset
# column is skipped and the date column is never plotted as if it were an asset.
for col in returns.columns:
    fig.add_trace(go.Scatter(x=data['Date'], y=returns[col], name=col))
fig.update_layout(
    title='Daily Returns of Selected Assets',
    xaxis_title='Date',
    yaxis_title='Daily Return',
    template='plotly_dark',
    height=1080,
    width=1920,
    title_x=0.5
)

# Now we can plot all of these on a single plot:
fig.show()


# Optimization Step

Now we need to define the optimization problem.
In the standard "vanilla" Modern Portfolio Theory, the goal is to maximize the Sharpe ratio of the portfolio, or minimize the Portfolio Variance.

For the scope of our course, we're only concerned with Minimization.

### We define the problem as follows: 

Let $ n $ be the number of assets. Let $w \in \mathbb{R}^n$ be the weights vector, which represents the percentage to allocate for each asset.
Then the Objective Function we want to minimize is: 
$$ \min \big[  \ w^T \Sigma w  \big] \ $$
Where $\Sigma$ is the covariance matrix of assets returns and $w$ is the weights as described above.
The problem is subject to the following constraints: 

1. $\forall i\in \{1,\dots,n\}: \ 0\le w_i$.

2. $\sum_{i=1}^n w_i = 1$. 

In this project, we could have used the implementation of our HW2 or Scipy.
But for simplicity's sake, and to avoid errors from a possibly faulty implementation in HW2, we've decided to use CVXPY. It's simple and robust, though not as advanced as Scipy.


In [10]:
# We set up the optimization problem here: 

# First we need the expected returns: the mean of the period-over-period changes.
returns = prices.pct_change()
expected_returns = returns.mean()

annualized_returns = (1 + expected_returns)**312 -1

# Then we need the covariance matrix of the returns:
covariance_matrix = returns.cov()

# Sanity check: 
# expected_returns
# covariance_matrix

# And now we set up the convex optimizer of CVXPY:

w = cp.Variable(len(expected_returns)) # The variable we're optimizing over.

# The quadratic problem as formulated above; CVXPY is given the plain NumPy matrix.
objective_function = cp.Minimize(cp.quad_form(w, covariance_matrix.values))

constraints = [ cp.sum(w) == 1 , w >= 0 ] # Fully invested, long-only.

# Now we solve:

problem = cp.Problem(objective_function, constraints)
problem.solve(solver=cp.CLARABEL)

# w.value is only meaningful when the solver found a solution, so fail loudly otherwise.
if problem.status not in (cp.OPTIMAL, cp.OPTIMAL_INACCURATE):
    raise RuntimeError(f'Minimum-variance optimization failed with status: {problem.status}')

# Print results for debugging: 
optimal_weights = w.value
portfolio_variance = float(optimal_weights @ covariance_matrix.values @ optimal_weights)
portfolio_return = float(expected_returns.values @ optimal_weights)

print("Optimal Weights: \n", optimal_weights)
print(f"Optimal Portfolio Return: {portfolio_return * 100:.3f}%")
print("Optimal Portfolio Variance:", portfolio_variance)

Optimal Weights: 
 [2.63777530e-07 5.22769343e-07 2.08749184e-07 1.50407399e-01
 3.61508835e-01 2.86748766e-05 1.38510793e-06 1.63693183e-07
 6.10383686e-07 2.41240701e-02 7.92477381e-02 4.45963321e-07
 4.91798607e-07 3.84677922e-01 1.05481612e-06 2.13928183e-07]
Optimal Portfolio Return: 0.072%
Optimal Portfolio Variance: 0.0002523578472716485


# Monte Carlo Technique

In order to visualize the CML and the Efficient Frontier, we want to simulate random portfolios with random weights.
This allows us to obtain the CML shape for the assets we have, see how the portfolios over this market behave, and gives us a plot against which to compare the results of the optimization step.
This also shows just how many combinations of INEFFICIENT portfolios there can be, and how we obtain a portfolio directly on the Efficient Frontier.

### Efficient Frontier Calculation

To calculate the efficient frontier, it's simply another optimization problem again.
This time we're optimizing the following function: 
$$\min \ \big[ w^T\Sigma w - qR^Tw \big] \ $$

Where in this case $R$ is the expected returns as found before, and q=1. 

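And the optimization is subject to the previous constraints, plus a target-return constraint: 
$$R^T w = r_{\text{target}} $$ 
The expression we're subtracting is the portfolio's expected return, which this constraint pins to the return we want from the portfolio; since that term is then constant, the code below simply minimizes $w^T\Sigma w$ for each target. A portfolio with the requested return may not exist, in which case that target is skipped. 
The result of each such optimization is a portfolio that indeed lies on the Efficient Frontier, though it is not necessarily the Minimum Risk or Max Sharpe Ratio portfolio.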



In [11]:
# Efficient Frontier calculation:

def calculate_efficient_frontier(expected_returns, covariance_matrix, max_target_return = 0.07, num_points = 1000):
    """Trace minimum-variance portfolios over a grid of target returns.

    For each target in ``np.linspace(0, max_target_return, num_points)`` the
    long-only, fully invested portfolio with the smallest variance and exactly
    that expected return is computed. Targets without an optimal solution are
    left out of the result.

    Args:
        expected_returns: Per-asset expected returns (Series, list or array).
        covariance_matrix: Square covariance matrix of the asset returns
            (DataFrame or array), ordered like ``expected_returns``.
        max_target_return: Largest target return on the grid.
        num_points: Number of grid points.

    Returns:
        List of ``(portfolio_return, portfolio_variance)`` tuples, one per
        solved target, in increasing target order.
    """
    mu = np.asarray(expected_returns, dtype=float)
    sigma = np.asarray(covariance_matrix, dtype=float)
    n_assets = len(mu)
    if n_assets == 0:
        return []

    # Build the problem once; only the target-return parameter changes between solves.
    w = cp.Variable(n_assets)
    target = cp.Parameter()
    problem = cp.Problem(
        cp.Minimize(cp.quad_form(w, sigma)),
        [
            cp.sum(w) == 1,
            w >= 0,
            mu @ w == target,
        ],
    )

    # With non-negative weights summing to 1, the portfolio return is a weighted
    # average of the asset returns, so targets outside [min, max] cannot be met
    # and need no solver call.
    lowest, highest = mu.min(), mu.max()

    frontier_points = []
    for target_return in np.linspace(0, max_target_return, num_points):
        if not lowest <= target_return <= highest:
            continue
        target.value = target_return
        problem.solve(solver=cp.CLARABEL)

        # Keep only the targets the solver solved to optimality:
        if problem.status == cp.OPTIMAL:
            optimal_weights = w.value
            portfolio_variance = float(optimal_weights @ sigma @ optimal_weights)
            portfolio_return = float(mu @ optimal_weights)
            frontier_points.append((portfolio_return, portfolio_variance))
    return frontier_points

# Compute the frontier once from the expected returns and covariance matrix of the optimization cell; simulate() reads this global when plotting.
efficient_frontier = calculate_efficient_frontier(expected_returns, covariance_matrix)

In [12]:
# Select a seed for numpy for reproducible results.
SEED = 777 
np.random.seed(SEED)

# Period-over-period changes of every asset and their per-asset mean, recomputed for this cell.
returns = prices.pct_change()
expected_returns = returns.mean()

# We define a new class to encapsulate what a profile is for easier calculation:

class Portfolio:
    """A randomly weighted, long-only portfolio over every asset in ``returns``.

    The weights are drawn once at construction; the remaining methods derive
    the portfolio's expected return, variance and Sharpe ratio from the
    class-level statistics below.
    """

    # Class variables shared across all objects.

    RETURNS = returns.mean()      # Per-asset mean of the period returns.
    COV_MATRIX = returns.cov()    # Covariance matrix of the period returns.
    RF = 0.045  # The choice was based on the Rf rate of the USD. Read as an annual rate (assumption).
    PERIODS_PER_YEAR = 312  # Same factor the notebook uses to annualize returns and covariances.

    def __init__(self):
        self.weights = self.generate_weights()

    def generate_weights(self):
        """Return random non-negative weights that sum to 1.

        We define a very aggressive random function, since in our case we
        always choose all assets instead of a subset of them. Using "normal"
        randoms here would not show the true behavior of the simulated
        portfolios.
        """
        alpha = np.random.uniform(0.5, 2.0)
        beta = np.random.uniform(0.5, 2.0)
        # Generate random weights using a skewed distribution (Beta distribution).
        t = np.random.beta(alpha, beta, size=len(self.RETURNS))
        # Adjust weights to be more extreme.
        t = np.power(t, 4)
        # Normalize weights to sum to 1.
        return t / np.sum(t)

    def calculate_portfolio_return(self):
        """Return the weighted mean period return of the portfolio."""
        return np.dot(self.weights, self.RETURNS)

    def calculate_portfolio_variance(self):
        """Return the portfolio variance ``w^T * COV_MATRIX * w``."""
        portfolio_variance = np.dot(self.weights.T, np.dot(self.COV_MATRIX, self.weights))
        return portfolio_variance

    def calculate_sharpe_ratio(self):
        """Return the per-period Sharpe ratio: excess return over standard deviation.

        The risk is the square root of the variance, and the risk-free rate
        is converted to a per-period rate so that it is expressed in the same
        units as the portfolio return.
        """
        portfolio_return = self.calculate_portfolio_return()
        portfolio_std_dev = np.sqrt(self.calculate_portfolio_variance())
        rf_per_period = (1 + self.RF) ** (1 / self.PERIODS_PER_YEAR) - 1
        sharpe_ratio = (portfolio_return - rf_per_period) / portfolio_std_dev
        return sharpe_ratio

    def get_expected_return_and_risk(self):
        """Return ``(expected_return, variance)``; the risk is reported as variance."""
        expected_return = self.calculate_portfolio_return()
        expected_risk = self.calculate_portfolio_variance()
        return expected_return, expected_risk
         
# Test

# p = Portfolio()
# p.get_expected_return_and_risk()



In [13]:
# Now we simulate a large number of such portfolios, and plot them. (WARNING: Any choice above 100k runs will eat a lot of RAM unless we enable GPU plotting which is not in the scope of this project).

def simulate(number_of_portfolios = 50_000):
    """Simulate random portfolios and plot them with the efficient frontier.

    Each simulated portfolio becomes one dot (x = variance, y = expected
    return) colored by its Sharpe ratio. The figure also shows the
    module-level ``efficient_frontier`` points and marks the optimized
    portfolio given by the module-level ``portfolio_variance`` and
    ``portfolio_return``.

    Args:
        number_of_portfolios: How many random ``Portfolio`` objects to draw.
    """
    portfolio_returns = [] # y-axis for the plot.
    portfolio_risks = [] # x-axis for the plot.
    sharpe_ratios = [] # color coding the dots by their Sharpe ratio.

    # Each portfolio is measured as soon as it is created, so the objects
    # do not have to be kept in memory.
    for _ in range(number_of_portfolios):
        portfolio = Portfolio()
        expected_return, expected_risk = portfolio.get_expected_return_and_risk()
        portfolio_returns.append(expected_return)
        portfolio_risks.append(expected_risk)
        sharpe_ratios.append(portfolio.calculate_sharpe_ratio())

    fig = go.Figure()

    # Scatter of Portfolios:
    fig.add_trace(go.Scatter(
        x = portfolio_risks,
        y = portfolio_returns,
        mode = 'markers',
        marker = dict(color = sharpe_ratios, colorscale = 'viridis', size = 2, colorbar = dict(title = 'Sharpe Ratio')),
        name = 'Simulated Portfolio'
    ))

    # Plotting the efficient frontier; each point is (return, variance):
    fig.add_trace(go.Scatter(
        x = [point[1] for point in efficient_frontier],
        y = [point[0] for point in efficient_frontier],
        mode = 'lines',
        line = dict(width = 3, dash = 'dash', color = '#d62728'),
        name = 'Efficient Frontier'
    ))

    # The portfolio found in the optimization step:
    fig.add_trace(go.Scatter(
        x = (portfolio_variance,),
        y = (portfolio_return,),
        mode = 'markers',
        marker = dict(size = 25, symbol = "star", color = '#17becf' ,line = dict(width = 0.5, color = "DarkSlateGrey")),
        name = 'Minimum Volatility Portfolio'
    ))

    fig.update_layout(
        title='Simulated Portfolios and Efficient Frontier (CML)',
        title_x=0.5,
        # Every x value plotted above is a variance, so the axis says so.
        xaxis_title='Portfolio Risk (Variance)',
        yaxis_title='Expected Portfolio Return (%)',
        template='plotly_dark',
        height=1000,
        width=1500,
        margin=dict(
            l=100, r=100, t=100, b=100
        ),
        legend=dict(
            x=0.02,
            y=0.98,
            traceorder='normal',
            font=dict(
                size=14,
                color='white',
            ),
            bgcolor='rgba(20, 20, 20, 0.6)',
            bordercolor='rgba(255, 255, 255, 0.5)',
            borderwidth=1
        )
    )

    fig.update_yaxes(tickformat=".2%")  # Display percentages with 2 decimal places
    fig.update_xaxes(range=[0, 0.004]) # Limit the x axis to relevant risk values.
    fig.show()

# Run the simulation with the default number of portfolios and display the figure:
simulate()

In [23]:
# Lastly, we plot the percentage to allocate to each asset as we found in the optimization step: 
 
keys = expected_returns.index.tolist()
percentages = optimal_weights

# Assets below this weight are merged into a single 'Others' slice.
threshold = 0.01 

significant_keys = [k for k, p in zip(keys, percentages) if p >= threshold]
significant_percentages = [p for p in percentages if p >= threshold]
insignificant_percentages = [p for p in percentages if p < threshold]

others_percentage = sum(insignificant_percentages)
if others_percentage > 0:
    significant_keys.append('Others')
    significant_percentages.append(others_percentage)

# Pull small slices (below 2%) out of the pie so their labels stay readable.
pull_values = [0.2 if p < 0.02 else 0 for p in significant_percentages]

# AI Assisted color codes:
colors = ['#636EFA', '#EF553B', '#00CC96', '#AB63FA', '#FFA15A', 
          '#19D3F3', '#FF6692', '#B6E880', '#FF97FF', '#FECB52', 
          '#FF8C00', '#E5E500', '#E500E5', '#00E5E5', '#E57C00'] 

# One color per slice; the palette repeats when there are more slices than
# colors, so indexing the 'Others' slice can never run past the end of the list.
slice_colors = [colors[i % len(colors)] for i in range(len(significant_keys))]

if 'Others' in significant_keys:
    others_index = significant_keys.index('Others')
    slice_colors[others_index] = '#FF00FF'

fig = go.Figure(data=[go.Pie(
    labels = significant_keys, 
    values = significant_percentages, 
    pull = pull_values, 
    hole = 0.3,
    marker = dict(colors = slice_colors, line = dict(color = '#000000', width = 2)),
    hoverinfo = 'label+percent+value',
    textinfo = 'label+percent',
    textposition = 'outside'
)])

fig.update_layout(
    template = 'plotly_dark',
    title_text = 'Portfolio Asset Allocation',
    title_x = 0.5,
    title_y = 0.95,
    title_font = dict(size=24, color='white'),
    width = 1300,
    height = 600,
    margin = dict(l=50, r=50, t=50, b=50), 
    showlegend=False, 
)

fig.show()