In [None]:
#installs necessary libraries
!pip install openpyxl
!pip install pandas
!pip install yfinance
!pip install matplotlib
!pip install scipy
!pip install scikit-learn
!pip install PyPortfolioOpt
!pip install cvxpy

sector_allocation = {
    ''
}


In [None]:
import pandas as pd
import yfinance as yf
from datetime import datetime, timedelta
import pathlib
import numpy as np
import matplotlib.pyplot as plt
import scipy.optimize as sci_opt

from pprint import pprint
from sklearn.preprocessing import StandardScaler

# Set some display options for Pandas.
%config InlineBackend.figure_format ='retina'
# Apply the pandas display preferences in one pass: no truncation of cell
# text, no wrapping of wide frames across lines, and no cap on shown columns.
for _option_name, _option_value in (
    ('display.max_colwidth', None),
    ('expand_frame_repr', False),
    ('display.max_columns', None),
):
    pd.set_option(_option_name, _option_value)



## Fetching S&P 500 Companies:
# Number of calendar days of price history requested further down.
LOOKBACK_DAYS = 90

# The first table on the page is expected to be the constituent list (it must
# provide the 'Symbol' and 'GICS Sector' columns used below).
sp500_companies = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')[0]

# Share-class tickers are written with a dot on Wikipedia (BRK.B, BF.B) but
# with a dash when downloaded through yfinance (BRK-B, BF-B). Converting every
# dot covers any share-class ticker, not just the two known today.
sp500_companies['Symbol'] = sp500_companies['Symbol'].str.replace('.', '-', regex=False)

# Only a missing ticker or sector makes a row unusable; a gap in an unrelated
# column should not remove a company.
sp500_companies = sp500_companies.dropna(subset=['Symbol', 'GICS Sector'])

## Getting Unique Sectors and Dates:
unique_sectors = sp500_companies['GICS Sector'].unique()
number_of_sectors = len(unique_sectors)

# Take "today" once so both ends of the window come from the same instant.
_today = datetime.today()
end_date = _today.strftime('%Y-%m-%d')
start_date = (_today - timedelta(days=LOOKBACK_DAYS)).strftime('%Y-%m-%d')

unique_sectors_df = pd.DataFrame(unique_sectors)
print(unique_sectors_df)

## Fetching Historical Prices and Calculating Averages:
## For each sector, download the adjusted closing prices of its companies and
## average them across companies for every trading day.
all_avg_prices = []
all_dates = []
all_sectors = []
for sector in unique_sectors:
    companies = sp500_companies.loc[sp500_companies['GICS Sector'] == sector, 'Symbol'].tolist()
    # Request unadjusted OHLC explicitly: when `auto_adjust` is True (the
    # default in newer yfinance releases) the 'Adj Close' column is not
    # returned and the lookup below raises a KeyError.
    prices = yf.download(companies, start=start_date, end=end_date, auto_adjust=False)['Adj Close']
    avg_prices = prices.mean(axis=1)
    # Collect the data in long form: one (date, average, sector) triple per row.
    all_avg_prices.extend(avg_prices.tolist())
    all_dates.extend(timestamp.date() for timestamp in prices.index)
    all_sectors.extend([sector] * len(avg_prices))

# Long-form frame: one row per (date, sector).
sp500_sectors_avg_prices = pd.DataFrame({
    'Date': all_dates,
    'Avg_Close': all_avg_prices,
    'Sector': all_sectors
})
print(sp500_sectors_avg_prices.head())

# Wide form: one row per date, one column per sector. The explicit sort
# guarantees chronological order for the return calculation that follows.
sp500_sectors_avg_prices = sp500_sectors_avg_prices.pivot(
    index='Date',
    columns='Sector',
    values='Avg_Close'
).sort_index()
print(sp500_sectors_avg_prices.head())

# Keep only the dates on which every sector has an average price.
sp500_sectors_avg_prices = sp500_sectors_avg_prices.dropna()
print(sp500_sectors_avg_prices.isnull().sum())
sp500_sectors_avg_prices.to_excel('sp500_sectors_avg_prices.xlsx', index=True)

# Calculate the daily log returns per sector. Rows are listed newest first,
# as before; row order has no effect on the mean and covariance used later.
log_return = np.log(1 + sp500_sectors_avg_prices.pct_change().iloc[::-1])

# Drop only the undefined rows (the earliest date has no previous price).
# Negative returns are real observations and must stay in the sample:
# keeping just the days on which every sector rose would bias both the
# expected returns and the covariance matrix.
log_return = log_return.dropna()

# Fix the seed so the randomly drawn weights are reproducible across runs.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Generate Random Weights, one per sector.
random_weights = np.random.random(number_of_sectors)

# Generate the Rebalance Weights, these should sum to 1.
rebalance_weights = random_weights / np.sum(random_weights)

print('Log Returns:')
print(log_return.head())
# Sanity check: number of zero prices per sector.
print((sp500_sectors_avg_prices == 0).sum())
log_return.to_excel('log_return.xlsx', index=True)

# Share of all (date, sector) observations with a negative log return.
percentage_negative_returns = (log_return < 0).mean().mean() * 100

print(f"Percentage of negative log returns: {percentage_negative_returns:.2f}%")


# Number of trading days used to annualize daily statistics.
TRADING_DAYS_PER_YEAR = 252

# Risk-free rate; treated as an annual figure (1%) -- TODO confirm.
risk_free_rate = .01

# Expected annual portfolio return: weighted mean daily log return, annualized.
# The risk-free rate is NOT subtracted here; subtracting an annual rate from a
# daily mean and then multiplying by 252 would remove it 252 times over.
exp_ret = np.dot(log_return.mean().to_numpy(), rebalance_weights) * TRADING_DAYS_PER_YEAR

# Expected annual portfolio volatility: sqrt(w' * (annualized covariance) * w).
_annual_cov = log_return.cov().to_numpy() * TRADING_DAYS_PER_YEAR
exp_vol = np.sqrt(rebalance_weights @ _annual_cov @ rebalance_weights)

# Sharpe Ratio: annual excess return over the risk-free rate per unit of risk.
sharpe_ratio = (exp_ret - risk_free_rate) / exp_vol

# Put the weights into a data frame to see them better.
weights_df = pd.DataFrame(data={
    'random_weights': random_weights,
    'rebalance_weights': rebalance_weights
})
print('')
print('='*80)
print('PORTFOLIO WEIGHTS:')
print('-'*80)
print(weights_df)
print('-'*80)

# Do the same with the other metrics.
metrics_df = pd.DataFrame(data={
    'Expected Portfolio Returns': exp_ret,
    'Expected Portfolio Volatility': exp_vol,
    'Portfolio Sharpe Ratio': sharpe_ratio
}, index=[0])

print('')
print('='*80)
print('PORTFOLIO METRICS:')
print('-'*80)
print(metrics_df)
print('-'*80)

# Monte Carlo simulation: draw many random long-only portfolios and record the
# annualized return, volatility and Sharpe Ratio of each.

# Number of random portfolios to simulate.
num_of_portfolios = 20000

# Number of trading days used to annualize daily statistics.
TRADING_DAYS_PER_YEAR = 252

# Fix the seed so the simulation is reproducible across runs.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# These do not depend on the weights, so compute them once rather than once
# per simulated portfolio.
annual_mean_returns = log_return.mean().to_numpy() * TRADING_DAYS_PER_YEAR
annual_cov = log_return.cov().to_numpy() * TRADING_DAYS_PER_YEAR

# One row of weights per portfolio, one column per sector; each row is
# normalized so its weights sum to 1.
all_weights = np.random.random((num_of_portfolios, number_of_sectors))
all_weights = all_weights / all_weights.sum(axis=1, keepdims=True)

# Expected annual return of every portfolio (weights . mean returns).
ret_arr = all_weights @ annual_mean_returns

# Expected annual volatility of every portfolio: sqrt(w' * cov * w), row by row.
vol_arr = np.sqrt(((all_weights @ annual_cov) * all_weights).sum(axis=1))

# Sharpe Ratio of every portfolio. The risk-free rate (treated as annual) is
# subtracted from the annualized return, not from the daily mean.
sharpe_arr = (ret_arr - risk_free_rate) / vol_arr

# Build the "Master Data Frame": one row per simulated portfolio holding its
# return, volatility, Sharpe Ratio and weight vector. Building it column by
# column gives the numeric columns a float dtype directly and avoids creating
# (and transposing) a 4 x num_of_portfolios object-dtype frame.
simulations_df = pd.DataFrame({
    'Returns': ret_arr,
    'Volatility': vol_arr,
    'Sharpe Ratio': sharpe_arr,
    # One weight array per row (object column).
    'Portfolio Weights': list(all_weights),
})

# Horizontal rules used by the text report below.
_heavy_rule = '=' * 80
_light_rule = '-' * 80

# Print out the results.
print('', _heavy_rule, 'SIMULATIONS RESULT:', _light_rule, sep='\n')

# Print PORTFOLIO WEIGHTS
print(_heavy_rule, 'PORTFOLIO WEIGHTS:', _light_rule, sep='\n')
print(simulations_df['Portfolio Weights'].head())
print(_light_rule)

# Print PORTFOLIO METRICS
print(_heavy_rule, 'PORTFOLIO METRICS:', _light_rule, sep='\n')
print(simulations_df[['Returns', 'Volatility', 'Sharpe Ratio']].head())
print(_light_rule)

# Row of the simulated portfolio with the highest Sharpe Ratio.
max_sharpe_ratio = simulations_df.loc[simulations_df['Sharpe Ratio'].idxmax()]

# Row of the simulated portfolio with the lowest volatility.
min_volatility = simulations_df.loc[simulations_df['Volatility'].idxmin()]

print('', _heavy_rule, 'MAX SHARPE RATIO:', _light_rule, sep='\n')
print(max_sharpe_ratio)
print(_light_rule)

print('', _heavy_rule, 'MIN VOLATILITY:', _light_rule, sep='\n')
print(min_volatility)
print(_light_rule)
#### Machine learning time
#We can further improve these portfolios by generating future data via machine learning.
simulations_df

In [None]:
# Disabled draft of a random-forest experiment. Everything below sits inside a
# single triple-quoted string, so none of it executes; the cell merely
# evaluates to that string (which the notebook may echo as the cell's output).
#
# NOTE(review): before re-enabling, check the following --
#   * 'Portfolio Weights' appears to hold one array per row, so `.unique()` on
#     that column presumably fails on unhashable values -- TODO confirm.
#   * `np.concatenate` over that column looks like it flattens all weights into
#     one 1-D array, so `y` would not have one row per row of `X` and
#     `train_test_split` would presumably reject the mismatch -- TODO confirm.
"""
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

print("Unique values in 'Portfolio Weights' column:")
print(simulations_df["Portfolio Weights"].unique())

simulations_df["Portfolio Weights"] = simulations_df["Portfolio Weights"].apply(lambda x: np.concatenate(x) if isinstance(x, list) else x)
X = simulations_df[["Volatility", "Sharpe Ratio"]]
y = np.concatenate(simulations_df["Portfolio Weights"].to_numpy())

print("Shape of X:", X.shape)
print("Shape of y:", y.shape)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)
"""