In [4]:
# Import libraries

from pathlib import Path
import os
import requests
import pandas as pd
from dotenv import load_dotenv
import alpaca_trade_api as tradeapi
import datetime as dt

%matplotlib inline

In [None]:
# Load the S&P 500 constituents file into a pandas DataFrame

sp_csv_path = Path("sp500.csv")
sp_csv = pd.read_csv(sp_csv_path)
sp_csv.head()

In [None]:
# Pull out the ticker symbols that will be sent to the Alpaca API

sp_tickers = sp_csv["Symbol"]
sp_tickers.head()

In [None]:
# Put tickers into groups in line with the Alpaca API call constraints
# (this notebook sends at most 100 symbols per request).
# The symbols are read from sp_csv rather than from sp_tickers itself, so the
# cell can be re-run safely even though it rebinds sp_tickers to a dict.

sp_symbols = list(sp_csv.loc[:, "Symbol"])

sp_tickers = {
    "tickers_1": sp_symbols[:100],
    "tickers_2": sp_symbols[100:200],
    "tickers_3": sp_symbols[200:300],  # was [100:200], which duplicated tickers_2
    "tickers_4": sp_symbols[300:400],
    "tickers_5": sp_symbols[400:500],
    "tickers_6": sp_symbols[500:],
}

In [None]:
# Load the env file so the Alpaca credentials can be read with os.getenv
# in the following cell

load_dotenv()

In [None]:
# Read the Alpaca API key and secret from the environment
# (each value is None when the variable is not set)
alpaca_api_key, alpaca_secret_key = (
    os.getenv(variable_name)
    for variable_name in ("ALPACA_API_KEY", "ALPACA_SECRET_KEY")
)

In [7]:
# Create the Alpaca API object from the key and secret read from the
# environment; api_version="v2" is passed through to the client
alpaca = tradeapi.REST(
    alpaca_api_key,
    alpaca_secret_key,
    api_version="v2")

In [8]:
# Express the start and end of the query window as ISO-format strings
# localized to New York time
start, end = (
    pd.Timestamp(day, tz="America/New_York").isoformat()
    for day in ("2012-01-01", "2017-01-01")
)

In [9]:
# Set timeframe to one day ('1D') for the Alpaca API
timeframe = "1D"

In [10]:
# Request historical bars for the first group of tickers and keep the
# response as a DataFrame
first_batch_bars = alpaca.get_barset(
    sp_tickers["tickers_1"], timeframe, start=start, end=end
)
sp_df_1 = first_batch_bars.df

In [11]:
# Define function to remove unnecessary metrics, and keep historical close prices and ticker names ONLY
# The input frame is not modified, so the function can be called repeatedly on the same df

def clean_alpaca_df(df):
    """Return a copy of ``df`` reduced to one column per ticker.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with two-level columns whose second level (level 1) holds the
        metric names "open", "high", "low", "volume" and the metric to keep.

    Returns
    -------
    pandas.DataFrame
        New frame without the "open", "high", "low" and "volume" columns and
        with the metric level removed from the column index.

    Raises
    ------
    KeyError
        If any of the four dropped metric names is absent from level 1.
    """
    unwanted_metrics = ["open", "high", "low", "volume"]
    # drop() without inplace=True returns a new frame and leaves `df` untouched
    remaining = df.drop(columns=unwanted_metrics, level=1)
    return remaining.droplevel(level=1, axis=1)

In [12]:
# Save cleaned df into df that was passed into the function (save the changes, in particular the dropping of the 'close' level)

sp_df_1 = clean_alpaca_df(sp_df_1)

In [13]:
sp_df_1.head()

Unnamed: 0,A,AAL,AAP,AAPL,ABBV,ABC,ABMD,ABT,ACN,ADBE,...,CPB,CTL,GOOG,GOOGL,KMX,LNT,MMM,MO,SCHW,T
2012-01-03 05:00:00+00:00,36.48,,69.11,8.391,,38.08,18.22,27.209,52.96,28.61,...,32.97,37.32,332.65,332.983,30.52,21.75,83.46,28.55,11.71,30.38
2012-01-04 05:00:00+00:00,36.18,,70.69,8.437,,38.2,18.1,27.099,52.93,28.28,...,32.72,22.18,334.13,334.469,30.72,21.62,84.18,28.39,11.74,30.46
2012-01-05 05:00:00+00:00,37.01,,71.2,8.53,,38.48,18.27,27.042,51.94,28.48,...,31.83,36.79,329.5,329.829,30.92,21.825,83.79,28.84,11.93,30.4
2012-01-06 05:00:00+00:00,37.43,,71.52,8.621,,38.59,18.15,26.802,51.82,28.75,...,31.45,22.14,325.0,325.325,30.3,21.7,83.43,28.72,12.04,29.68
2012-01-09 05:00:00+00:00,38.37,,71.46,8.606,,38.58,18.2,26.792,51.72,28.51,...,31.83,22.03,311.27,311.684,30.07,21.665,83.86,28.506,12.12,29.66


In [14]:
# Request the remaining ticker groups (2 through 6) from Alpaca,
# one request per group, in the same order as before

sp_df_2, sp_df_3, sp_df_4, sp_df_5, sp_df_6 = (
    alpaca.get_barset(
        sp_tickers[f"tickers_{group_number}"],
        timeframe,
        start=start,
        end=end,
    ).df
    for group_number in range(2, 7)
)

In [15]:
# Run the cleaning function over the remaining groups, in order
sp_df_2, sp_df_3, sp_df_4, sp_df_5, sp_df_6 = map(
    clean_alpaca_df,
    (sp_df_2, sp_df_3, sp_df_4, sp_df_5, sp_df_6),
)

In [16]:
sp_df_2.head()

Unnamed: 0,C,CAG,CB,CCI,CFG,CHD,CI,CINF,CL,CMA,...,FRC,FRT,FTNT,FTV,GLW,KO,RE,STZ,XOM,XRAY
2012-01-03 05:00:00+00:00,27.27,26.28,70.15,44.91,,22.7,43.23,30.59,22.765,26.71,...,30.81,90.75,20.77,,13.03,35.06,85.13,20.76,85.94,34.17
2012-01-04 05:00:00+00:00,27.11,26.5,68.91,45.01,,10.065,43.23,30.46,45.3,26.8,...,31.27,89.25,20.1,,13.16,34.845,83.35,20.45,86.05,34.16
2012-01-05 05:00:00+00:00,28.52,26.5,69.74,44.97,,22.575,43.07,30.87,22.535,27.38,...,31.19,90.15,20.2,,13.3,34.685,83.95,19.72,85.73,34.19
2012-01-06 05:00:00+00:00,28.56,26.41,69.29,45.05,,22.935,43.61,30.68,22.45,27.92,...,30.99,89.95,20.41,,13.51,34.465,84.0028,19.65,85.21,33.98
2012-01-09 05:00:00+00:00,29.07,26.53,69.6,45.64,,22.975,43.7599,30.94,44.875,28.36,...,31.26,89.54,20.16,,13.73,34.45,84.97,20.02,85.47,33.82


In [17]:
# Place the six per-group frames side by side (column-wise)
sp_group_frames = [sp_df_1, sp_df_2, sp_df_3, sp_df_4, sp_df_5, sp_df_6]
sp_combined_df = pd.concat(sp_group_frames, axis="columns")

In [20]:
sp_combined_df.shape

(1258, 505)

In [21]:
sp_combined_df.head()

Unnamed: 0,A,AAL,AAP,AAPL,ABBV,ABC,ABMD,ABT,ACN,ADBE,...,WYNN,XEL,XLNX,XRX,XYL,YUM,ZBH,ZBRA,ZION,ZTS
2012-01-03 05:00:00+00:00,36.48,,69.11,8.391,,38.08,18.22,27.209,52.96,28.61,...,113.97,27.33,32.44,32.76,25.1,58.55,53.92,35.72,16.8,
2012-01-04 05:00:00+00:00,36.18,,70.69,8.437,,38.2,18.1,27.099,52.93,28.28,...,112.01,27.17,32.095,32.6,25.93,58.99,53.21,35.45,17.15,
2012-01-05 05:00:00+00:00,37.01,,71.2,8.53,,38.48,18.27,27.042,51.94,28.48,...,110.85,27.26,32.37,32.4,25.6,59.41,53.85,35.4,17.62,
2012-01-06 05:00:00+00:00,37.43,,71.52,8.621,,38.59,18.15,26.802,51.82,28.75,...,106.94,27.19,32.16,32.36,25.17,59.88,53.97,35.11,17.63,
2012-01-09 05:00:00+00:00,38.37,,71.46,8.606,,38.58,18.2,26.792,51.72,28.51,...,106.64,27.21,32.87,32.32,25.25,59.6,54.35,34.89,17.87,


In [24]:
# Save the combined frame. The index column is labelled "Date" explicitly
# because the file is read back later with index_col="Date".
sp_combined_df.to_csv("sp500_combined.csv", index_label="Date")

In [25]:
# Re-index the frame on its own index converted to datetimes
combined_datetime_index = pd.to_datetime(
    sp_combined_df.index, infer_datetime_format=True
)
sp_combined_df = sp_combined_df.set_index(combined_datetime_index)

In [26]:
sp_combined_df.head()

Unnamed: 0,A,AAL,AAP,AAPL,ABBV,ABC,ABMD,ABT,ACN,ADBE,...,WYNN,XEL,XLNX,XRX,XYL,YUM,ZBH,ZBRA,ZION,ZTS
2012-01-03 05:00:00+00:00,36.48,,69.11,8.391,,38.08,18.22,27.209,52.96,28.61,...,113.97,27.33,32.44,32.76,25.1,58.55,53.92,35.72,16.8,
2012-01-04 05:00:00+00:00,36.18,,70.69,8.437,,38.2,18.1,27.099,52.93,28.28,...,112.01,27.17,32.095,32.6,25.93,58.99,53.21,35.45,17.15,
2012-01-05 05:00:00+00:00,37.01,,71.2,8.53,,38.48,18.27,27.042,51.94,28.48,...,110.85,27.26,32.37,32.4,25.6,59.41,53.85,35.4,17.62,
2012-01-06 05:00:00+00:00,37.43,,71.52,8.621,,38.59,18.15,26.802,51.82,28.75,...,106.94,27.19,32.16,32.36,25.17,59.88,53.97,35.11,17.63,
2012-01-09 05:00:00+00:00,38.37,,71.46,8.606,,38.58,18.2,26.792,51.72,28.51,...,106.64,27.21,32.87,32.32,25.25,59.6,54.35,34.89,17.87,


In [5]:
# Reload the combined prices from disk, using the Date column as a parsed index
sp_path = Path("sp500_combined.csv")
sp_combined_df = pd.read_csv(
    sp_path,
    index_col="Date",
    parse_dates=True,
    infer_datetime_format=True,
)
sp_combined_df.head(5)

Unnamed: 0_level_0,A,AAL,AAP,AAPL,ABBV,ABC,ABMD,ABT,ACN,ADBE,...,WYNN,XEL,XLNX,XRX,XYL,YUM,ZBH,ZBRA,ZION,ZTS
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012-01-03 05:00:00+00:00,36.48,,69.11,8.391,,38.08,18.22,27.209,52.96,28.61,...,113.97,27.33,32.44,32.76,25.1,58.55,53.92,35.72,16.8,
2012-01-04 05:00:00+00:00,36.18,,70.69,8.437,,38.2,18.1,27.099,52.93,28.28,...,112.01,27.17,32.095,32.6,25.93,58.99,53.21,35.45,17.15,
2012-01-05 05:00:00+00:00,37.01,,71.2,8.53,,38.48,18.27,27.042,51.94,28.48,...,110.85,27.26,32.37,32.4,25.6,59.41,53.85,35.4,17.62,
2012-01-06 05:00:00+00:00,37.43,,71.52,8.621,,38.59,18.15,26.802,51.82,28.75,...,106.94,27.19,32.16,32.36,25.17,59.88,53.97,35.11,17.63,
2012-01-09 05:00:00+00:00,38.37,,71.46,8.606,,38.58,18.2,26.792,51.72,28.51,...,106.64,27.21,32.87,32.32,25.25,59.6,54.35,34.89,17.87,


In [6]:
# Check to confirm whether there are any duplicates in the index variable.
# DataFrame.duplicated() compares row values and ignores the index, so the
# index itself is tested here; the flags are kept as a Series keyed by date.
pd.Series(sp_combined_df.index.duplicated(), index=sp_combined_df.index)

Date
2012-01-03 05:00:00+00:00    False
2012-01-04 05:00:00+00:00    False
2012-01-05 05:00:00+00:00    False
2012-01-06 05:00:00+00:00    False
2012-01-09 05:00:00+00:00    False
                             ...  
2016-12-23 05:00:00+00:00    False
2016-12-27 05:00:00+00:00    False
2016-12-28 05:00:00+00:00    False
2016-12-29 05:00:00+00:00    False
2016-12-30 05:00:00+00:00    False
Length: 1258, dtype: bool

In [7]:
sp_combined_df.head(5)

Unnamed: 0_level_0,A,AAL,AAP,AAPL,ABBV,ABC,ABMD,ABT,ACN,ADBE,...,WYNN,XEL,XLNX,XRX,XYL,YUM,ZBH,ZBRA,ZION,ZTS
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012-01-03 05:00:00+00:00,36.48,,69.11,8.391,,38.08,18.22,27.209,52.96,28.61,...,113.97,27.33,32.44,32.76,25.1,58.55,53.92,35.72,16.8,
2012-01-04 05:00:00+00:00,36.18,,70.69,8.437,,38.2,18.1,27.099,52.93,28.28,...,112.01,27.17,32.095,32.6,25.93,58.99,53.21,35.45,17.15,
2012-01-05 05:00:00+00:00,37.01,,71.2,8.53,,38.48,18.27,27.042,51.94,28.48,...,110.85,27.26,32.37,32.4,25.6,59.41,53.85,35.4,17.62,
2012-01-06 05:00:00+00:00,37.43,,71.52,8.621,,38.59,18.15,26.802,51.82,28.75,...,106.94,27.19,32.16,32.36,25.17,59.88,53.97,35.11,17.63,
2012-01-09 05:00:00+00:00,38.37,,71.46,8.606,,38.58,18.2,26.792,51.72,28.51,...,106.64,27.21,32.87,32.32,25.25,59.6,54.35,34.89,17.87,


In [8]:
# Write the frame out under a second file name. Nothing has been reassigned
# since it was loaded from sp500_combined.csv, so both files should hold the
# same data.
sp_combined_df.to_csv("sp500_clean.csv")

In [9]:
# Load the saved stock prices, using the Date column as a parsed index
sp_clean_path = Path("sp500_clean.csv")
sp_csv_clean = pd.read_csv(
    sp_clean_path,
    index_col="Date",
    parse_dates=True,
    infer_datetime_format=True,
)
sp_csv_clean.head()

Unnamed: 0_level_0,A,AAL,AAP,AAPL,ABBV,ABC,ABMD,ABT,ACN,ADBE,...,WYNN,XEL,XLNX,XRX,XYL,YUM,ZBH,ZBRA,ZION,ZTS
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012-01-03 05:00:00+00:00,36.48,,69.11,8.391,,38.08,18.22,27.209,52.96,28.61,...,113.97,27.33,32.44,32.76,25.1,58.55,53.92,35.72,16.8,
2012-01-04 05:00:00+00:00,36.18,,70.69,8.437,,38.2,18.1,27.099,52.93,28.28,...,112.01,27.17,32.095,32.6,25.93,58.99,53.21,35.45,17.15,
2012-01-05 05:00:00+00:00,37.01,,71.2,8.53,,38.48,18.27,27.042,51.94,28.48,...,110.85,27.26,32.37,32.4,25.6,59.41,53.85,35.4,17.62,
2012-01-06 05:00:00+00:00,37.43,,71.52,8.621,,38.59,18.15,26.802,51.82,28.75,...,106.94,27.19,32.16,32.36,25.17,59.88,53.97,35.11,17.63,
2012-01-09 05:00:00+00:00,38.37,,71.46,8.606,,38.58,18.2,26.792,51.72,28.51,...,106.64,27.21,32.87,32.32,25.25,59.6,54.35,34.89,17.87,


In [10]:
# Load the S&P 500 index history, indexed by parsed Date and sorted by date.
# NOTE(review): the Price column shows up as object dtype further down (the
# SP500 entry of df_all.dtypes). Presumably the file writes numbers with a
# comma thousands separator (e.g. "1,277.06") - confirm against the CSV.
# thousands="," lets pandas parse such values as floats.
sp_hist_data = pd.read_csv(
    Path("S&P 500 Historical Data.csv"),
    index_col="Date",
    parse_dates=True,
    infer_datetime_format=True,
    thousands=",",
).sort_index()


In [11]:
sp_hist_data

Unnamed: 0_level_0,Price,Open,High,Low,Vol.,Change %
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2012-01-03,1277.06,1258.86,1284.62,1258.86,-,1.55%
2012-01-04,1277.30,1277.03,1278.73,1268.10,-,0.02%
2012-01-05,1281.06,1277.30,1283.05,1265.26,-,0.29%
2012-01-06,1277.81,1280.93,1281.84,1273.34,-,-0.25%
2012-01-09,1280.70,1277.83,1281.99,1274.55,-,0.23%
...,...,...,...,...,...,...
2016-12-23,2263.79,2260.25,2263.79,2258.84,-,0.13%
2016-12-27,2268.88,2266.23,2273.82,2266.15,-,0.22%
2016-12-28,2249.92,2270.23,2271.31,2249.11,-,-0.84%
2016-12-29,2249.26,2249.50,2254.51,2244.56,-,-0.03%


In [12]:
# Discard every column of the index history except Price
unused_index_columns = ["Open", "High", "Low", "Vol.", "Change %"]
df_with_sp = sp_hist_data.drop(columns=unused_index_columns)

In [13]:
sp_index=df_with_sp.reset_index()

In [14]:
sp_index

Unnamed: 0,Date,Price
0,2012-01-03,1277.06
1,2012-01-04,1277.30
2,2012-01-05,1281.06
3,2012-01-06,1277.81
4,2012-01-09,1280.70
...,...,...
1253,2016-12-23,2263.79
1254,2016-12-27,2268.88
1255,2016-12-28,2249.92
1256,2016-12-29,2249.26


In [15]:
# Replace the Date index with a plain 0..n-1 positional index, discarding the dates
df_index = sp_csv_clean.reset_index(drop=True)

In [16]:
df_index

Unnamed: 0,A,AAL,AAP,AAPL,ABBV,ABC,ABMD,ABT,ACN,ADBE,...,WYNN,XEL,XLNX,XRX,XYL,YUM,ZBH,ZBRA,ZION,ZTS
0,36.48,,69.11,8.391,,38.08,18.22,27.209,52.96,28.61,...,113.97,27.33,32.440,32.76,25.10,58.55,53.92,35.72,16.80,
1,36.18,,70.69,8.437,,38.20,18.10,27.099,52.93,28.28,...,112.01,27.17,32.095,32.60,25.93,58.99,53.21,35.45,17.15,
2,37.01,,71.20,8.530,,38.48,18.27,27.042,51.94,28.48,...,110.85,27.26,32.370,32.40,25.60,59.41,53.85,35.40,17.62,
3,37.43,,71.52,8.621,,38.59,18.15,26.802,51.82,28.75,...,106.94,27.19,32.160,32.36,25.17,59.88,53.97,35.11,17.63,
4,38.37,,71.46,8.606,,38.58,18.20,26.792,51.72,28.51,...,106.64,27.21,32.870,32.32,25.25,59.60,54.35,34.89,17.87,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1253,46.19,48.48,170.88,116.510,59.10,78.56,112.27,38.410,117.48,105.02,...,88.36,40.66,60.550,35.80,51.00,63.81,102.32,86.41,43.86,53.77
1254,46.50,48.09,171.82,117.250,59.26,78.83,113.70,38.600,117.57,104.98,...,88.24,40.72,61.240,35.74,50.86,64.04,103.14,87.04,43.94,53.72
1255,45.72,47.67,170.45,116.750,62.28,78.33,112.78,38.210,116.62,103.77,...,87.08,40.30,60.630,35.02,49.94,63.73,103.06,85.62,43.28,53.44
1256,45.65,46.75,170.28,116.730,59.47,79.34,113.66,38.330,117.01,103.69,...,87.17,40.95,60.740,34.88,49.76,63.64,103.50,85.98,42.79,53.62


In [17]:
# Put the index prices and the stock prices side by side.
# Both inputs carry a positional 0..n-1 index at this point, so rows are
# paired by position rather than by date.
# NOTE(review): this assumes both frames list the same trading days in the
# same order - confirm.
df_all = pd.concat(
    [sp_index, df_index],
    axis="columns",
    join="inner",
)

In [18]:
df_all=df_all.set_index("Date")

In [19]:
df_all=df_all.rename(columns={"Price":"SP500"})

In [20]:
df_all.isnull().mean() *100

SP500     0.00000
A         0.00000
AAL      38.63275
AAP       0.00000
AAPL      0.00000
           ...   
YUM       0.00000
ZBH       0.00000
ZBRA      0.00000
ZION      0.00000
ZTS      21.54213
Length: 506, dtype: float64

In [21]:
df_all.isnull().sum()

SP500      0
A          0
AAL      486
AAP        0
AAPL       0
        ... 
YUM        0
ZBH        0
ZBRA       0
ZION       0
ZTS      271
Length: 506, dtype: int64

In [22]:
# Exploring data frame in preparation for cleaning of data
df_all.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1258 entries, 2012-01-03 to 2016-12-30
Columns: 506 entries, SP500 to ZTS
dtypes: float64(505), object(1)
memory usage: 4.9+ MB


In [23]:
# Checking the data type of each column (price columns are expected to be float64)
df_all.dtypes

SP500     object
A        float64
AAL      float64
AAP      float64
AAPL     float64
          ...   
YUM      float64
ZBH      float64
ZBRA     float64
ZION     float64
ZTS      float64
Length: 506, dtype: object

In [24]:
# Confirming maximum amount of data, i.e. number of trading days
# that will be included in the period is 1258
df_all.count()

SP500    1258
A        1258
AAL       772
AAP      1258
AAPL     1258
         ... 
YUM      1258
ZBH      1258
ZBRA     1258
ZION     1258
ZTS       987
Length: 506, dtype: int64

In [25]:
# Checking for null values in data set (anticipate there will be some)
# with stocks that were not present in the S&P 500 for the full time

df_all.isnull()

Unnamed: 0_level_0,SP500,A,AAL,AAP,AAPL,ABBV,ABC,ABMD,ABT,ACN,...,WYNN,XEL,XLNX,XRX,XYL,YUM,ZBH,ZBRA,ZION,ZTS
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012-01-03,False,False,True,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
2012-01-04,False,False,True,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
2012-01-05,False,False,True,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
2012-01-06,False,False,True,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
2012-01-09,False,False,True,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2016-12-23,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2016-12-27,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2016-12-28,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2016-12-29,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [26]:
df_all.isnull().mean() *100

SP500     0.00000
A         0.00000
AAL      38.63275
AAP       0.00000
AAPL      0.00000
           ...   
YUM       0.00000
ZBH       0.00000
ZBRA      0.00000
ZION      0.00000
ZTS      21.54213
Length: 506, dtype: float64

In [27]:
# Cleaning data - every column that contains at least one NaN value
# is dropped, so only columns with no missing prices remain

df_all_no_null = df_all.dropna(axis="columns")

In [28]:
# Confirm cleanse of data worked by re-running check for null values
df_all_no_null.isnull().sum()

SP500    0
A        0
AAP      0
AAPL     0
ABC      0
        ..
XYL      0
YUM      0
ZBH      0
ZBRA     0
ZION     0
Length: 454, dtype: int64

In [29]:
df_all_no_null.head(10)

Unnamed: 0_level_0,SP500,A,AAP,AAPL,ABC,ABT,ACN,ADBE,ADI,ADM,...,WY,WYNN,XEL,XLNX,XRX,XYL,YUM,ZBH,ZBRA,ZION
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012-01-03,1277.06,36.48,69.11,8.391,38.08,27.209,52.96,28.61,36.07,24.41,...,19.11,113.97,27.33,32.44,32.76,25.1,58.55,53.92,35.72,16.8
2012-01-04,1277.3,36.18,70.69,8.437,38.2,27.099,52.93,28.28,35.97,24.81,...,18.89,112.01,27.17,32.095,32.6,25.93,58.99,53.21,35.45,17.15
2012-01-05,1281.06,37.01,71.2,8.53,38.48,27.042,51.94,28.48,36.15,29.25,...,18.8,110.85,27.26,32.37,32.4,25.6,59.41,53.85,35.4,17.62
2012-01-06,1277.81,37.43,71.52,8.621,38.59,26.802,51.82,28.75,35.85,24.56,...,19.01,106.94,27.19,32.16,32.36,25.17,59.88,53.97,35.11,17.63
2012-01-09,1280.7,38.37,71.46,8.606,38.58,26.792,51.72,28.51,36.59,24.42,...,18.8,106.64,27.21,32.87,32.32,25.25,59.6,54.35,34.89,17.87
2012-01-10,1292.08,39.09,71.54,8.637,39.01,11.199,52.66,29.19,36.75,24.31,...,19.19,107.53,27.44,32.93,32.44,25.59,59.98,55.52,35.29,18.18
2012-01-11,1292.48,38.91,72.03,8.623,38.62,26.6,53.63,29.33,36.93,24.44,...,19.71,111.96,27.1,32.85,32.7,26.07,60.91,55.29,35.57,18.49
2012-01-12,1295.5,39.5,71.64,8.597,38.85,26.461,54.05,29.21,37.3,24.62,...,19.86,109.82,26.69,33.45,32.52,25.695,60.88,55.05,34.96,18.65
2012-01-13,1289.09,39.09,71.48,8.569,37.8,26.61,53.28,28.97,36.61,29.15,...,20.24,107.83,26.72,33.0,32.32,25.63,61.23,55.05,34.97,18.52
2012-01-17,1293.67,39.99,72.1,60.671,38.28,26.734,53.48,29.35,37.02,24.31,...,20.28,111.9,26.68,33.34,32.32,25.63,61.88,55.17,35.52,18.42


In [30]:
df_all_no_null.to_csv("final_clean_df.csv")

In [31]:
# Creating portfolio that is just the S&P 500 index (price levels, not returns).
# The column is selected by name so the result does not depend on SP500
# being the first column of the frame.
portfolio_sp500 = df_all_no_null[["SP500"]]
portfolio_sp500.head(5)

Unnamed: 0_level_0,SP500
Date,Unnamed: 1_level_1
2012-01-03,1277.06
2012-01-04,1277.3
2012-01-05,1281.06
2012-01-06,1277.81
2012-01-09,1280.7


In [None]:
# Creating portfolio that is made up of the 50 highest volatility stocks.
# df_all_no_null holds one price column per ticker and has no 'Volatility'
# column, so volatility is computed here as the standard deviation of each
# stock's daily percentage change. The SP500 benchmark column is excluded.
stock_prices = df_all_no_null.drop(columns=["SP500"])
stock_volatility = stock_prices.pct_change().std()
portfolio_high_vol = stock_prices[stock_volatility.nlargest(50).index]
portfolio_high_vol.head(5)

In [None]:
# Creating portfolio that is made up of the 50 lowest volatility stocks.
# df_all_no_null holds one price column per ticker and has no 'Volatility'
# column, so volatility is computed here as the standard deviation of each
# stock's daily percentage change. The SP500 benchmark column is excluded.
stock_prices = df_all_no_null.drop(columns=["SP500"])
stock_volatility = stock_prices.pct_change().std()
portfolio_low_vol = stock_prices[stock_volatility.nsmallest(50).index]
portfolio_low_vol.head(5)

In [32]:
# Draw 20 distinct random column positions from df_all_no_null, used to build
# a dataframe of random stocks.
# Position 0 is the SP500 benchmark column, so sampling starts at 1; the upper
# bound is the frame's actual column count rather than a hard-coded number.
# A dedicated seeded generator makes the draw reproducible on re-run.
import random

random_num_list = random.Random(42).sample(range(1, df_all_no_null.shape[1]), 20)
print(random_num_list)

[177, 37, 327, 343, 148, 384, 344, 54, 379, 273, 219, 316, 111, 5, 7, 66, 257, 225, 113, 387]


In [None]:
# Build the random-stock portfolio: select the columns of df_all_no_null
# at the positions drawn in random_num_list (all dates are kept).
portfolio_random = df_all_no_null.iloc[:, random_num_list]