In [None]:
# This is a function to generate response variable dataframe
# Decision rule: 
# If the change in "High" in 5 days is greater than 3%, we give it label "buy"
# If the change in "High" in 5 days is smaller than -3%, we give it label "sell"
# Otherwise, we give it label "hold
def create_response(start_date,end_date,stock_name):
    import pandas as pd
    import numpy as np
    import yfinance as yf
    # get stock info
    data = yf.download(stock_name, start=start_date, end=end_date,interval='1h')
    # create percentage dict
    percent_dict = {}
    for i in range(len(data)-5):
        cur_high = data.iloc[i,:]['High']
        future_high = data.iloc[i+5,:]['High']
        percent_dict[data.index[i]] = (future_high -cur_high)/cur_high
    response_df = pd.DataFrame.from_dict(percent_dict, orient='index',columns=['percent_change'])
    # Decsion rule
    bins = [-(np.inf), -0.03, 0.03, np.inf]
    names = ['sell', 'hold', 'buy']
    response_df['action'] = pd.cut(response_df['percent_change'], bins, labels=names)
    response_df = response_df.reset_index()
    return response_df

In [1]:
def download_stock_data(stock,lookforward,end_date):
    import pandas as pd
    import numpy as np
    import yfinance as yf
    data = yf.download(stock, start="2020-10-02", 
                       end=end_date,interval='60m')
    data = data.reset_index()
    data['Datetime'] = data['Datetime'].astype('str')
    time = []
    for i, row in data.iterrows():
        time.append(row['Datetime'][:-6])
    data['Time']=time
    data['Time'] = pd.to_datetime(data['Time'])
    data.drop(['Datetime','High','Low','Adj Close', 'Volume'],axis=1,inplace=True)
    cols = ['Time','Open','Close']
    data=data[cols]
    data.set_index('Time')
    time_open = dict(zip(data.Time,data.Open))
    time_close = dict(zip(data.Time,data.Close))
    Time_complete = [data.Time[0]]
    start = data.Time[0]
    end = data.Time[len(data)-1]
    from datetime import datetime, timedelta
    while start < end:
        ele = start + timedelta(hours=1)
        Time_complete.append(ele)
        start = ele
    df_time = pd.DataFrame(Time_complete,columns=['Time'])
    weekday_lst = []
    for i,row in df_time.iterrows():
        cur = row['Time']
        weekday_lst.append(cur.weekday())
    df_time['weekday'] = weekday_lst
    price = []
    from datetime import datetime, timedelta
    for i,row in df_time.iterrows():
        cur = row['Time']
        day = row['weekday']
        if cur in time_open:
            price.append(time_open[cur])
        elif ("16:30:00" in str(cur) ) :
            cur -= timedelta(hours=1)
            if cur in time_close:
                price.append(time_close[cur])
            else:
                price.append(None)
        else:
            price.append(None)
    df_time['price'] = price
    df_time = df_time.fillna(method='ffill')
    time_price = dict(zip(df_time.Time,df_time.price))
    future_price = []
    for i, row in df_time.iterrows():
        key = row['Time']+timedelta(hours=lookforward)
        day = row['weekday']
#         if key.weekday()>4:
#             if key.weekday() == 5:
#                 key += timedelta(hours=48)
#             else:
#                 key += timedelta(hours=24)
        if key in time_price:
            future_price.append(time_price[key])
        else:
            future_price.append(None)
    df_time['future_price'] = future_price
    name = 'percent_change_' + str(lookforward)
    df_time[name] = ((df_time['future_price'] / df_time['price'])-1)*100
    return df_time

In [2]:
future_24 = download_stock_data("NVDA",24,"2021-06-01")
future_48 = download_stock_data("NVDA",48,"2021-06-01")
future_72 = download_stock_data("NVDA",72,"2021-06-01")
future_96 = download_stock_data("NVDA",96,"2021-06-01")

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


In [None]:
future_96

In [6]:
import pandas as pd

In [7]:
frame = [future_24.iloc[:,[0,1,2,4]],future_48.iloc[:,4],future_72.iloc[:,4],future_96.iloc[:,4]]
df = pd.concat(frame,axis=1)

In [8]:
df

Unnamed: 0,Time,weekday,price,percent_change_24,percent_change_48,percent_change_72,percent_change_96
0,2020-10-02 09:30:00,4,529.000000,-1.232518,-1.232518,0.083176,4.574671
1,2020-10-02 10:30:00,4,539.695007,-3.189770,-3.189770,0.462297,4.352456
2,2020-10-02 11:30:00,4,533.020020,-1.977419,-1.977419,1.378930,4.202462
3,2020-10-02 12:30:00,4,531.131287,-1.628845,-1.628845,2.396526,5.256460
4,2020-10-02 13:30:00,4,529.250000,-1.279172,-1.279172,2.638645,5.963155
...,...,...,...,...,...,...,...
4706,2021-04-16 11:30:00,4,637.080017,,,,
4707,2021-04-16 12:30:00,4,640.500000,,,,
4708,2021-04-16 13:30:00,4,638.159973,,,,
4709,2021-04-16 14:30:00,4,636.869995,,,,


In [17]:
df.iloc[:4614,:].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4614 entries, 0 to 4613
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   Time               4614 non-null   datetime64[ns]
 1   weekday            4614 non-null   int64         
 2   price              4614 non-null   float64       
 3   percent_change_24  4614 non-null   float64       
 4   percent_change_48  4614 non-null   float64       
 5   percent_change_72  4614 non-null   float64       
 6   percent_change_96  4614 non-null   float64       
dtypes: datetime64[ns](1), float64(5), int64(1)
memory usage: 252.5 KB


In [18]:
df = df.iloc[:4614,:]

In [19]:
import numpy as np
bins = [-(np.inf), -3, 3, np.inf]
names = ['sell', 'hold', 'buy']
df['action'] = pd.cut(df['percent_change_96'], bins, labels=names)
df = df.reset_index()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [20]:
df

Unnamed: 0,index,Time,weekday,price,percent_change_24,percent_change_48,percent_change_72,percent_change_96,action
0,0,2020-10-02 09:30:00,4,529.000000,-1.232518,-1.232518,0.083176,4.574671,buy
1,1,2020-10-02 10:30:00,4,539.695007,-3.189770,-3.189770,0.462297,4.352456,buy
2,2,2020-10-02 11:30:00,4,533.020020,-1.977419,-1.977419,1.378930,4.202462,buy
3,3,2020-10-02 12:30:00,4,531.131287,-1.628845,-1.628845,2.396526,5.256460,buy
4,4,2020-10-02 13:30:00,4,529.250000,-1.279172,-1.279172,2.638645,5.963155,buy
...,...,...,...,...,...,...,...,...,...
4609,4609,2021-04-12 10:30:00,0,569.349976,9.229832,8.237468,12.364982,12.159488,buy
4610,4610,2021-04-12 11:30:00,0,571.909973,8.322120,8.345780,11.392532,11.395158,buy
4611,4611,2021-04-12 12:30:00,0,585.020020,5.828835,6.247668,9.258257,9.483433,buy
4612,4612,2021-04-12 13:30:00,0,597.950012,4.296540,3.923398,7.547454,6.724636,buy


In [21]:
df.to_csv("test_response.csv")

In [None]:
a.tail()

In [None]:
nvda = download_stock_data('NVDA')
nvda.head()

In [None]:
nvda.head(60)

In [None]:
time_price = dict(zip(df_time.Time,df_time.price))

In [None]:
future_price = []
for i, row in nvda.iterrows():
    key = row['Time']+timedelta(hours=24)
    day = row['weekday']
    if key.weekday()>4:
        key += timedelta(hours=48)
    if key in time_price:
        future_price.append(time_price[key])
    else:
        future_price.append(None)

In [22]:
import pandas as pd
import numpy as np
import yfinance as yf
msft = yf.download('MSFT', start="2020-10-02", 
                   end="2021-04-13",interval='60m')
amd = yf.download('AMD', start="2020-10-02", 
                   end="2021-04-13",interval='60m')
ndaq = yf.download('NDAQ', start="2020-10-02", 
                   end="2021-04-13",interval='60m')
intel = yf.download('INTC', start="2020-10-02", 
                   end="2021-04-13",interval='60m')
qualcomm = yf.download('QCOM', start="2020-10-02", 
                   end="2021-04-13",interval='60m')
apple = yf.download('AAPL', start="2020-10-02", 
                   end="2021-04-13",interval='60m')

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


In [23]:
msft

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-10-02 09:30:00-04:00,208.000000,210.990005,207.990005,210.589996,210.589996,6766464
2020-10-02 10:30:00-04:00,210.570007,210.570007,208.350006,208.429993,208.429993,3446914
2020-10-02 11:30:00-04:00,208.419998,208.419998,205.539993,207.649994,207.649994,5986975
2020-10-02 12:30:00-04:00,207.649994,208.020004,206.470001,207.570007,207.570007,3219125
2020-10-02 13:30:00-04:00,207.559998,208.262695,206.589996,207.830002,207.830002,2662164
...,...,...,...,...,...,...
2021-04-12 11:30:00-04:00,256.600006,257.489990,256.220001,257.230011,257.230011,2333119
2021-04-12 12:30:00-04:00,257.220001,257.670013,257.040009,257.309998,257.309998,1823560
2021-04-12 13:30:00-04:00,257.299988,257.390015,256.279999,256.404999,256.404999,3198200
2021-04-12 14:30:00-04:00,256.380005,256.459991,255.550003,255.690002,255.690002,2303481


In [24]:
data.drop(['Datetime','High','Low','Adj Close', 'Volume'],axis=1,inplace=True)
cols = ['Time','Open','Close']
data=data[cols]

NameError: name 'data' is not defined

In [None]:
frame = [msft,amd.iloc[:,2],ndaq.iloc[:,2],intel.iloc[:,2],
         qualcomm.iloc[:,2],apple.iloc[:,2]]
stock_df = pd.concat(frame,axis=1)
stock_df.columns = ['Time','weekday','msft','amd','ndaq','intel',
                    'qualcomm','apple']

In [25]:
msft = download_stock_data('MSFT')
amd = download_stock_data('AMD')
ndaq = download_stock_data('NDAQ')
intel = download_stock_data('INTC')
qualcomm = download_stock_data('QCOM')
apple = download_stock_data('AAPL')

TypeError: download_stock_data() missing 2 required positional arguments: 'lookforward' and 'end_date'

In [None]:
download_stock_data('MSFT',0,'2021-04-02')

In [26]:
msft = download_stock_data('MSFT',0,'2021-04-13').iloc[:,[0,2]]
amd = download_stock_data('AMD',0,'2021-04-13').iloc[:,[0,2]]
ndaq = download_stock_data('NDAQ',0,'2021-04-13').iloc[:,[0,2]]
intel = download_stock_data('INTC',0,'2021-04-13').iloc[:,[0,2]]
qualcomm = download_stock_data('QCOM',0,'2021-04-13').iloc[:,[0,2]]
apple = download_stock_data('AAPL',0,'2021-04-13').iloc[:,[0,2]]

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


In [None]:
msft

In [27]:
frame = [msft.iloc[:,:2],amd.iloc[:,1],ndaq.iloc[:,1],intel.iloc[:,1],
         qualcomm.iloc[:,1],apple.iloc[:,1]]
stock_df = pd.concat(frame,axis=1)

In [28]:
stock_df.columns = ['Time','msft','amd','ndaq','intel',
                    'qualcomm','apple']

In [29]:
stock_df

Unnamed: 0,Time,msft,amd,ndaq,intel,qualcomm,apple
0,2020-10-02 09:30:00,208.000000,82.580002,122.660004,51.500000,116.779999,112.889999
1,2020-10-02 10:30:00,210.570007,84.500000,123.910004,51.880001,118.599998,115.188004
2,2020-10-02 11:30:00,208.419998,83.180000,124.419998,51.634998,117.800003,114.430000
3,2020-10-02 12:30:00,207.649994,82.739998,123.364998,51.610001,116.919998,114.199997
4,2020-10-02 13:30:00,207.559998,82.430000,123.959999,51.560001,116.550003,113.900002
...,...,...,...,...,...,...,...
4610,2021-04-12 11:30:00,256.600006,81.720001,156.535004,68.059998,138.500000,131.210007
4611,2021-04-12 12:30:00,257.220001,80.900002,156.759995,65.510002,138.389999,131.270096
4612,2021-04-12 13:30:00,257.299988,79.699997,156.149994,65.355499,138.190002,131.789993
4613,2021-04-12 14:30:00,256.380005,78.550102,156.110001,64.910004,136.820007,131.535004


In [30]:
stock_df.to_csv("stock_price_APR12.csv")

In [None]:
import pandas as pd
import numpy as np
import yfinance as yf

In [None]:
nvda = create_response("2020-10-02", "2021-03-19","NVDA")

In [None]:
import yfinance as yf
data = yf.download("MSFT", start="2020-10-02", end="2021-03-23",interval='60m')

In [None]:
data

In [None]:
data = data.reset_index()

In [None]:
data['Datetime'] = data['Datetime'].astype('str')

In [None]:
data.info()

In [None]:
import pandas as pd
time = []
for i, row in data.iterrows():
    time.append(row['Datetime'][:-6])

In [None]:
data['Time']=time

In [None]:
data.head(10)

In [None]:
data['Time'] = pd.to_datetime(data['Time'])

In [None]:
data.drop(['Datetime','High','Low','Adj Close', 'Volume'],axis=1,inplace=True)

In [None]:
cols = ['Time','Open','Close']
data=data[cols]

In [None]:
data.set_index('Time')

In [None]:
time_open = dict(zip(data.Time,data.Open))
time_close = dict(zip(data.Time,data.Close))

In [None]:
Time_complete = [data.Time[0]]
start = data.Time[0]
end = data.Time[812]
from datetime import datetime, timedelta
while start < end:
    ele = start + timedelta(hours=1)
    Time_complete.append(ele)
    start = ele

In [None]:
df_time = pd.DataFrame(Time_complete,columns=['Time'])

In [None]:
weekday_lst = []
for i,row in df_time.iterrows():
    cur = row['Time']
    weekday_lst.append(cur.weekday())

In [None]:
df_time['weekday'] = weekday_lst

In [None]:
df_time

In [None]:
price = []
from datetime import datetime, timedelta
for i,row in df_time.iterrows():
    cur = row['Time']
    day = row['weekday']
    if cur in time_open:
        price.append(time_open[cur])
    elif ("16:30:00" in str(cur) ) :
        cur -= timedelta(hours=1)
        if cur in time_close:
            price.append(time_close[cur])
        else:
            price.append(None)
    else:
        price.append(None)

In [None]:
df_time['price'] = price

In [None]:
df_time = df_time.fillna(method='ffill')

In [None]:
df_time.head(30)

In [None]:
time_price = dict(zip(df_time.Time,df_time.price))

In [None]:
future_price = []
for i, row in df_time.iterrows():
    key = row['Time']+timedelta(hours=24)
    day = row['weekday']
    if key.weekday()>4:
        key += timedelta(hours=48)
    if key in time_price:
        future_price.append(time_price[key])
    else:
        future_price.append(None)

In [None]:
df_time['future_price'] = future_price

In [None]:
df_time.loc[0,:]

In [None]:
df_time.loc[72,:]

In [None]:
df_time.info()

In [None]:
df_time.iloc[4038,:]

In [None]:
df_time = df_time.iloc[:4039,:]

In [None]:
df_time

In [None]:
df_time.info()

In [None]:
df_time['percent_change'] = ((df_time['future_price'] / df_time['price'])-1)*100

In [None]:
df_time

In [None]:
df_time.to_csv("./NVDA_price_1h.csv")

In [None]:
import pandas as pd
df = pd.read_csv('NVDA_price_1h',index_col=0)
df

In [None]:
df['Time'] = pd.to_datetime(df['Time'])

In [None]:
df.info()

In [None]:
df.head(15)

In [None]:
from datetime import datetime, timedelta, date
date(2020, 10, 2) in  df['Time']

In [None]:
df['Time'][0]

In [None]:
date(2020, 10, 2)

In [None]:
from datetime import datetime, timedelta
df['24_hour_later'] = df['Time'] + timedelta(hours=24)

In [None]:
df

In [None]:
time_price = dict(zip(df.Time,df.Open))

In [None]:
df.iloc[0,2]

In [None]:
df.iloc[0,2].weekday()

In [None]:
df.iloc[0,0]

In [None]:
df.iloc[0,0] in time_price[]

In [None]:
future_price = []
for i, row in df.iterrows():
    key = df.iloc[i,2]
    if key.weekday()>4:
        key += timedelta(hours=48)
    if key in time_price:
        future_price.append(time_price[key])
    else:
        future_price.append(None)

In [None]:
df.head()

In [None]:
df['future_price'] = future_price

In [None]:
df.head(30)

In [None]:
df.info()