# Time Series Machine Learning Part 1 Assignment

In [1]:
%reload_ext nb_black

<IPython.core.display.Javascript object>

In [27]:
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.tree import DecisionTreeRegressor

from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

<IPython.core.display.Javascript object>

In [40]:
data = pd.read_csv("../data/AMZN_data.csv")

<IPython.core.display.Javascript object>

In [41]:
data["date "] = pd.to_datetime(data["date"])
data = data[["date", "close"]]

<IPython.core.display.Javascript object>

In [42]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1259 entries, 0 to 1258
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   date    1259 non-null   object 
 1   close   1259 non-null   float64
dtypes: float64(1), object(1)
memory usage: 19.8+ KB


<IPython.core.display.Javascript object>

In [43]:
history = 5
future = 5

shifts = [x + future for x in list(range(1, history + 1))]

for shift in shifts:
    data["t-" + str(shift)] = data["close"].shift(shift)
data.dropna(inplace=True)

<IPython.core.display.Javascript object>

In [44]:
data.head()

Unnamed: 0,date,close,t-6,t-7,t-8,t-9,t-10
10,2013-02-25,259.87,269.24,269.47,258.7,257.21,261.95
11,2013-02-26,259.36,265.09,269.24,269.47,258.7,257.21
12,2013-02-27,263.25,269.75,265.09,269.24,269.47,258.7
13,2013-02-28,264.27,266.41,269.75,265.09,269.24,269.47
14,2013-03-01,265.74,265.94,266.41,269.75,265.09,269.24


<IPython.core.display.Javascript object>

In [45]:
X = data.drop(["date", "close"], axis=1)
y = data["close"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, shuffle=False
)

<IPython.core.display.Javascript object>

In [46]:
model = DecisionTreeRegressor()
model.fit(X_train, y_train)
predictions = model.predict(X_test)

<IPython.core.display.Javascript object>

In [47]:
print('R-squared: ', r2_score(y_test, predictions))
print('MAE: ', mean_absolute_error(y_test, predictions))
print('MSE: ', np.sqrt(mean_squared_error(y_test, predictions)))


R-squared:  -2.1159769897196683
MAE:  205.40820000000005
MSE:  248.9558752028158


<IPython.core.display.Javascript object>

In [48]:
def iscatter(df, x, y, color=None, size=None, title=''):
    fig = px.scatter(df, x=x, y=y, color=color, size=size, 
                     title=title, template='none')
    
    fig.update_traces(marker_line_color='black', 
                  marker_line_width=1)

    fig.show()


<IPython.core.display.Javascript object>

In [49]:
results = pd.DataFrame([list(y_test), predictions], index=["Observed", "Predicted"]).T
iscatter(results, "Observed", "Predicted")

<IPython.core.display.Javascript object>

### Import the Netflix stock price data set (NFLX_data.csv).

In [61]:
data = pd.read_csv("../data/NFLX_data.csv")

<IPython.core.display.Javascript object>

In [62]:
data["date"] = pd.to_datetime(data["date"])
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1259 entries, 0 to 1258
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   date    1259 non-null   datetime64[ns]
 1   open    1259 non-null   float64       
 2   high    1259 non-null   float64       
 3   low     1259 non-null   float64       
 4   close   1259 non-null   float64       
 5   volume  1259 non-null   int64         
 6   Name    1259 non-null   object        
dtypes: datetime64[ns](1), float64(4), int64(1), object(1)
memory usage: 69.0+ KB


<IPython.core.display.Javascript object>

In [63]:
data.head()

Unnamed: 0,date,open,high,low,close,volume,Name
0,2013-02-08,25.9635,26.28,25.7157,25.8528,25649820,NFLX
1,2013-02-11,25.5685,26.0071,24.9714,25.4128,29321782,NFLX
2,2013-02-12,25.8085,26.2228,25.1014,25.4214,34388802,NFLX
3,2013-02-13,25.8428,26.6285,25.6657,26.6098,40799094,NFLX
4,2013-02-14,26.7557,27.1214,26.3844,26.7714,31968685,NFLX


<IPython.core.display.Javascript object>

In [64]:
data = data[["date", "close"]]
data

Unnamed: 0,date,close
0,2013-02-08,25.8528
1,2013-02-11,25.4128
2,2013-02-12,25.4214
3,2013-02-13,26.6098
4,2013-02-14,26.7714
...,...,...
1254,2018-02-01,265.0700
1255,2018-02-02,267.4300
1256,2018-02-05,254.2600
1257,2018-02-06,265.7200


<IPython.core.display.Javascript object>

In [74]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1259 entries, 0 to 1258
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   date    1259 non-null   datetime64[ns]
 1   close   1259 non-null   float64       
dtypes: datetime64[ns](1), float64(1)
memory usage: 19.8 KB


<IPython.core.display.Javascript object>

In [81]:
history = 90
future = 30

past_shifts = [x for x in list(range(1, history + 1))]
future_shifts = [x for x in list(range(1, future))]

shifted = data.copy()
shifted.columns = ["date", "t+0"]

for shift in past_shifts:
    shifted["t-" + str(shift)] = shifted["t+0"].shift(shift)

for shift in future_shifts:
    shifted["t+" + str(shift)] = shifted["t+0"].shift(-shift)


shifted.dropna(inplace=True)
shifted.head()

Unnamed: 0,date,t+0,t-1,t-2,t-3,t-4,t-5,t-6,t-7,t-8,...,t+20,t+21,t+22,t+23,t+24,t+25,t+26,t+27,t+28,t+29
90,2013-06-19,33.1871,32.69,32.7471,30.57,30.77,29.6628,30.6371,31.5614,31.46,...,38.0585,37.7965,37.4228,35.7514,34.4714,35.2485,35.1871,34.9943,34.8228,34.9263
91,2013-06-20,31.9314,33.1871,32.69,32.7471,30.57,30.77,29.6628,30.6371,31.5614,...,37.7965,37.4228,35.7514,34.4714,35.2485,35.1871,34.9943,34.8228,34.9263,35.5885
92,2013-06-21,30.9857,31.9314,33.1871,32.69,32.7471,30.57,30.77,29.6628,30.6371,...,37.4228,35.7514,34.4714,35.2485,35.1871,34.9943,34.8228,34.9263,35.5885,35.1685
93,2013-06-24,30.8,30.9857,31.9314,33.1871,32.69,32.7471,30.57,30.77,29.6628,...,35.7514,34.4714,35.2485,35.1871,34.9943,34.8228,34.9263,35.5885,35.1685,36.2628
94,2013-06-25,30.4143,30.8,30.9857,31.9314,33.1871,32.69,32.7471,30.57,30.77,...,34.4714,35.2485,35.1871,34.9943,34.8228,34.9263,35.5885,35.1685,36.2628,36.5571


<IPython.core.display.Javascript object>

In [83]:
X = shifted[["t-" + str(shift) for shift in past_shifts]]
future_shifts = [0] + future_shifts
# y = data["close"]

for future_shift in future_shifts:
    y = shifted["t+" + str(future_shift)]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, shuffle=False
    )

    model = AdaBoostRegressor()
    model.fit(X_train, y_train)

    predictions = model.predict(X_test)
    print(
        "R_squared for ", "t+" + str(future_shift) + ":", r2_score(y_test, predictions)
    )

R_squared for  t+0: -2.1719781713827255
R_squared for  t+0: -2.2364861797911844
R_squared for  t+1: -2.2993163423680563
R_squared for  t+2: -3.0367462258155555
R_squared for  t+3: -3.133753566540893
R_squared for  t+4: -3.543631160549186
R_squared for  t+5: -2.807179657718316
R_squared for  t+6: -2.40974438691992
R_squared for  t+7: -3.6428979627271643
R_squared for  t+8: -2.4651862079557523
R_squared for  t+9: -3.4684646499603184
R_squared for  t+10: -2.4997685328087624
R_squared for  t+11: -2.6063781452071786
R_squared for  t+12: -2.9172147816040574
R_squared for  t+13: -3.062927257494292
R_squared for  t+14: -2.9046499331697877
R_squared for  t+15: -2.709925104796785
R_squared for  t+16: -2.852796943132433
R_squared for  t+17: -2.937087333245674
R_squared for  t+18: -2.728712561230609
R_squared for  t+19: -2.354087921112848
R_squared for  t+20: -2.518973499715781
R_squared for  t+21: -2.4829139629148025
R_squared for  t+22: -2.617697459764508
R_squared for  t+23: -2.34921676019335
R

<IPython.core.display.Javascript object>

### Transform the data by shifting the series and creating features that will allow us to forecast the price 30 days into the future from 90 days of daily history.

In [84]:
history = 90
future = 30

shifst = [x + future for x in list(range(1, history + 1))]

for shift in shifts:
    data["t-" + str(shift)] = data["close"].shift(shift)
data.dropna(inplace=True)
data

Unnamed: 0,date,close,t-31,t-32,t-33,t-34,t-35,t-36,t-37,t-38,...,t-111,t-112,t-113,t-114,t-115,t-116,t-117,t-118,t-119,t-120
120,2013-08-01,35.5885,32.6900,32.7471,30.5700,30.7700,29.6628,30.6371,31.5614,31.4600,...,25.6943,26.7357,26.7314,28.0643,27.0731,26.7714,26.6098,25.4214,25.4128,25.8528
121,2013-08-02,35.1685,33.1871,32.6900,32.7471,30.5700,30.7700,29.6628,30.6371,31.5614,...,25.6171,25.6943,26.7357,26.7314,28.0643,27.0731,26.7714,26.6098,25.4214,25.4128
122,2013-08-05,36.2628,31.9314,33.1871,32.6900,32.7471,30.5700,30.7700,29.6628,30.6371,...,26.2985,25.6171,25.6943,26.7357,26.7314,28.0643,27.0731,26.7714,26.6098,25.4214
123,2013-08-06,36.5571,30.9857,31.9314,33.1871,32.6900,32.7471,30.5700,30.7700,29.6628,...,26.3314,26.2985,25.6171,25.6943,26.7357,26.7314,28.0643,27.0731,26.7714,26.6098
124,2013-08-07,35.6014,30.8000,30.9857,31.9314,33.1871,32.6900,32.7471,30.5700,30.7700,...,26.8685,26.3314,26.2985,25.6171,25.6943,26.7357,26.7314,28.0643,27.0731,26.7714
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1254,2018-02-01,265.0700,190.1200,189.5600,187.8600,185.7300,186.2200,188.5400,185.2000,185.3000,...,169.0600,169.3400,166.7600,166.5400,166.0900,169.9800,168.5000,171.0000,171.4000,169.1400
1255,2018-02-02,267.4300,190.4200,190.1200,189.5600,187.8600,185.7300,186.2200,188.5400,185.2000,...,168.1300,169.0600,169.3400,166.7600,166.5400,166.0900,169.9800,168.5000,171.0000,171.4000
1256,2018-02-05,254.2600,187.0200,190.4200,190.1200,189.5600,187.8600,185.7300,186.2200,188.5400,...,165.9500,168.1300,169.0600,169.3400,166.7600,166.5400,166.0900,169.9800,168.5000,171.0000
1257,2018-02-06,265.7200,188.8200,187.0200,190.4200,190.1200,189.5600,187.8600,185.7300,186.2200,...,167.1200,165.9500,168.1300,169.0600,169.3400,166.7600,166.5400,166.0900,169.9800,168.5000


<IPython.core.display.Javascript object>

### Split the data into a training set and a testing set. Make the test set size 20%.

In [85]:
X = data.drop(["date", "close"], axis=1)
y = data["close"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

<IPython.core.display.Javascript object>

### Instantiate an AdaBoost model and fit it to the training set.

In [86]:
model = AdaBoostRegressor()
model.fit(X_train, y_train)

AdaBoostRegressor(base_estimator=None, learning_rate=1.0, loss='linear',
                  n_estimators=50, random_state=None)

<IPython.core.display.Javascript object>

### Generate predictions for the test set.

In [87]:
predictions = model.predict(X_test)


<IPython.core.display.Javascript object>

### Evaluate the results using R-Squared, Mean Absolute Error, and Root Mean Squared Error metrics.

In [None]:
print('R-squared:' + r2_score(y_test, predictions))
print('MAE:')

### Visually examine the results by creating a scatter plot where the x axis represents the observed results and the y axis represents the predictions.

In [60]:
results = pd.DataFrame([list(y_test), predictions], index=["Observed", "Predicted"]).T

iscatter(results, "Observed", "Predicted")

<IPython.core.display.Javascript object>