# Predicting Intel Stock Price using Linear Regression

In [1]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn import model_selection
from sklearn import metrics

import matplotlib.pyplot as plt
%matplotlib inline

Get the latest Intel Stock Price from https://finance.yahoo.com/quote/intc/history/

In [2]:
# Load the historical Intel (INTC) prices exported from Yahoo Finance.
df = pd.read_csv("data/INTC.csv")
# The saved CSV carries a header-less index column that pandas names
# "Unnamed: 0". Drop it (when present) so the row counter is not later
# plotted as a price or fed to the model as a feature.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")

In [None]:
# Display the first 5 rows of your dataframe

In [3]:
# Preview the first five rows to check the columns were parsed as expected.
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2018-09-10,46.560001,46.759998,46.23,46.299999,45.137581,21522400
1,2018-09-11,46.0,46.029999,44.889999,44.93,43.801979,32455300
2,2018-09-12,44.560001,44.98,44.060001,44.93,43.801979,29778700
3,2018-09-13,45.68,45.849998,45.110001,45.57,44.425911,18657100
4,2018-09-14,45.779999,45.970001,45.349998,45.540001,44.396664,22998700


Show some of the information about your data, e.g. how many rows, what data types

In [None]:
# Summary of the frame: row count, column names, non-null counts and dtypes.
df.info()

Plotting the graph of Stock Prices against Date 

In [None]:
# Overlay every price column on a single chart, with Date along the x-axis.
price_columns = ["Close", "Open", "High", "Low", "Adj Close"]
df.plot(x="Date", y=price_columns)

The various stock prices tend to follow the same trend, so we will pick only one of them to predict: "Close", the closing price of the stock on a particular day.

The stock prices and the Volume data have very different scales. To visualize them on the same graph, we will use matplotlib directly and plot Volume against its own y-axis.

In [None]:
# Use the Date column as the x-axis: pull it out of the DataFrame as a NumPy array.
# The two prints show the container type before and after the conversion.
print(type(df))
x=df["Date"].values
print(type(x))

In [None]:
# Check how many dates were extracted (uncomment the next line to dump the values).
# print(x)
print(len(x))

In [None]:
# First y-axis: every column that remains once Date (the x-axis) and Volume
# (a very different scale) have been removed, as a 2-D NumPy array.
# The mid-cell `df2.head()` produced no output, so it is omitted here.
df2 = df.drop(["Date", "Volume"], axis=1)
y1 = df2.to_numpy()
# print(y1)
print(len(y1))

In [None]:
# Second y-axis: the traded Volume as a NumPy array; print its length as a size check.
y2=df["Volume"].values
# print(y2)
print(len(y2))

In [None]:
fig, ax1 = plt.subplots(figsize=(12, 6))
# x holds date strings; parse them so matplotlib draws a real time axis
# instead of one categorical tick label per row.
dates = pd.to_datetime(x)
price_lines = ax1.plot(dates, y1)
ax1.set_title('Stock features against Date', fontsize=20)
ax1.set_xlabel('Date')
ax1.set_ylabel('Stock prices')
# y1 was built from df2, so its columns name the plotted lines in order.
ax1.legend(price_lines, df2.columns, loc='upper left')
# Volume is orders of magnitude larger than the prices, so it gets its own y-axis.
ax2 = ax1.twinx()
ax2.set_ylabel('Volume')
ax2.plot(dates, y2, 'c', label='Volume')
ax2.legend(loc='upper right');


When creating the Linear Regression Model, "Date" is a sensitive factor.
Date is currently stored as a string, so we first convert it to a proper datetime type.

In [None]:
# Parse the ISO-formatted (YYYY-MM-DD) date strings into pandas Timestamps.
df["Date"] = pd.to_datetime(df["Date"], format="%Y-%m-%d")

In [None]:
# Confirm that the Date column now has a datetime dtype.
df.info()

Scikit-learn will not accept a string or Timestamp as input data, so we need to convert "Date" into simpler numeric features that scikit-learn can accept.

In [None]:
# Break each Timestamp into plain integer components that scikit-learn can use.
newdate = df["Date"]

# ISO week number. `Series.dt.week` / `Series.dt.weekofyear` were deprecated in
# pandas 1.1 and removed in 2.0; `dt.isocalendar().week` is the replacement.
# It returns a nullable UInt32 column, so cast back to a plain int64.
iso_week = newdate.dt.isocalendar().week.astype("int64")

df4=pd.DataFrame({"year": newdate.dt.year,
              "month": newdate.dt.month,
              "day": newdate.dt.day,
              "hour": newdate.dt.hour,
              "dayofyear": newdate.dt.dayofyear,
              "week": iso_week,
              "weekofyear": iso_week,
              "dayofweek": newdate.dt.dayofweek,
              "weekday": newdate.dt.weekday,
              "quarter": newdate.dt.quarter,
             })


In [None]:
# Put the numeric date parts in front of the original columns (minus the raw Date).
df3 = df.drop("Date", axis=1)
df5 = pd.concat([df4, df3], axis="columns")
df5.head()

Our data is now ready for model training.

In [None]:
# List the columns and dtypes of the combined feature table.
df5.info()

In [None]:
# Time-series data is split chronologically rather than shuffled:
# the first 200 rows are for training, everything after them for testing.
train, test = df5.iloc[:200], df5.iloc[200:]

In [None]:
# Check the size and dtypes of the training slice.
train.info()

In [None]:
# Check the size and dtypes of the test slice.
test.info()

In [None]:
# Target is the closing price; every other column is used as a feature.
# Note that the features therefore include the same day's Open/High/Low/Adj Close.
y_train, y_test = train["Close"], test["Close"]
X_train = train.drop(columns="Close")
X_test = test.drop(columns="Close")

In [None]:
# Fit an ordinary least-squares linear regression on the training rows.
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X_train,y_train)

In [None]:
# One fitted coefficient per feature column; the length shows how many
# features the model was trained on.
print(model.coef_)
print(len(model.coef_))

In [None]:
# Predict the closing price for every row of the held-out test set.
predictions=model.predict(X_test)

In [None]:
# Error metrics on the test set; RMSE is derived from the MSE computed once.
mae = metrics.mean_absolute_error(y_test, predictions)
mse = metrics.mean_squared_error(y_test, predictions)
print('MAE:', mae)
print('MSE:', mse)
print('RMSE:', np.sqrt(mse))

In [None]:
# Re-create the chronological split (first 200 rows / the rest) from df5.
train, test = df5.iloc[:200], df5.iloc[200:]

In [None]:
# Attach the predictions as a new column. `assign` returns a new frame, so the
# cell can be re-run safely (`insert` raises ValueError once the column exists),
# needs no hard-coded column position, and does not write into a slice of df5.
test = test.assign(Predictions=predictions)

In [None]:
# Compare actual and predicted closing prices for the last five test rows.
test[["Close","Predictions"]].tail()

In [None]:
fig, ax = plt.subplots(figsize=(14, 6))
ax.set_title("Stock Closing Price against Date", fontsize=20)
ax.set_xlabel("Date")
ax.set_ylabel("Stock Closing Price")
# Plot against the actual dates: the x-axis is labelled "Date", but the series
# alone would be drawn against the row index. df shares its index with
# train/test, so the matching dates are looked up by index.
ax.plot(df.loc[train.index, "Date"], train["Close"], label="Train: Close")
ax.plot(df.loc[test.index, "Date"], test["Close"], label="Test: Close")
ax.plot(df.loc[test.index, "Date"], test["Predictions"], label="Test: Predictions")
ax.legend();


In [None]:
# R^2 (coefficient of determination) of the model on the held-out test rows.
model.score(X_test, y_test)

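It seems like the model is doing a very good job of predicting the stock price. It almost seems too good to be true, and it is: the features include the same day's Open, High, Low and Adj Close, which already almost determine the Close.
Let us try to build another model with the same dataset, but this time, let us rely only on the date as the feature.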

In [None]:
# Re-check the available columns before selecting the date-only features.
df5.info()

In [None]:
# Fresh chronological split from df5: first 200 rows to train on, the rest to test on.
train, test = df5.iloc[:200], df5.iloc[200:]

In [None]:
# Keep only the date-derived columns as features. Besides the price/volume
# columns, also drop "Unnamed: 0" (the row counter left over from the CSV),
# which would otherwise stay in as a non-date feature.
# errors="ignore" keeps the cell working when that column is absent.
non_date_columns = ["Close", "Open", "High", "Low", "Adj Close", "Volume", "Unnamed: 0"]
X_train = train.drop(columns=non_date_columns, errors="ignore")
y_train = train["Close"]
X_test = test.drop(columns=non_date_columns, errors="ignore")
y_test = test["Close"]

In [None]:
# Fit a fresh linear regression, this time on the date-only features.
# This rebinds `model`, replacing the earlier fitted model.
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X_train,y_train)

In [None]:
# One fitted coefficient per remaining feature column, plus how many there are.
print(model.coef_)
print(len(model.coef_))

In [None]:
# Predict on the test rows and report the error metrics (RMSE derived from MSE).
predictions = model.predict(X_test)
mae = metrics.mean_absolute_error(y_test, predictions)
mse = metrics.mean_squared_error(y_test, predictions)
print('MAE:', mae)
print('MSE:', mse)
print('RMSE:', np.sqrt(mse))

In [None]:
# Attach the predictions as a new column. `assign` returns a new frame, so the
# cell can be re-run safely (`insert` raises ValueError once the column exists),
# needs no hard-coded column position, and does not write into a slice of df5.
test = test.assign(Predictions=predictions)
# Compare actual and predicted closing prices for the last five test rows.
test[["Close","Predictions"]].tail()

In [None]:
fig, ax = plt.subplots(figsize=(14, 6))
ax.set_title("Stock Closing Price against Date", fontsize=20)
ax.set_xlabel("Date")
ax.set_ylabel("Stock Closing Price")
# Plot against the actual dates: the x-axis is labelled "Date", but the series
# alone would be drawn against the row index. df shares its index with
# train/test, so the matching dates are looked up by index.
ax.plot(df.loc[train.index, "Date"], train["Close"], label="Train: Close")
ax.plot(df.loc[test.index, "Date"], test["Close"], label="Test: Close")
ax.plot(df.loc[test.index, "Date"], test["Predictions"], label="Test: Predictions")
ax.legend();

In [None]:
# R^2 (coefficient of determination) of the date-only model on the test rows.
model.score(X_test, y_test)

As you can see, the prediction is very bad when you predict the stock price solely based on the date.