In [None]:
#In this Project we attempt to perform two machine Learning tasks on this netflix stock data,
#First we will attempt to run a classification task to segment the data on buy/sell basis
#next we will attempt to a regression task to predict the Closing price of the Netflix stock data.

In [21]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error,mean_absolute_error
%matplotlib inline

In [2]:
#We get our Netflix Stock data
netflix =  pd.read_csv('C:/Users/user pc/Downloads/Netflix Stock Pred/D1/NFLX.US_D1.csv',index_col='datetime')
netflix.head()

Unnamed: 0_level_0,open,high,low,close,volume
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2002-05-23,17.21,17.4,16.04,16.8,6518800
2002-05-24,17.0,17.15,16.76,16.95,755900
2002-05-28,16.99,17.25,16.2,16.2,459900
2002-05-29,16.3,16.3,15.3,15.45,471700
2002-05-30,15.51,15.51,15.0,15.0,712800


In [3]:
netflix.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5508 entries, 2002-05-23 to 2024-02-07
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   open    5508 non-null   float64
 1   high    5508 non-null   float64
 2   low     5508 non-null   float64
 3   close   5508 non-null   float64
 4   volume  5508 non-null   int64  
dtypes: float64(4), int64(1)
memory usage: 258.2+ KB


In [6]:
#Here we create new features to our data
#it is the difference between the Opening stock price and the closing price
#We also have the difference between the high value and the low value
#these features will help in our classification, noting if a stock is good enough to buy or to sell

netflix['open - close'] = netflix['open'] - netflix['close']
netflix['high - low'] = netflix['high'] - netflix['low']
netflix.head()

Unnamed: 0_level_0,open,high,low,close,volume,open - close,high - low
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2002-05-23,17.21,17.4,16.04,16.8,6518800,0.41,1.36
2002-05-24,17.0,17.15,16.76,16.95,755900,0.05,0.39
2002-05-28,16.99,17.25,16.2,16.2,459900,0.79,1.05
2002-05-29,16.3,16.3,15.3,15.45,471700,0.85,1.0
2002-05-30,15.51,15.51,15.0,15.0,712800,0.51,0.51


In [58]:
#We use these newly genrated features as our input for our classification task
X = netflix[['open - close','high - low']]
X.head()

Unnamed: 0_level_0,open - close,high - low
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1
2002-05-23,0.41,1.36
2002-05-24,0.05,0.39
2002-05-28,0.79,1.05
2002-05-29,0.85,1.0
2002-05-30,0.51,0.51


In [10]:
#here we create a target variable Y for our classification task
#Y signifies +1 to buy a stock and -1 to sell the stock
#it calculate if the value(price) of the next day is greater than that of the current day it denotes it as +1
#meaning the customer should probably buy the stock
#else -1 meaning he/she should probably sell the stock

y = np.where(netflix['close'].shift(-1) > netflix['close'],1,-1)
y

array([ 1, -1, -1, ..., -1,  1, -1])

In [12]:
#Now we split our data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

In [27]:
#Our Classifier (KNN)
#We use grid search to search for the optimal K value

para = {'n_neighbors': [2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20]}
knn = KNeighborsClassifier()
model = GridSearchCV(knn,para, cv=5)

model.fit(X_train,y_train)

In [28]:
#our Classifier prediction and accuracy score

#train accuracy
accuracy_train = accuracy_score(y_train, model.predict(X_train))

#test accuracy
accuracy_test = accuracy_score(y_test, model.predict(X_test))

print(f'train accuracy: {accuracy_train * 100:.2f}%')
print(f'test accuracy: {accuracy_test * 100:.2f}%')

train accuracy: 75.18%
test accuracy: 49.85%


In [29]:
#our model's prediction
prediction = model.predict(X_test)

#a dataframe to visualize our predicted values and our actual values
netflix_data = pd.DataFrame({'Actual Values': y_test, 'Predicted Values': prediction})
netflix_data.head(10)

#Note where the predicted value says we should either sell(-1) or buy(+1)
#the actual values based on the data determines what we should actually do

Unnamed: 0,Actual Values,Predicted Values
0,-1,-1
1,1,-1
2,1,1
3,-1,-1
4,-1,-1
5,-1,-1
6,1,-1
7,1,1
8,1,-1
9,1,-1


In [32]:
#example data
#10 random samples from the data
netflix_data.sample(10)
#note the models 50% accuracy

Unnamed: 0,Actual Values,Predicted Values
1307,-1,-1
1203,-1,-1
1231,-1,-1
76,1,-1
406,-1,-1
97,-1,-1
349,1,-1
374,1,-1
359,-1,1
1314,1,-1


In [33]:
#REGRESSION TASK

In [46]:
netflix.head()

Unnamed: 0_level_0,open,high,low,close,volume,open - close,high - low
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2002-05-23,17.21,17.4,16.04,16.8,6518800,0.41,1.36
2002-05-24,17.0,17.15,16.76,16.95,755900,0.05,0.39
2002-05-28,16.99,17.25,16.2,16.2,459900,0.79,1.05
2002-05-29,16.3,16.3,15.3,15.45,471700,0.85,1.0
2002-05-30,15.51,15.51,15.0,15.0,712800,0.51,0.51


In [59]:
X 

Unnamed: 0_level_0,open - close,high - low
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1
2002-05-23,0.41,1.36
2002-05-24,0.05,0.39
2002-05-28,0.79,1.05
2002-05-29,0.85,1.00
2002-05-30,0.51,0.51
...,...,...
2024-02-02,2.90,5.43
2024-02-03,-0.05,0.18
2024-02-05,1.16,18.55
2024-02-06,6.03,11.97


In [36]:
#For our regression task our target value becomes the price of the stock
y = netflix['close']
y

datetime
2002-05-23     16.80
2002-05-24     16.95
2002-05-28     16.20
2002-05-29     15.45
2002-05-30     15.00
               ...  
2024-02-02    564.56
2024-02-03    564.64
2024-02-05    561.78
2024-02-06    555.87
2024-02-07    559.09
Name: close, Length: 5508, dtype: float64

In [60]:
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X, y, test_size=0.3, random_state=101)

In [61]:
#scaling our data
scaler = StandardScaler()
X_train_reg = scaler.fit_transform(X_train_reg)
X_test_reg = scaler.transform(X_test_reg)

In [62]:
#our Regression Model
Reg_model = LinearRegression()

In [63]:
Reg_model.fit(X_train_reg,y_train_reg)

In [64]:
#regression model prediction
Reg_predictions = Reg_model.predict(X_test_reg)
Reg_predictions

array([ 85.48347692, 317.5039222 , 300.04690324, ..., 106.34563542,
       308.15765265, 305.53104192])

In [65]:
#Model Metrics
print(mean_squared_error(y_test_reg,Reg_predictions))

13664.60792233509


In [66]:
#RMSE
rms =np.sqrt(np.mean(np.power((np.array(y_test)-np.array(Reg_predictions)),2)))
rms

223.22075127753251

In [67]:
netflix_reg = pd.DataFrame({'Actual Close price': y_test_reg, 'Predicted Close Price': Reg_predictions})
netflix_reg.head(10)

Unnamed: 0_level_0,Actual Close price,Predicted Close Price
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1
2005-07-01,16.54,85.483477
2018-01-25,269.69,317.503922
2022-10-07,224.75,300.046903
2013-11-29,365.0,170.007496
2017-06-09,158.05,281.811685
2021-11-04,668.4,424.730232
2011-03-31,237.4,166.085431
2009-10-28,53.54,119.763517
2003-08-25,28.0,104.296595
2010-07-29,98.02,182.445994


In [68]:
netflix_reg.sample(10)

Unnamed: 0_level_0,Actual Close price,Predicted Close Price
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1
2012-04-30,80.17,132.074979
2009-09-25,46.3,96.563949
2010-09-29,170.63,295.681126
2011-11-04,90.02,172.183173
2012-05-22,67.69,166.825931
2011-02-10,222.93,170.326465
2012-12-07,85.96,158.056294
2021-11-23,654.0,450.249522
2005-10-14,28.65,105.25883
2023-03-08,311.82,198.729301
