In [2]:
# regression takes continuous data and fits a best-fit line to it
# linear regression -> fits a straight line, to be specific

# in this project we'll try predicting stock prices

# https://www.youtube.com/watch?v=JcI5Vnw0b2c&list=PLQVvvaa0QuDfKTOs3Keq_kaG2P55YRn5v&index=2
# https://www.youtube.com/watch?v=lN5jesocJjk&list=PLQVvvaa0QuDfKTOs3Keq_kaG2P55YRn5v&index=3

In [9]:
import math
import os

import pandas as pd
import quandl

# Download Google's daily price history from Quandl's WIKI dataset.
# The API key authenticates your Quandl account, so it is read from the
# QUANDL_API_KEY environment variable instead of being written into the
# notebook. If the variable is unset, no key is configured here and the
# quandl client falls back to its own defaults.
# NOTE: any key that has been committed to version control should be regenerated.
quandl_api_key = os.environ.get('QUANDL_API_KEY')
if quandl_api_key:
    quandl.ApiConfig.api_key = quandl_api_key

df = quandl.get('WIKI/GOOGL')

# Quick look at the first rows to see which columns are available.
print(df.head())


# the API key is unique to your Quandl account and acts as an authorization


# here each column is a feature, and in ML you can have all the features you want, but they need to be meaningful
# in this case, for pattern recognition, we don't need all the features
# from the output we can see that first we have open/high and so on, and then we have their adjusted values
# and we don't need both


# adjusted values are the values adjusted after a stock split
# for example, instead of having 10 shares of $1000 each, you split them into 20 shares of $500 each, and the lower price attracts buyers
# the values are then adjusted, and we are going to use these adjusted values

# and even these various adjusted values are quite closely related to each other

# therefore we don't need to use all of them
# in other algorithms and in deep learning you might also need to (later on) find the relationships between these features
# but in regression that is not done

# so basically, in regression, just use all the meaningful features and remove the others
# useless features cause trouble in this algorithm (especially in the simpler supervised-learning algorithms)

              Open    High     Low    Close      Volume  Ex-Dividend  \
Date                                                                   
2004-08-19  100.01  104.06   95.96  100.335  44659000.0          0.0   
2004-08-20  101.01  109.08  100.50  108.310  22834300.0          0.0   
2004-08-23  110.76  113.48  109.05  109.400  18256100.0          0.0   
2004-08-24  111.24  111.60  103.57  104.870  15247300.0          0.0   
2004-08-25  104.76  108.00  103.88  106.000   9188600.0          0.0   

            Split Ratio  Adj. Open  Adj. High   Adj. Low  Adj. Close  \
Date                                                                   
2004-08-19          1.0  50.159839  52.191109  48.128568   50.322842   
2004-08-20          1.0  50.661387  54.708881  50.405597   54.322689   
2004-08-23          1.0  55.551482  56.915693  54.693835   54.869377   
2004-08-24          1.0  55.792225  55.972783  51.945350   52.597363   
2004-08-25          1.0  52.542193  54.167209  52.100830   53.1

In [10]:
# Choose the columns we want to keep.
# .copy() gives us an independent frame, so the column assignments below write
# to our own data instead of to a slice of the downloaded frame (assigning to
# such a slice is what triggers pandas' SettingWithCopyWarning).
df = df[['Adj. Open', 'Adj. High', 'Adj. Low', 'Adj. Close', 'Adj. Volume']].copy()


# What the chosen features mean and how they relate to each other:
# - volume is the number of shares traded during the day
# - the high and low values tell us about the volatility of the day
# - the open is the price at which the market opened and the close is the price at which it closed
# - the relationship between open and close tells us how much the price went up or down
# Technically volume is also related to the volatility of the day, but we'll keep it a little simpler.


# These relationships are very valuable, but simple linear regression does not identify them on its own,
# so we define them explicitly and use them as our features rather than using all of the raw columns.

# Volatility of the day in percent: (high - low) / low * 100
df['HL_PCT'] = (df['Adj. High'] - df['Adj. Low']) / df['Adj. Low'] * 100.0

# Daily move in percent: (close - open) / open * 100
df['PCT_change'] = (df['Adj. Close'] - df['Adj. Open']) / df['Adj. Open'] * 100.0

# Keep only the columns we need from here on. Copy again so that later cells
# can add columns to df without writing to a slice.
df = df[['Adj. Close', 'HL_PCT', 'PCT_change', 'Adj. Volume']].copy()

print(df.head())

            Adj. Close    HL_PCT  PCT_change  Adj. Volume
Date                                                     
2004-08-19   50.322842  8.441017    0.324968   44659000.0
2004-08-20   54.322689  8.537313    7.227007   22834300.0
2004-08-23   54.869377  4.062357   -1.227880   18256100.0
2004-08-24   52.597363  7.753210   -5.726357   15247300.0
2004-08-25   53.164113  3.966115    1.183658    9188600.0


In [24]:
# Choosing the label (the y / output value).
# The same day's Adj. Close can't be our label: PCT_change is itself computed
# from it (and Adj. Close is one of the feature columns), so we would already
# have to know the close in order to build the features that predict it.
# So, for these features, the current day's close is not a label.

# In our case we will not use the whole close column as a feature;
# instead we'll take the last 10 values of close and use those as a feature (explained more later).

# Hence the label for this dataset will be the price in the future:
# still a close value, but the close of a future day rather than of this day,
# and so we need some more information to get a value predicted in the future.



forecast_col = 'Adj. Close'

# forecast_col is just a variable;
# later on we can point it at a different column to forecast something else.

# Now we take care of missing data, because in ML we can't work with empty fields inside the dataset.
# We either fill them with some specific value or get rid of the affected column,
# but in ML we don't like deleting data, so we fill.

MISSING_VALUE = -9999
df.fillna(MISSING_VALUE, inplace=True)

# The filled-in value will be treated as an outlier in the data.

In [25]:
# Now we decide how far ahead to forecast, which is what we generally do with a regression model.

# forecast_out is the number of rows (days) to look ahead: 10% of the length of
# the data frame, rounded up. The 0.1 fraction can be changed to suit our needs.
# int() guarantees an integer: math.ceil() returns a float on Python 2
# (on Python 3 it already returns an int, so the conversion is harmless there).
n_rows = len(df)
forecast_out = int(math.ceil(0.1 * n_rows))

# With this horizon you could just as well predict tomorrow's price or the next
# day's; the point is that each prediction uses the features from forecast_out
# days earlier to predict the price of a given day.

df['label'] = df[forecast_col].shift(-forecast_out)

# shift(-forecast_out) moves the column up by forecast_out rows, so the label
# of each row is the adjusted close forecast_out rows (days) into the future.
# Note that this is 10% of the data set, not 10 days.

In [26]:
print(df.head())


# As we can see, Adj. Close and label hold very different values, because the
# label is the close from forecast_out rows later.
# shift() left the label empty (NaN) in the rows that have no future price to
# point at; dropna() removes those rows. It does not remove any -9999 values
# inserted earlier by fillna().
# NOTE(review): this drop is permanent. The rows removed here are not restored
# when the label is recomputed with a shorter horizon below, and each
# recomputed label again ends in NaN rows that this cell does not drop.

df.dropna(inplace=True)
print(df.head())

# Let's change the percentage: look ahead 1% of the remaining rows, then 0.4%.
# The last pass wins, so forecast_out and df['label'] end up based on 0.004.
for horizon_fraction in (0.01, 0.004):
    forecast_out = int(math.ceil(horizon_fraction * len(df)))
    df['label'] = df[forecast_col].shift(-forecast_out)
    print(df.head())

            Adj. Close    HL_PCT  PCT_change  Adj. Volume       label
Date                                                                 
2004-08-19   50.322842  8.441017    0.324968   44659000.0  148.528493
2004-08-20   54.322689  8.537313    7.227007   22834300.0  152.972211
2004-08-23   54.869377  4.062357   -1.227880   18256100.0  152.109548
2004-08-24   52.597363  7.753210   -5.726357   15247300.0  154.827939
2004-08-25   53.164113  3.966115    1.183658    9188600.0  152.069424
            Adj. Close    HL_PCT  PCT_change  Adj. Volume       label
Date                                                                 
2004-08-19   50.322842  8.441017    0.324968   44659000.0  148.528493
2004-08-20   54.322689  8.537313    7.227007   22834300.0  152.972211
2004-08-23   54.869377  4.062357   -1.227880   18256100.0  152.109548
2004-08-24   52.597363  7.753210   -5.726357   15247300.0  154.827939
2004-08-25   53.164113  3.966115    1.183658    9188600.0  152.069424
            Adj. Clo

In [20]:
# SIGNIFICANCE OF INPLACE ARGUMENT




# When inplace=True is passed, the data is modified in place (the method returns None), so you'd use:

# df.an_operation(inplace=True)
# When inplace=False is passed (this is the default value, so passing it isn't necessary),
# the operation is performed and a modified copy of the object is returned, so you'd use:

# df = df.an_operation(inplace=False)
# So:

# if inplace == False:
#     Assign your result to a new variable
# else:
#     No need to assign



# SIGNIFICANCE OF DROPNA()

# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.dropna.html

In [34]:
# breaking down the meaning of forecast_out

# df['label'] = df[forecast_col].shift(-forecast_out)

# This is how I understood sentdex's logic: he is taking 0.01, i.e. 1%, of the number of rows in the DataFrame.
# Each row in the DataFrame represents one day in the life of the stock. So if the stock has been trading for 365 days,
# there will be 365 rows in the DataFrame. 1% of 365 is 3.65 days, which is then rounded up by the math.ceil function to 4 days.
# These 4 days become the forecast_out variable, which is the variable
# that is used to shift the Adj. Close price column in the DataFrame up by 4 rows.
# In other words, if you were standing at day 1 of the stock, when it was first traded,
# the prediction or the 'label' from his algorithm would tell you that 4 days later (on day 5)
# your stock will be valued at the close actually recorded in the data for that day.
# This isn't totally useful info by itself, since you can look at the Adj. Close column 4 rows further down to find the label shown on day 1.
# This is really all done to build a training set so that the machine can learn from the trend.

# https://www.youtube.com/redirect?q=http%3A%2F%2Fstackoverflow.com%2Fquestions%2F20095673%2Fpython-shift-column-in-pandas-dataframe-up-by-one&stzid=UghvYRxkLh65LngCoAEC.8FQvlEXFJPd8HUnT96JMeL&redir_token=nOmKLnn8BepGCy12-uOoTdXIPQ58MTU1NjI2MzU4OEAxNTU2MTc3MTg4&event=comments