# Stock Analysis using Google Search Analytics

Should I buy or sell a specific stock today based off of how popular it is on google searches?

Group 1 <br>
3 August 2021 <br>
BAIS 6040: Data Programming in Python <br>

## Introduction

## Import

In [1]:
import yfinance as yf
import pandas as pd
import numpy as np
import os
import math
import plotly.express as px
import random as rnd
from datetime import date
from datetime import timedelta
from pytrends.request import TrendReq
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, recall_score, precision_score, fbeta_score, classification_report
from sklearn.metrics import explained_variance_score, mean_absolute_error, r2_score, mean_squared_error
from sklearn.model_selection import train_test_split    # For generating test/train
from sklearn.linear_model import LinearRegression   # Logistic regression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import Ridge
from sklearn.linear_model import LogisticRegression
%matplotlib inline

## Global Variables and Initialization

In [2]:
dataDir = r"./Data Files/"  #Directory of all data
today = date.today()  # Todays date
rnd.seed(1024)

## Global Functions

In [3]:
# Function gets stock data and trend data if needed
def get_data(ticker):
    if os.path.exists(f"{dataDir}{ticker}_{today}_year.csv"):
        #Get stored data
        stored_data = pd.read_csv(f"{dataDir}{ticker}_{today}_year.csv")
        # Get rid of index name
        stored_data.set_index('Unnamed: 0', inplace=True)
        stored_data.index.name = None
        return stored_data
    else:
        #Get new data
        # Connect to Google API
        pytrends = TrendReq(hl='en-US', tz=360)
        # Set Keyword
        kw_list = [ticker]
        # Google API only shows last 90 days so need to itirate
        # Set start of interval
        date90front = date.today()
        # Initiate dataframe
        trend_data = pd.DataFrame()
        for x in range(4):
            # Set start end of interval
            date90back = date90front - timedelta(days=90)
            # Build Payload of 90 days
            pytrends.build_payload(kw_list,
                                   timeframe=f'{date90back} {date90front}',
                                   geo='')
            trend_90 = pytrends.interest_over_time()
            trend_data = pd.concat([trend_90, trend_data])
            date90front = date90back
        # Get Stock Data
        stock_data = yf.download(ticker,
                                 start=date.today() - timedelta(days=360),
                                 end=date.today(), interval="1d")
        # Combine Data
        new_data = stock_data.join(trend_data)
        # Create index and add date column
        new_data.reset_index(inplace=True)
        new_data.rename(columns= {"index":"Date"}, inplace = True)
        
        # Rename search interest
        new_data.rename(columns = {ticker: "Search Interest"},inplace = True)
        # Add difference
        new_data["Price Difference"] = new_data["Close"] - new_data["Open"]
        # Add truth value that determines if we want to buy or not that day
        new_data['Buy'] = np.where(new_data['Price Difference'] > 0, 1, 0)
        # Delete isPartial
        del new_data['isPartial']
        # Delete Date
        del new_data['Date']
        # Remove NaN
        new_data.dropna(inplace=True)  
        # Export to data folder
        new_data.to_csv(f"{dataDir}{ticker}_{today}_year.csv")
        return new_data

# Function prints metrics of regression model
def PrintMetricsRegression(test, predictions):
    print(f"Score: {explained_variance_score(test, predictions):.2f}")
    print(f"MAE: {mean_absolute_error(test, predictions):.2f}")
    print(f"RMSE: {math.sqrt(mean_squared_error(test, predictions)):.2f}")
    print(f"r2: {r2_score(test, predictions):.2f}")

# Function prints metrics of classification model
def PrintMetricsClassiffication(test, predictions):
    print("Confusion Matrix:")
    print(confusion_matrix(test, predictions))
    print("------------------")
    print(f"Accuracy: {accuracy_score(test, predictions):.2f}")
    print(f"Recall: {recall_score(test, predictions):.2f}")
    print(f"Prediction: {precision_score(test, predictions):.2f}")
    print(f"f-measure: {fbeta_score(test, predictions, beta=1):.2f}")
    print("------------------")
    print(classification_report(test, predictions))


# Functions gets random data for predictions
def prepareDataForPredictions(X_df):
    numElements = 3
    random_df = []
    for _ in range(numElements):
        dict = {}
        for column in X_df.columns:
            min = 0  # assume min = 0
            maxValue = round(max(X_df[column].values))
            dict[column] = rnd.randint(min, maxValue)
        random_df.append(dict)
    return random_df

# Create categorical dummies
def createCategoricalDummies(dataFrame, categoryList):
    return pd.get_dummies(dataFrame[categoryList], prefix_sep = "::", drop_first = True)

## Stock to Analyze

In [4]:
# Gets Data for last year
StockSearch_df = get_data("AMC")
StockSearch_df

[*********************100%***********************]  1 of 1 completed


Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,Search Interest,Price Difference,Buy
0,3.850000,4.250000,3.840000,4.150000,4.150000,6226600,39.0,0.300000,1
1,4.070000,4.200000,3.950000,4.160000,4.160000,5489400,41.0,0.090000,1
2,4.120000,4.340000,4.060000,4.120000,4.120000,3699100,38.0,0.000000,0
3,4.080000,4.150000,3.950000,4.040000,4.040000,2584900,33.0,-0.040000,0
4,4.050000,4.200000,3.860000,4.110000,4.110000,4047100,36.0,0.060000,1
...,...,...,...,...,...,...,...,...,...
244,32.200001,37.400002,32.139999,36.000000,36.000000,199584500,22.0,3.799999,1
245,37.830002,38.549999,34.299999,34.959999,34.959999,126277200,24.0,-2.870003,0
246,32.950001,35.340000,31.150000,34.619999,34.619999,112891300,19.0,1.669998,1
247,35.139999,44.389999,35.130001,43.090000,43.090000,167634800,21.0,7.950001,1


## Analytics Model 1 w/ plots

In [5]:
featureColumns=['Search Interest', 'Open']
targetColumn = 'Close'

X=StockSearch_df[featureColumns]
y=StockSearch_df[targetColumn]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)

# Create the regression

In [6]:
lr = LinearRegression()
lr

LinearRegression()

# Fit Linear Model

In [7]:
lr.fit(X_train, y_train)

LinearRegression()

# Our Confidence in our model is high

In [8]:
lr.score(X_train, y_train) 

0.9770443146493532

In [9]:
lr.score(X_test, y_test) 

0.985552486080631

lr.score(X_test, y_test) 

# Print the prediction believed accuracy using the model

In [10]:
predictions = lr.predict(X_test)
PrintMetricsRegression(y_test, predictions)

Score: 0.99
MAE: 1.04
RMSE: 2.15
r2: 0.99


# Create new samples, to test our model

In [15]:
StockSearchPreparedData_df = prepareDataForPredictions(StockSearch_df)

# Prepare the predictions for consumption

In [16]:
StockSearchPrepared_df = pd.DataFrame.from_dict(StockSearchPreparedData_df)
StockSearchPrepared_df

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,Search Interest,Price Difference,Buy
0,6,46,44,42,0,796266430,38,5,0
1,10,49,18,20,11,491786746,74,3,1
2,43,71,1,52,2,46064592,31,24,0


# Predict what the close price will be

In [17]:
predictions = lr.predict(StockSearchPrepared_df)
predictions

ValueError: matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 2 is different from 9)

# Make it pretty

In [None]:
amcPredictedPrice = amcPreparedData.copy()
amcPredictedPrice['Price Prediction'] = predictions
amcPredictedPrice

## Analytics Model 2 w/ Plots

## Conclusion

You shoud {buy/sell} with this {##%} confidence