In [151]:
from pytrends.request import TrendReq
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
pytrends = TrendReq(hl='en-US', tz=360, timeout=(10,25))

In [32]:
def getdata(term):
	kw_list = [term]
	pytrends.build_payload(kw_list, cat=0, timeframe='today 5-y', geo='', gprop='')
	df = pytrends.interest_over_time()
	return df

In [4]:
def plotdata(term):
    df = getdata(term)
    df.plot()

In [5]:
stocks = pd.read_csv("~/desktop/daily_stocks.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [6]:
newstocks = stocks[stocks['date']>20150000] # Restrict to data after 2015

In [112]:
newstocks

Unnamed: 0,PERMNO,date,TICKER,BIDLO,ASKHI,PRC,BID,ASK,OPENPRC,RETX
2014,10026,20150102,JJSF,106.81000,109.44600,107.69000,107.45000,107.69000,109.18000,-0.009929
2015,10026,20150105,JJSF,106.76000,108.43000,107.24000,107.25000,107.39000,107.41000,-0.004179
2016,10026,20150106,JJSF,106.18500,108.69000,107.27000,107.25000,107.55000,107.84000,0.000280
2017,10026,20150107,JJSF,106.49500,108.77000,108.73000,108.62000,108.75000,107.70000,0.013611
2018,10026,20150108,JJSF,109.11000,111.28000,110.32000,110.31000,110.71000,109.48000,0.014623
...,...,...,...,...,...,...,...,...,...,...
10608559,93436,20181224,TSLA,295.19501,314.50000,295.39001,295.45999,295.67001,313.50000,-0.076242
10608560,93436,20181226,TSLA,294.09000,326.97000,326.09000,325.89999,326.07999,300.00000,0.103930
10608561,93436,20181227,TSLA,301.50000,322.17169,316.13000,315.92001,316.13000,319.84000,-0.030544
10608562,93436,20181228,TSLA,318.41000,336.23999,333.87000,333.87000,334.00000,323.10001,0.056116


In [8]:
def find_date_string(row):
    ds = str(row['date']-1)
    dates = ds[:4] + "-" + ds[4:6] + "-" + ds[6:]
    return dates

In [98]:
def getdf(searches, ticker):
    df = newstocks[newstocks["TICKER"]==ticker]
    df['date_string'] = df.apply(lambda row: find_date_string(row), axis=1)
    trends_data = getdata(searches[0])
    for i in range(1, len(searches)):
        s = searches[i]
        newtrends = getdata(s)
        trends_data = trends_data.join(newtrends, rsuffix='_'+s)
    print(trends_data)
    s = set(trends_data.index.values)
    s = set([str(x)[:10] for x in s])
    df = df[df['date_string'].isin(s)]
    #df.plot(y='BID')
    for s in searches:
        df[s] = df.apply(lambda row: trends_data.loc[row['date_string']][s], axis=1)
        #df.plot(y=s)
    #print(df)
    return df

In [320]:
from sklearn.linear_model import *
from sklearn.model_selection import train_test_split
from sklearn.svm import *
from sklearn.ensemble import *
from sklearn.neural_network import *
from sklearn import preprocessing

In [423]:
def regressions(ticker, pct):
    features = ["buy " + ticker, "sell " + ticker, ticker]
    data = getdf(features, ticker)
    data["diff"] = data.apply(lambda row: row["buy " + ticker] - row["sell " + ticker], axis = 1)
    returns = data["BID"].pct_change()
    returns = returns.fillna(0)
    #data["returns"] = data.apply(lambda row: row["BID"] - row.shift(1)["BID"], axis = 1)
    #print(data)
    #features.append("diff")
    #features.append("returns")
    X = pd.DataFrame(data, columns=features)
    X = preprocessing.scale(X)
    if pct:
        y = returns
    else:
        y = data["BID"]
    y = preprocessing.scale(y)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    
    #Linear regression
    lm = LinearRegression()
    m1 = lm.fit(X_train, y_train)
    
    #Ridge Regression, auto alpha
    lr = RidgeCV()
    m2 = lr.fit(X_train, y_train)
    
    #SVR 
    svr = SVR(gamma='scale', C=5.0, epsilon=0.1)
    m3 = svr.fit(X_train, y_train)
    
    #Adaboost regression
    adb = AdaBoostRegressor()
    m4 = adb.fit(X_train, y_train)
    
    #Neural Net
    mlp = MLPRegressor(max_iter=2000)
    m5 = mlp.fit(X_train, y_train)
    
    #Random forest
    rfr = RandomForestRegressor()
    m6 = rfr.fit(X_train, y_train)
    
    return lm.score(X_test, y_test), lr.score(X_test, y_test), svr.score(X_test, y_test), adb.score(X_test, y_test), mlp.score(X_test, y_test), rfr.score(X_test, y_test)




In [424]:
tickers = ["FB", "F", "GOOGL", "AMZN", "MSFT", "IBM", "BAC", "TSLA", "AAPL", "NFLX"]

print("(Linear regression, SVR, Adaboost, Neural Net, Random Forest regressor)")

for t in tickers:
    print(t)
    print(regressions(t, 0))

(Linear regression, SVR, Adaboost, Neural Net, Random Forest regressor)
FB
(0.6909857015308118, 0.6920123640966658, 0.8476721537596932, 0.8868828899966915, 0.8566630143008593, 0.8656667352127067)
F
(0.6684676458568077, 0.6694089645242661, 0.5846055288790759, 0.7211449737151722, 0.7011149218767747, 0.7019703413317468)
GOOGL
(0.8677583262166799, 0.8663677583016376, 0.8282741217427195, 0.8817916640865435, 0.877987073061088, 0.8764825757574333)
AMZN
(0.6408647787060604, 0.6398102664774701, 0.7694556792775773, 0.7966026828113594, 0.7405805538814377, 0.7968848010236347)
MSFT
(0.49318947968092053, 0.49298451285758593, 0.4765916158971264, 0.5672555042491894, 0.5307615236834544, 0.607195823136932)
IBM
(-0.01411832549087455, -0.0152453412762652, 0.0715470151275246, -0.27798757143503394, -0.34929886216465955, -0.4898427037371802)
BAC
(-0.01746487325435453, -0.01309600466135974, 0.16716697475153308, 0.34106127250981244, 0.1628636752828475, 0.3643806019008966)
TSLA
(0.1166257933773872, 0.1009315145

In [315]:
import warnings
warnings.filterwarnings('ignore')

In [425]:
def classify(ticker):
    features = ["buy " + ticker, "sell " + ticker, ticker]
    data = getdf(features, ticker)
    data["diff"] = data.apply(lambda row: row["buy " + ticker] - row["sell " + ticker], axis = 1)
    returns = data["BID"].pct_change()
    returns = returns.fillna(0)
    #data["returns"] = data.apply(lambda row: row["BID"] - row.shift(1)["BID"], axis = 1)
    #print(data)
    features.append("diff")
    #features.append("returns")
    X = pd.DataFrame(data, columns=features)
    X = preprocessing.scale(X)
    y = returns
    y = preprocessing.scale(y)
    y = np.sign(y)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
    
    #Linear regression
    lr = LogisticRegression()
    m1 = lr.fit(X_train, y_train)
    
    #SVC 
    svc = SVC()
    m3 = svc.fit(X_train, y_train)
    
    #Adaboost classifier
    adb = AdaBoostClassifier(n_estimators=100, random_state=0)
    m4 = adb.fit(X_train, y_train)
    
    #Neural Net
    mlp = MLPClassifier(max_iter=2000)
    m5 = mlp.fit(X_train, y_train)
    
    #Random forest
    rfc = RandomForestClassifier()
    m6 = rfc.fit(X_train, y_train)
    
    return lr.score(X_test, y_test), svc.score(X_test, y_test), adb.score(X_test, y_test), mlp.score(X_test, y_test), rfc.score(X_test, y_test)



In [426]:
print("(Logistic regression, SVC, Adaboost, Neural Net, Random Forest)")
for t in tickers:
    print(t)
    print(classify(t))
    

(Logistic regression, SVC, Adaboost, Neural Net, Random Forest)
FB
(0.5135135135135135, 0.5135135135135135, 0.5675675675675675, 0.6216216216216216, 0.6216216216216216)
F
(0.5135135135135135, 0.5135135135135135, 0.4594594594594595, 0.40540540540540543, 0.5135135135135135)
GOOGL
(0.4864864864864865, 0.5135135135135135, 0.40540540540540543, 0.5945945945945946, 0.5675675675675675)
AMZN
(0.40540540540540543, 0.3783783783783784, 0.4594594594594595, 0.4594594594594595, 0.5135135135135135)
MSFT
(0.5135135135135135, 0.4864864864864865, 0.5135135135135135, 0.4594594594594595, 0.43243243243243246)
IBM
(0.5405405405405406, 0.5405405405405406, 0.43243243243243246, 0.5405405405405406, 0.3783783783783784)
BAC
(0.6486486486486487, 0.5675675675675675, 0.5405405405405406, 0.4594594594594595, 0.40540540540540543)
TSLA
(0.35135135135135137, 0.5135135135135135, 0.43243243243243246, 0.4594594594594595, 0.35135135135135137)
AAPL
(0.43243243243243246, 0.43243243243243246, 0.4594594594594595, 0.459459459459459

In [463]:
def profit(ticker):
    features = ["buy " + ticker, "sell " + ticker, ticker]
    data = getdf(features, ticker)
    data["diff"] = data.apply(lambda row: row["buy " + ticker] - row["sell " + ticker], axis = 1)
    #features.append("diff")
    X = pd.DataFrame(data, columns=features)
    #X = preprocessing.scale(X)
    y = data["BID"]
    #y = preprocessing.scale(y)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=True)
    
#     #Linear regression
#     lm = LinearRegression()
#     m1 = lm.fit(X_train, y_train)
    
#     #Ridge Regression, auto alpha
#     lr = RidgeCV()
#     m2 = lr.fit(X_train, y_train)
    
#     #SVR 
#     svr = SVR(gamma='scale', C=5.0, epsilon=0.1)
#     m3 = svr.fit(X_train, y_train)
    
#     #Adaboost regression
#     adb = AdaBoostRegressor()
#     m4 = adb.fit(X_train, y_train)
    
#     #Neural Net
#     mlp = MLPRegressor(max_iter=2000)
#     m5 = mlp.fit(X_train, y_train)
    #print(y)
    #Random forest
    rfr = RandomForestRegressor()
    m6 = rfr.fit(X_train, y_train)
    assert len(X_test)==len(y_test)
#     for i in range(len(X_test)):
#         print(y_test[i+1])
#         #print(str(rfr.predict([X_test[i]])) + " " + str(y_test[i]))
    earnings = 0
    numtrades = 0
    for index, row in X_test.iterrows():
        #print(index)
        #print(str(rfr.predict([row])) + " " + str(y_test[index]))
        buy = 0
        prediction = rfr.predict([row])
        i = 1
        while((index - i) not in y and i!= 100):
            i+=1
        if i == 100:
            break
        #print(index-i)
        if prediction > y[index-i]:
            buy = 1
        if buy == 1:
            earnings += y[index] - y[index - i]
        else:
            earnings += y[index-i] - y[index]
        numtrades += 1
    return earnings, numtrades
#     return lm.score(X_test, y_test), lr.score(X_test, y_test), svr.score(X_test, y_test), adb.score(X_test, y_test), mlp.score(X_test, y_test), rfr.score(X_test, y_test)




In [467]:
for t in tickers:
    print(t)
    print(profit(t))

FB


ResponseError: The request failed: Google returned a response with code 429.