In [ ]:
from pytrends.request import TrendReq
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [ ]:
# Shared pytrends client used by getdata(); created once for the whole notebook.
# timeout is passed as a (10, 25) tuple — presumably (connect, read) seconds; confirm against pytrends docs.
pytrends = TrendReq(hl='en-US', tz=360, timeout=(10,25))

In [ ]:
def getdata(term):
    """Return the pytrends interest-over-time frame for a single search term.

    The request is built on the module-level ``pytrends`` client with
    ``cat=0``, ``timeframe='today 5-y'`` and empty ``geo``/``gprop``.

    Args:
        term: The search keyword to look up.

    Returns:
        Whatever ``pytrends.interest_over_time()`` returns for that payload.
    """
    pytrends.build_payload([term], cat=0, timeframe='today 5-y', geo='', gprop='')
    return pytrends.interest_over_time()

In [ ]:
def plotdata(term):
    """Fetch the trends frame for ``term`` via ``getdata`` and plot it."""
    getdata(term).plot()

In [ ]:
# Load the stock table from a path relative to the notebook's working directory.
# Later cells read its 'date', 'TICKER' and 'BID' columns.
stocks = pd.read_csv("data/daily_stocks.csv")

In [ ]:
# 'date' is compared as a number; find_date_string slices it as YYYYMMDD, so this
# threshold keeps rows dated 2015-01-01 or later.
newstocks = stocks[stocks['date']>20150000] # Restrict to data from 2015 onwards

In [ ]:
# Bare expression: shows the filtered frame as the notebook cell's output.
newstocks

In [ ]:
def find_date_string(row):
    """Return the calendar day before ``row['date']`` formatted ``YYYY-MM-DD``.

    ``row['date']`` is an integer-encoded date of the form ``YYYYMMDD``.
    The previous day is computed with real calendar arithmetic; simply
    subtracting 1 from the integer produced invalid strings such as
    ``2015-03-00`` on the first day of a month, and those rows could then
    never match a Google Trends date.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with a ``'date'`` entry.

    Returns:
        The previous calendar day as a ``YYYY-MM-DD`` string.

    Raises:
        ValueError: If ``row['date']`` is not a valid ``YYYYMMDD`` date.
    """
    from datetime import datetime, timedelta

    # int() normalises numpy integer types before formatting.
    current_day = datetime.strptime(str(int(row['date'])), "%Y%m%d")
    return (current_day - timedelta(days=1)).strftime("%Y-%m-%d")

In [ ]:
def getdf(searches, ticker):
    """Join one ticker's stock rows with Google Trends interest values.

    Rows of ``newstocks`` whose ``TICKER`` equals ``ticker`` are tagged with
    a ``date_string`` column (see ``find_date_string``), restricted to the
    rows whose ``date_string`` appears in the trends index, and extended with
    one column per search term holding the trends value for that date.

    Args:
        searches: Non-empty sequence of search terms; each becomes a column.
        ticker: Value to match against the ``TICKER`` column.

    Returns:
        A new DataFrame (independent of ``newstocks``) with the stock columns,
        ``date_string``, and one column per entry in ``searches``.
    """
    # Copy so the column assignments below write to an independent frame
    # instead of a slice of ``newstocks`` (avoids SettingWithCopyWarning).
    df = newstocks[newstocks["TICKER"] == ticker].copy()
    # find_date_string only reads the 'date' entry, so map over that column;
    # unlike a row-wise apply this also yields a proper column on an empty frame.
    df['date_string'] = df['date'].map(lambda d: find_date_string({'date': d}))

    # One trends request per term; overlapping column names from later
    # requests are disambiguated with a per-term suffix.
    trends_data = getdata(searches[0])
    for term in searches[1:]:
        trends_data = trends_data.join(getdata(term), rsuffix='_' + term)
    print(trends_data)

    # Trends index values rendered as 'YYYY-MM-DD' to match 'date_string'.
    day_labels = [str(stamp)[:10] for stamp in trends_data.index.values]
    df = df[df['date_string'].isin(set(day_labels))].copy()

    for term in searches:
        # Every remaining date_string is a known day label, so the lookup
        # finds a value for each row.
        interest_by_day = dict(zip(day_labels, trends_data[term]))
        df[term] = df['date_string'].map(interest_by_day)
    return df

In [ ]:
from sklearn.linear_model import *
from sklearn.model_selection import train_test_split
from sklearn.svm import *
from sklearn.ensemble import *
from sklearn.neural_network import *
from sklearn import preprocessing

In [ ]:
def regressions(ticker, pct):
    """Fit six regressors on trends features and return their test scores.

    Features are the trends columns ``"buy <ticker>"``, ``"sell <ticker>"``
    and ``"<ticker>"`` from ``getdf``. The target is the percentage change
    of ``BID`` when ``pct`` is truthy, otherwise ``BID`` itself.

    Args:
        ticker: Ticker symbol, used both for the stock rows and search terms.
        pct: Truthy to predict returns, falsy to predict the bid level.

    Returns:
        A 6-tuple of ``score`` values on the held-out 20% split, in the order
        (LinearRegression, RidgeCV, SVR, AdaBoostRegressor, MLPRegressor,
        RandomForestRegressor).
    """
    buy_col = "buy " + ticker
    sell_col = "sell " + ticker
    features = [buy_col, sell_col, ticker]

    data = getdf(features, ticker)
    # "diff" is stored on the frame but is not part of ``features`` here.
    data["diff"] = data.apply(lambda r: r[buy_col] - r[sell_col], axis=1)
    # The first row has no predecessor, so its return is filled with 0.
    returns = data["BID"].pct_change().fillna(0)

    # NOTE(review): features and target are standardised on the full data
    # set before the train/test split.
    X = preprocessing.scale(pd.DataFrame(data, columns=features))
    target = returns if pct else data["BID"]
    y = preprocessing.scale(target)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    # Order matters: it defines the order of the returned scores.
    models = (
        LinearRegression(),
        RidgeCV(),  # ridge regression with built-in alpha selection
        SVR(gamma='scale', C=5.0, epsilon=0.1),
        AdaBoostRegressor(),
        MLPRegressor(max_iter=2000),
        RandomForestRegressor(),
    )
    for model in models:
        model.fit(X_train, y_train)

    return tuple(model.score(X_test, y_test) for model in models)




In [ ]:
tickers = ["FB", "F", "GOOGL", "AMZN", "MSFT", "IBM", "BAC", "TSLA", "AAPL", "NFLX"]

# The header must list the models in the same order as the 6-tuple returned by
# regressions(); the Ridge entry was previously missing, which shifted every
# label after the first by one position.
print("(Linear regression, Ridge regression, SVR, Adaboost, Neural Net, Random Forest regressor)")

for t in tickers:
    print(t)
    # Second argument 0 (falsy) selects the bid level, not returns, as target.
    print(regressions(t, 0))

In [ ]:
import warnings
# Suppress every warning category for all cells executed after this one.
warnings.filterwarnings('ignore')

In [ ]:
def classify(ticker):
    """Fit five classifiers on trends features and return their test accuracy.

    Features are the trends columns ``"buy <ticker>"``, ``"sell <ticker>"``,
    ``"<ticker>"`` from ``getdf`` plus their buy-minus-sell ``"diff"``.
    Labels are ``np.sign`` of the standardised ``BID`` percentage changes.

    Args:
        ticker: Ticker symbol, used both for the stock rows and search terms.

    Returns:
        A 5-tuple of ``score`` values on the last 20% of rows (the split is
        not shuffled), in the order (LogisticRegression, SVC,
        AdaBoostClassifier, MLPClassifier, RandomForestClassifier).
    """
    buy_col = "buy " + ticker
    sell_col = "sell " + ticker
    search_terms = [buy_col, sell_col, ticker]

    data = getdf(search_terms, ticker)
    data["diff"] = data.apply(lambda r: r[buy_col] - r[sell_col], axis=1)
    # The first row has no predecessor, so its return is filled with 0.
    returns = data["BID"].pct_change().fillna(0)

    feature_columns = search_terms + ["diff"]
    X = preprocessing.scale(pd.DataFrame(data, columns=feature_columns))
    # NOTE(review): the sign is taken after standardising, so a label reflects
    # the return relative to the mean return, not relative to zero — confirm
    # this is intended.
    y = np.sign(preprocessing.scale(returns))
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, shuffle=False
    )

    # Order matters: it defines the order of the returned scores.
    models = (
        LogisticRegression(),
        SVC(),
        AdaBoostClassifier(n_estimators=100, random_state=0),
        MLPClassifier(max_iter=2000),
        RandomForestClassifier(),
    )
    for model in models:
        model.fit(X_train, y_train)

    return tuple(model.score(X_test, y_test) for model in models)



In [ ]:
# Header lists the models in the same order as the 5-tuple returned by classify().
print("(Logistic regression, SVC, Adaboost, Neural Net, Random Forest)")
# One trends fetch and model fit per ticker; prints the ticker, then its scores.
for t in tickers:
    print(t)
    print(classify(t))
    

In [ ]:
def profit(ticker):
    """Simulate trading one ticker on random-forest bid predictions.

    A ``RandomForestRegressor`` is fitted on the trends columns
    ``"buy <ticker>"``, ``"sell <ticker>"`` and ``"<ticker>"`` to predict
    ``BID``. For every held-out row, the nearest earlier row label (within 99
    labels) supplies the previous bid. If the prediction exceeds that bid the
    trade earns ``current - previous``; otherwise it earns
    ``previous - current``.

    Held-out rows with no earlier bid in range are skipped. Previously such a
    row aborted the whole loop, and because the test rows are shuffled this
    truncated the simulation at an arbitrary point.

    Args:
        ticker: Ticker symbol, used both for the stock rows and search terms.

    Returns:
        ``(earnings, numtrades)``: the summed earnings and the number of
        simulated trades.
    """
    max_lookback = 100  # exclusive bound: labels index-1 .. index-99 are tried

    features = ["buy " + ticker, "sell " + ticker, ticker]
    data = getdf(features, ticker)
    X = pd.DataFrame(data, columns=features)
    y = data["BID"]
    X_train, X_test, y_train, _y_test = train_test_split(
        X, y, test_size=0.25, shuffle=True
    )

    rfr = RandomForestRegressor()
    rfr.fit(X_train, y_train)

    # Predict all held-out rows in one call; results line up with X_test.index.
    predictions = rfr.predict(X_test)
    known_labels = y.index

    earnings = 0
    numtrades = 0
    for index, prediction in zip(X_test.index, predictions):
        # Row labels have gaps after filtering, so walk back to the closest
        # earlier label that has a bid (train or test row).
        previous_index = next(
            (
                index - back
                for back in range(1, max_lookback)
                if (index - back) in known_labels
            ),
            None,
        )
        if previous_index is None:
            continue

        previous_bid = y.loc[previous_index]
        current_bid = y.loc[index]
        if prediction > previous_bid:
            # Predicted rise: buy at the previous bid.
            earnings += current_bid - previous_bid
        else:
            # Predicted fall (or no change): take the opposite position.
            earnings += previous_bid - current_bid
        numtrades += 1
    return earnings, numtrades




In [ ]:
# For each ticker, print the symbol followed by the (earnings, numtrades) tuple from profit().
for t in tickers:
    print(t)
    print(profit(t))