Imports

In [48]:
import pandas as pd
import numpy as np


from sklearn.model_selection import train_test_split

#preprossors
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.impute import SimpleImputer

#models
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR


from sklearn.model_selection import RandomizedSearchCV

#Other
from os import path, getcwd
import sql_functions


Open file

In [49]:
# path is ../sql/db/__NAMEOFCSV__.csv
pathToParent = path.dirname(getcwd())
csvFile = pathToParent + "/sqlite/db/Fintech.csv"
completeDF = pd.read_csv(csvFile, names=['name','date', 'daybefore', 'dayafter', 'monthafter', 'threemonthsafter', 'transcript' ])



Create Test and trianing sets

In [50]:
trainDF, testDF = train_test_split(completeDF, test_size=.2, random_state=435)

print(trainDF)
trainDF = trainDF.dropna()
testDF = testDF.dropna()

trainX = trainDF.drop(columns=['name','dayafter', 'monthafter', 'threemonthsafter'])

long_y_train = trainDF['threemonthsafter']
medium_y_train = trainDF['monthafter']
short_y_train = trainDF['dayafter']

testX = testDF.drop(columns=['name','dayafter', 'monthafter', 'threemonthsafter'])

long_y_test = testDF['threemonthsafter']
medium_y_test = testDF['monthafter']
short_y_test = testDF['dayafter']


         name        date  daybefore  dayafter   monthafter  threemonthsafter  \
22      Apple  1525132800    162.130    175.23   184.973636        187.764844   
96      Intel  1651104000     45.480     44.99    43.597182         40.736687   
49  Microsoft  1580256000    163.780    174.05   179.093636        166.017656   
37      Apple  1643241600    163.500    165.71   169.241136        167.397109   
30      Apple  1588204800    284.730    286.25   308.689545        343.352086   
..        ...         ...        ...       ...          ...               ...   
84      Intel  1556150400     58.520     52.74    48.141364         47.541406   
12     Google  1612224000   1844.585   2065.61  2062.099800       2127.138712   
89      Intel  1595462400     61.310     52.15    49.715909         50.560133   
69        AMD  1595894400     69.280     75.50    81.110686         81.644009   
66        AMD  1572307200     33.160     32.93    37.167273         42.420000   

                           

Create Dummy models

In [51]:
shortDummy = DummyClassifier()
mediumDummy = DummyClassifier()
longDummy = DummyClassifier()

shortDummy.fit(trainX, short_y_train)
mediumDummy.fit(trainX, medium_y_train)
longDummy.fit(trainX, long_y_train)

print(shortDummy.predict(trainX))
print(mediumDummy.predict(testX))
print(longDummy.predict(testX))

[21.49 21.49 21.49 21.49 21.49 21.49 21.49 21.49 21.49 21.49 21.49 21.49
 21.49 21.49 21.49 21.49 21.49 21.49 21.49 21.49 21.49 21.49 21.49 21.49
 21.49 21.49 21.49 21.49 21.49 21.49 21.49 21.49 21.49 21.49 21.49 21.49
 21.49 21.49 21.49 21.49 21.49 21.49 21.49 21.49 21.49 21.49 21.49 21.49
 21.49 21.49 21.49 21.49 21.49 21.49 21.49 21.49 21.49 21.49 21.49 21.49
 21.49 21.49 21.49 21.49 21.49 21.49 21.49 21.49 21.49 21.49 21.49 21.49
 21.49 21.49 21.49 21.49 21.49 21.49 21.49 21.49]
[23.41909091 23.41909091 23.41909091 23.41909091 23.41909091 23.41909091
 23.41909091 23.41909091 23.41909091 23.41909091 23.41909091 23.41909091
 23.41909091 23.41909091 23.41909091 23.41909091 23.41909091 23.41909091
 23.41909091 23.41909091]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


Create preprocessing

In [52]:
trainX.transcript = trainX['transcript'].values.astype('U')
testX.transcript = testX['transcript'].values.astype('U')


In [53]:
preprocessor = ColumnTransformer([('transcript', CountVectorizer(stop_words="english", strip_accents='ascii'),'transcript'),
                                ('daybefore', 'passthrough', ['daybefore']),
                                ('date', "drop", 'date')])

Create pipelines

In [54]:
pipe = make_pipeline(
    preprocessor, 
    LinearRegression())

pipe.fit(trainX, short_y_train)

predictions = pipe.predict(testX)

for i in range(len(testX['daybefore'].array)):
    print(testX['daybefore'].array[i], predictions[i], short_y_test.array[i])


375.0 379.8797295403162 411.535
24.18 23.511833388127208 17.92
261.66 257.6807095839688 256.078
27.9 24.864648850165132 28.95
1115.0 1146.4553957000803 1048.33
74.23 97.03745413447157 78.47
58.27 56.977568362298996 56.2
55.25 30.691547037732438 50.39
52.21 15.986681217243834 48.76
51.86 41.77616837643939 55.44
85.655 90.28306706575125 98.18
213.85 205.56947055738672 207.67
48.41 34.9270369417153 46.85
34.14 17.03228641698895 32.08
61.04 79.67131277340478 66.57
55.18 36.47591641830203 54.58
986.27 987.1126955771502 1030.99
143.6 146.6633308641827 139.52
150.96 136.72143827416141 148.2
310.06 322.3008750906864 324.45


Creating a grid search to find best hyperperameters. 

In [55]:
param_dist = {"columntransformer__transcript__max_features" : range(100, 10000, 100)}

random_search = RandomizedSearchCV(
    pipe, param_dist, cv=4, n_jobs=-1, scoring= "roc_auc")

shortModel = random_search.fit(trainX, short_y_train)





In [56]:
print(shortModel.predict(testX))

print(short_y_test.array)


[ 379.89633428   23.66896605  257.61645836   24.82126382 1146.54210829
   97.35425085   56.89891301   30.47710193   15.44188942   41.67113777
   90.31455529  205.41849847   34.86986636   16.85989928   79.83165756
   36.18136789  987.14485226  146.68210548  136.60999325  322.48438559]
<PandasArray>
[411.535,   17.92, 256.078,   28.95, 1048.33,   78.47,    56.2,   50.39,
   48.76,   55.44,   98.18,  207.67,   46.85,   32.08,   66.57,   54.58,
 1030.99,  139.52,   148.2,  324.45]
Length: 20, dtype: float64


In [57]:
param_dist = {"columntransformer__transcript__max_features" : range(100, 10000, 100)}

random_search = RandomizedSearchCV(
    pipe, param_dist, cv=4, n_jobs=-1, scoring= "roc_auc")

mediumModel = random_search.fit(trainX, medium_y_train)



In [58]:
print(mediumModel.predict(testX))

print(medium_y_test.array)

[ 375.66716728   28.00009764  254.18470531   28.17869775 1124.03906393
   78.73554482   49.44028691   81.18414741   52.64240236   51.01830501
   89.68426249  204.57905221   11.52515748   17.37700058   40.05062273
   50.96752068  939.57260384  130.18527278  120.15185551  330.87475671]
<PandasArray>
[ 459.2465681818181, 19.712727272727275, 248.84054545454543,
  27.25909090909091,  1062.898181818182,  81.01350000000001,
  59.50045454545455,  50.26505909090909, 48.388181818181806,
  53.78409090909091,  94.75977272727272,  212.8215909090909,
  49.24727272727273, 31.009318181818177,  66.32863636363638,
  53.64429090909091, 1040.4272727272726, 133.64136363636365,
 147.00363636363633, 315.97636363636354]
Length: 20, dtype: float64


There are some entries where there are zero values for the three month prediction, I will drop all of those rows, since none of these stocks drop to zero. I think this happens if 3 months have'nt passed since the last earnings call.

In [59]:
longTrainX = trainX
longTrainX['threeMonthsAfter'] = long_y_train
print(trainX)
print(trainX.shape)
longTrainX = longTrainX[longTrainX['threeMonthsAfter'] != 0]
long_y_train = longTrainX['threeMonthsAfter']
longTrainX = longTrainX.drop(columns=["threeMonthsAfter"])

          date  daybefore                                         transcript  \
22  1525132800    162.130  Good day everyone and welcome to the Apple Inc...   
96  1651104000     45.480  Good day ladies and gentlemen thank you for st...   
49  1580256000    163.780  Welcome to the Microsoft Fiscal Year 2020 Seco...   
37  1643241600    163.500  Good day and welcome to the Apple Q1 FY 2022 e...   
30  1588204800    284.730  Good day everyone. Welcome to the Apple Inc. S...   
..         ...        ...                                                ...   
84  1556150400     58.520  Good day ladies and gentlemen and welcome to t...   
12  1612224000   1844.585  Welcome everyone and thank you for standing by...   
89  1595462400     61.310  Ladies and gentlemen thank you for standing by...   
69  1595894400     69.280  Hello and welcome to the AMD second-quarter 20...   
66  1572307200     33.160  Greetings and welcome to the Advanced Micro De...   

    threeMonthsAfter  
22        187.76

In [60]:
param_dist = {"columntransformer__transcript__max_features" : range(100, 10000, 100)}

random_search = RandomizedSearchCV(
    pipe, param_dist, cv=4, n_jobs=-1, scoring= "roc_auc")

longModel = random_search.fit(longTrainX, long_y_train)



In [61]:
print(longModel.predict(testX))

print(long_y_test.array)

[ 381.25165939   29.07578411  236.2537527    33.27853686 1105.96614158
   60.86516999   77.73495803   81.19255115   51.35323093   62.94603212
   78.19568718  203.65436989   -2.04945868    1.88473052   38.90052798
   40.45371052  929.81089135  147.80049434  115.09215832  351.81412666]
<PandasArray>
[234.55319531250004,  19.63421875000001,  260.1805859374999,
 30.051484375000005,          1060.8325,                0.0,
  60.16468749999998,  51.20227499999999,  47.19499999999999,
  53.24281250000001,       89.126171875,  215.9439062500001,
  52.80156250000002, 30.533984374999992,  58.67562500000001,
       53.787384375, 1065.9328124999995,       128.47078125,
 140.03862500000002,  282.3162499999999]
Length: 20, dtype: float64


Test different models

In [62]:
pipe = make_pipeline(
    preprocessor,
    SVR())
pipe.fit(trainX, short_y_train)
print(pipe.predict(testX))
print(short_y_test.array)

[152.4724875  145.4837891  149.8579085  146.1325234  169.42195064
 146.01967473 145.76486653 145.82714928 145.67121297 145.77380447
 146.20710028 148.88246113 145.71428283 145.48426231 145.78199318
 145.65042171 167.40551806 147.61191274 147.73872798 150.99736455]
<PandasArray>
[411.535,   17.92, 256.078,   28.95, 1048.33,   78.47,    56.2,   50.39,
   48.76,   55.44,   98.18,  207.67,   46.85,   32.08,   66.57,   54.58,
 1030.99,  139.52,   148.2,  324.45]
Length: 20, dtype: float64


Create predictions

Add predictions to database

In [63]:
completeDF.sort_values('date', ascending=False)
companyCount = completeDF['name'].nunique()

predictionDict = {}
rowIndex = 1
while len(predictionDict) < companyCount:
    row = completeDF.iloc[rowIndex].to_frame().T
    if row['name'].iloc[0] not in predictionDict:
        predictions = [shortModel.predict(row)[0], mediumModel.predict(row)[0], longModel.predict(row)[0]]
        predictionDict[row['name'].iloc[0]] = predictions
    
    rowIndex += 1



In [64]:
for key, value in predictionDict.items():
    print(key, value)
    sql_functions.add_prediction_short(key, value[0])
    sql_functions.add_prediction_medium(key, value[1])
    sql_functions.add_prediction_long(key, value[2])
    

Google [1127.419883466174, 1092.9122880205753, 1076.0840625000012]
Apple [159.10002073992877, 168.3079497849169, 170.81984374999976]
Microsoft [64.35503683904122, 64.1888529450504, 65.01851562499998]
AMD [23.668966053618757, 28.000097635220087, 29.075784105629253]
Intel [41.67113777256047, 51.01830500602344, 62.946032122230605]
