Imports

In [4]:
import pandas as pd
import numpy as np


from sklearn.model_selection import train_test_split

#preprossors
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.impute import SimpleImputer

#models
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR


from sklearn.model_selection import RandomizedSearchCV

#Other
from os import path, getcwd
import sql_functions

Open file

In [5]:
# path is ../sql/db/__NAMEOFCSV__.csv
pathToParent = path.dirname(getcwd())
csvFile = pathToParent + "/sqlite/db/Fintech.csv"
completeDF = pd.read_csv(csvFile, names=['name','date', 'daybefore', 'dayafter', 'monthafter', 'threemonthsafter', 'transcript' ])



Create Test and trianing sets

In [6]:
trainDF, testDF = train_test_split(completeDF, test_size=.2, random_state=435)

print(trainDF)
trainDF = trainDF.dropna()
testDF = testDF.dropna()

trainX = trainDF.drop(columns=['name','dayafter', 'monthafter', 'threemonthsafter'])

long_y_train = trainDF['threemonthsafter']
medium_y_train = trainDF['monthafter']
short_y_train = trainDF['dayafter']

testX = testDF.drop(columns=['name','dayafter', 'monthafter', 'threemonthsafter'])

long_y_test = testDF['threemonthsafter']
medium_y_test = testDF['monthafter']
short_y_test = testDF['dayafter']


         name        date  daybefore  dayafter   monthafter  threemonthsafter  \
22      Apple  1525132800    162.130    175.23   184.973636        187.764844   
96      Intel  1651104000     45.480     44.99    43.597182         40.736687   
49  Microsoft  1580256000    163.780    174.05   179.093636        166.017656   
37      Apple  1643241600    163.500    165.71   169.241136        167.397109   
30      Apple  1588204800    284.730    286.25   308.689545        343.352086   
..        ...         ...        ...       ...          ...               ...   
84      Intel  1556150400     58.520     52.74    48.141364         47.541406   
12     Google  1612224000   1844.585   2065.61  2062.099800       2127.138712   
89      Intel  1595462400     61.310     52.15    49.715909         50.560133   
69        AMD  1595894400     69.280     75.50    81.110686         81.644009   
66        AMD  1572307200     33.160     32.93    37.167273         42.420000   

                           

Create Dummy models

In [7]:
shortDummy = DummyClassifier()
mediumDummy = DummyClassifier()
longDummy = DummyClassifier()

shortDummy.fit(trainX, short_y_train)
mediumDummy.fit(trainX, medium_y_train)
longDummy.fit(trainX, long_y_train)

print(shortDummy.predict(trainX))
print(mediumDummy.predict(testX))
print(longDummy.predict(testX))

[21.49 21.49 21.49 21.49 21.49 21.49 21.49 21.49 21.49 21.49 21.49 21.49
 21.49 21.49 21.49 21.49 21.49 21.49 21.49 21.49 21.49 21.49 21.49 21.49
 21.49 21.49 21.49 21.49 21.49 21.49 21.49 21.49 21.49 21.49 21.49 21.49
 21.49 21.49 21.49 21.49 21.49 21.49 21.49 21.49 21.49 21.49 21.49 21.49
 21.49 21.49 21.49 21.49 21.49 21.49 21.49 21.49 21.49 21.49 21.49 21.49
 21.49 21.49 21.49 21.49 21.49 21.49 21.49 21.49 21.49 21.49 21.49 21.49
 21.49 21.49 21.49 21.49 21.49 21.49 21.49 21.49]
[23.41909091 23.41909091 23.41909091 23.41909091 23.41909091 23.41909091
 23.41909091 23.41909091 23.41909091 23.41909091 23.41909091 23.41909091
 23.41909091 23.41909091 23.41909091 23.41909091 23.41909091 23.41909091
 23.41909091 23.41909091]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


Create preprocessing

In [8]:
trainX.transcript = trainX['transcript'].values.astype('U')
testX.transcript = testX['transcript'].values.astype('U')


In [9]:
preprocessor = ColumnTransformer([('transcript', CountVectorizer(stop_words="english", strip_accents='ascii'),'transcript'),
                                ('daybefore', 'passthrough', ['daybefore']),
                                ('date', "drop", 'date')])
countvec = CountVectorizer(stop_words='english', strip_accents='ascii')
X_counts = countvec.fit_transform(trainX['transcript'])


Create pipelines

In [10]:
pipe = make_pipeline(
    preprocessor, 
    LinearRegression())

pipe.fit(trainX, short_y_train)

predictions = pipe.predict(testX)

for i in range(len(testX['daybefore'].array)):
    print(testX['daybefore'].array[i], predictions[i], short_y_test.array[i])


375.0 379.8797295403162 411.535
24.18 23.511833388127208 17.92
261.66 257.6807095839688 256.078
27.9 24.864648850165132 28.95
1115.0 1146.4553957000803 1048.33
74.23 97.03745413447157 78.47
58.27 56.977568362298996 56.2
55.25 30.691547037732438 50.39
52.21 15.986681217243834 48.76
51.86 41.77616837643939 55.44
85.655 90.28306706575125 98.18
213.85 205.56947055738672 207.67
48.41 34.9270369417153 46.85
34.14 17.03228641698895 32.08
61.04 79.67131277340478 66.57
55.18 36.47591641830203 54.58
986.27 987.1126955771502 1030.99
143.6 146.6633308641827 139.52
150.96 136.72143827416141 148.2
310.06 322.3008750906864 324.45


Creating a grid search to find best hyperperameters. 

In [11]:
param_dist = {"columntransformer__transcript__max_features" : range(100, 10000, 100)}

random_search = RandomizedSearchCV(
    pipe, param_dist, cv=4, n_jobs=-1, scoring= "roc_auc")

shortModel = random_search.fit(trainX, short_y_train)





In [12]:
print(shortModel.predict(testX))

print(short_y_test.array)


[ 379.88235651   23.62612394  257.62292476   24.82941666 1146.45551545
   97.30710897   56.89391847   30.47904677   15.53547135   41.69463088
   90.29759858  205.46511211   34.90073871   16.91281492   79.8192838
   36.24891806  987.24370102  146.69326706  136.6131929   322.45417151]
<PandasArray>
[411.535,   17.92, 256.078,   28.95, 1048.33,   78.47,    56.2,   50.39,
   48.76,   55.44,   98.18,  207.67,   46.85,   32.08,   66.57,   54.58,
 1030.99,  139.52,   148.2,  324.45]
Length: 20, dtype: float64


In [13]:
param_dist = {"columntransformer__transcript__max_features" : range(100, 10000, 100)}

random_search = RandomizedSearchCV(
    pipe, param_dist, cv=4, n_jobs=-1, scoring= "roc_auc")

mediumModel = random_search.fit(trainX, medium_y_train)



In [14]:
print(mediumModel.predict(testX))

print(medium_y_test.array)

[ 374.82747261   28.25174789  252.63193046   28.00056604 1121.94112269
   77.63754836   49.21138319   85.0435521    52.92111991   51.06239292
   89.12944363  203.58310436   10.7670878    15.57090921   38.78651019
   51.95085557  936.20407049  127.84234075  118.19958347  333.24207903]
<PandasArray>
[ 459.2465681818181, 19.712727272727275, 248.84054545454543,
  27.25909090909091,  1062.898181818182,  81.01350000000001,
  59.50045454545455,  50.26505909090909, 48.388181818181806,
  53.78409090909091,  94.75977272727272,  212.8215909090909,
  49.24727272727273, 31.009318181818177,  66.32863636363638,
  53.64429090909091, 1040.4272727272726, 133.64136363636365,
 147.00363636363633, 315.97636363636354]
Length: 20, dtype: float64


In [15]:
param_dist = {"columntransformer__transcript__max_features" : range(100, 10000, 100)}

random_search = RandomizedSearchCV(
    pipe, param_dist, cv=4, n_jobs=-1, scoring= "roc_auc")

longModel = random_search.fit(trainX, long_y_train)



In [16]:
print(longModel.predict(testX))

print(long_y_test.array)

[ 3.93787774e+02  3.50087503e+01  2.11011679e+02  3.39897017e+01
  1.11085975e+03  3.43603062e+01  7.16622457e+01  7.57516933e+01
  5.50345839e+01  6.74769813e+01  6.79436676e+01  1.89509604e+02
 -9.53159799e+00  3.63759096e-03  3.19222743e+01  2.23439114e+01
  9.27708710e+02  1.49832260e+02  9.07588367e+01  3.51052586e+02]
<PandasArray>
[234.55319531250004,  19.63421875000001,  260.1805859374999,
 30.051484375000005,          1060.8325,                0.0,
  60.16468749999998,  51.20227499999999,  47.19499999999999,
  53.24281250000001,       89.126171875,  215.9439062500001,
  52.80156250000002, 30.533984374999992,  58.67562500000001,
       53.787384375, 1065.9328124999995,       128.47078125,
 140.03862500000002,  282.3162499999999]
Length: 20, dtype: float64


Test different models

In [17]:
pipe = make_pipeline(
    preprocessor,
    SVR())
pipe.fit(trainX, short_y_train)
print(pipe.predict(testX))
print(short_y_test.array)

[152.4724875  145.4837891  149.8579085  146.1325234  169.42195064
 146.01967473 145.76486653 145.82714928 145.67121297 145.77380447
 146.20710028 148.88246113 145.71428283 145.48426231 145.78199318
 145.65042171 167.40551806 147.61191274 147.73872798 150.99736455]
<PandasArray>
[411.535,   17.92, 256.078,   28.95, 1048.33,   78.47,    56.2,   50.39,
   48.76,   55.44,   98.18,  207.67,   46.85,   32.08,   66.57,   54.58,
 1030.99,  139.52,   148.2,  324.45]
Length: 20, dtype: float64


Create predictions

Add predictions to database

In [18]:
completeDF.sort_values('date', ascending=False)
companyCount = completeDF['name'].nunique()

predictionDict = {}
rowIndex = 1
while len(predictionDict) < companyCount:
    row = completeDF.iloc[rowIndex].to_frame().T
    if row['name'].iloc[0] not in predictionDict:
        predictions = [shortModel.predict(row)[0], mediumModel.predict(row)[0], longModel.predict(row)[0]]
        predictionDict[row['name'].iloc[0]] = predictions
    
    rowIndex += 1



In [22]:
for key, value in predictionDict.items():
    print(key, value)
    sql_functions.add_prediction_short(key, value[0])
    sql_functions.add_prediction_medium(key, value[1])
    sql_functions.add_prediction_long(key, value[2])
    

Google [1127.4198764557289, 1092.912272727273, 1076.083964358762]
Apple [159.10004432122025, 168.30809523809413, 170.81934113113525]
Microsoft [64.3550633710329, 64.1888636363618, 65.01838691683986]
AMD [23.626123941795385, 28.251747894871457, 35.00875029565694]
Intel [41.694630877609185, 51.062392919944756, 67.47698129361858]
