Imports

In [149]:
import pandas as pd
import numpy as np


from sklearn.model_selection import train_test_split

#preprossors
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.impute import SimpleImputer

#models
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR



from sklearn.model_selection import RandomizedSearchCV

#Other
from os import path, getcwd

Open file

In [150]:
# path is ../sql/db/__NAMEOFCSV__.csv
pathToParent = path.dirname(getcwd())
csvFile = pathToParent + "/sqlite/db/Fintech.csv"
completeDF = pd.read_csv(csvFile, names=['date', 'daybefore', 'dayafter', 'monthafter', 'threemonthsafter', 'transcript' ])



Create Test and trianing sets

In [151]:
trainDF, testDF = train_test_split(completeDF, test_size=.2, random_state=435)

trainDF = trainDF.dropna()
testDF = testDF.dropna()

trainX = trainDF.drop(columns=['dayafter', 'monthafter', 'threemonthsafter'])

long_y_train = trainDF['threemonthsafter']
medium_y_train = trainDF['monthafter']
short_y_train = trainDF['dayafter']

testX = testDF.drop(columns=['dayafter', 'monthafter', 'threemonthsafter'])

long_y_test = testDF['threemonthsafter']
medium_y_test = testDF['monthafter']
short_y_test = testDF['dayafter']


Create Dummy models

In [152]:
shortDummy = DummyClassifier()
mediumDummy = DummyClassifier()
longDummy = DummyClassifier()

shortDummy.fit(trainX, short_y_train)
mediumDummy.fit(trainX, medium_y_train)
longDummy.fit(trainX, long_y_train)

print(shortDummy.predict(trainX))
print(mediumDummy.predict(testX))
print(longDummy.predict(testX))

[17.92 17.92 17.92 17.92 17.92 17.92 17.92 17.92 17.92 17.92 17.92 17.92
 17.92 17.92 17.92 17.92 17.92 17.92 17.92 17.92 17.92 17.92 17.92 17.92
 17.92 17.92 17.92 17.92 17.92 17.92 17.92 17.92 17.92 17.92 17.92 17.92
 17.92 17.92 17.92 17.92 17.92 17.92 17.92 17.92 17.92 17.92 17.92 17.92
 17.92 17.92 17.92 17.92 17.92 17.92 17.92 17.92 17.92 17.92 17.92 17.92
 17.92 17.92 17.92 17.92 17.92 17.92 17.92 17.92 17.92 17.92 17.92 17.92
 17.92 17.92 17.92 17.92 17.92 17.92 17.92 17.92 17.92 17.92 17.92 17.92
 17.92 17.92 17.92 17.92 17.92 17.92 17.92 17.92 17.92 17.92 17.92 17.92
 17.92 17.92 17.92 17.92 17.92 17.92 17.92 17.92 17.92 17.92 17.92 17.92
 17.92 17.92 17.92 17.92 17.92 17.92 17.92 17.92 17.92 17.92 17.92 17.92
 17.92 17.92 17.92 17.92 17.92 17.92 17.92 17.92 17.92 17.92 17.92 17.92
 17.92 17.92 17.92 17.92 17.92 17.92 17.92 17.92 17.92 17.92 17.92 17.92
 17.92 17.92 17.92 17.92 17.92 17.92 17.92 17.92 17.92 17.92 17.92 17.92
 17.92 17.92 17.92 17.92]
[19.71272727 19.71272727 

Create preprocessing

In [153]:
trainX.transcript = trainX['transcript'].values.astype('U')
testX.transcript = testX['transcript'].values.astype('U')

    



In [154]:
preprocessor = ColumnTransformer([('transcript', CountVectorizer(stop_words="english", strip_accents='ascii'),'transcript'),
                                ('daybefore','passthrough', ['daybefore']),
                                ('date', "drop", 'date')])
countvec = CountVectorizer(stop_words='english', strip_accents='ascii')
X_counts = countvec.fit_transform(trainX['transcript'])


Create pipelines

In [169]:
pipe = make_pipeline(
    preprocessor, 
    LinearRegression())

print(trainX)
pipe.fit(trainX, short_y_train)
print(testX['daybefore'].array)
print(pipe.predict(testX))
print(short_y_test.array)



           date  daybefore                                         transcript
122  1675296000      98.71  Welcome everyone. Thank you for standing by fo...
63   1540339200      24.18  Greetings and welcome to the AMD Third Quarter...
108  1564012800    1132.62  Good day ladies and gentlemen and welcome to t...
192  1603324800      53.14  Ladies and gentlemen thank you for standing by...
189  1579737600      61.04  Ladies and gentlemen thank you for standing by...
..          ...        ...                                                ...
91   1603324800      53.14  Ladies and gentlemen thank you for standing by...
68   1580169600      48.45  Greetings and welcome to the AMD fourth-quarte...
140  1651104000     155.91  Good day and welcome to the Apple Q2 FY 2022 e...
197  1643155200      51.01  Ladies and gentlemen thank you for standing by...
194  1619049600      62.79  Good day. Thank you for standing by. Welcome t...

[160 rows x 3 columns]
<PandasArray>
[    51.37,     204.4,    

Creating a grid search to find best hyperperameters. 

In [172]:
param_dist = {"columntransformer__transcript__max_features" : range(100, 10000, 100)}

random_search = RandomizedSearchCV(
    pipe, param_dist, cv=4, n_jobs=-1, scoring= "roc_auc")

shortModel = random_search.fit(trainX, short_y_train)





In [159]:
print(shortModel.predict(testX))

print(short_y_test.array)


[  54.18970662  209.87960768  103.80001781   52.7400392  1270.99999905
 1667.43984495  165.71012204  210.96117513  148.03017222  288.98980839
  139.39024525  148.19995956  129.89002078  127.82568984 1497.04997302
  324.44945043 3024.99992446   58.85001171  147.21491305  127.82568984
  307.98523706  282.10002917 2726.23871102   78.72995314   55.44014318
  411.53514717 1048.33020023   62.99987673  109.26000539  209.54992897
  210.96117513  161.84000856  286.25048486   78.46989485   75.49994674
 1275.99998104  136.70814311   21.49003785  136.70814311   98.18017652]
87       54.1900
128     209.8800
147     103.8000
85       52.7400
3      1271.0000
12     1667.4400
139     165.7100
154     207.6700
42      148.0300
56      288.9900
49      139.3900
142     148.2000
177     129.8900
176     121.6300
11     1497.0500
30      324.4500
118    3025.0000
92       58.8500
138     147.2150
75      121.6300
159     307.9850
59      282.1000
116    2726.2390
71       78.7300
182      55.4400
133   

In [170]:
param_dist = {"columntransformer__transcript__max_features" : range(100, 10000, 100)}

random_search = RandomizedSearchCV(
    pipe, param_dist, cv=4, n_jobs=-1, scoring= "roc_auc")

mediumModel = random_search.fit(trainX, medium_y_train)



In [171]:
print(mediumModel.predict(testX))

print(medium_y_test.array)

[  57.07492305  192.20186663  107.22314925   48.14134319 1244.68498153
 1726.30772386  169.24106453  203.81032341  149.99015927  291.07617006
  145.16982674  147.0033676   119.4318843   115.84449487 1531.35396244
  315.97630217 2736.46218717   58.48872767  152.85929257  115.84449487
  300.33334307  269.6454986  2731.9808749    81.58293418   53.78430467
  459.24621163 1062.89799417   69.66395365  116.33226714  191.05242277
  203.81032341  149.40114296  308.68985569   81.01334457   81.11078792
 1294.07722588  150.99525008   23.41909001  150.99525008   94.75984651]
<PandasArray>
[            57.075,  192.2022727272727, 107.22318181818184,
  48.14136363636363, 1244.6850000000002, 1726.3072863636364,
 169.24113636363634,  212.8215909090909,  149.9903181818182,
  291.0762954545455, 145.16954545454544, 147.00363636363633,
 119.43181818181816,  139.6440909090909, 1531.3538409090909,
 315.97636363636354,  2736.461877272728, 58.488740909090914,
  152.8593181818182,  139.6440909090909, 300.333318

In [173]:
param_dist = {"columntransformer__transcript__max_features" : range(100, 10000, 100)}

random_search = RandomizedSearchCV(
    pipe, param_dist, cv=4, n_jobs=-1, scoring= "roc_auc")

longModel = random_search.fit(trainX, long_y_train)



In [174]:
print(longModel.predict(testX))

print(long_y_test.array)

[ 5.80744640e+01  1.95574526e+02  1.14714363e+02  4.75415009e+01
  1.20514394e+03  1.75730917e+03  1.67397063e+02  2.04961273e+02
  1.79393927e-05  2.95644942e+02  1.52421961e+02  1.40037928e+02
  1.09747385e+02  9.18589664e+01  1.53002959e+03  2.82316087e+02
  2.67446449e+03  6.18429580e+01  1.65350827e+02  9.18589664e+01
  2.96104160e+02  2.63342605e+02  2.78719934e+03  8.87770982e+01
  5.32427177e+01  2.34553209e+02  1.06083273e+03  6.90197480e+01
  1.07613563e+02  1.69852194e+02  2.04961273e+02  1.45202836e+02
  3.43352793e+02 -5.68275675e-05  8.16439244e+01  1.34767958e+03
  1.87998037e+02  2.51138485e+01  1.87998037e+02  8.91262738e+01]
<PandasArray>
[         58.074375,       195.57453125, 114.71437500000002,
  47.54140625000001, 1205.1438461538455, 1757.3090703125004,
      167.397109375,  215.9439062500001,                0.0,
 295.64524687499994,       152.42109375, 140.03862500000002,
      109.747421875,      141.553515625,    1530.0292078125,
  282.3162499999999, 2674.4646

Test different models

In [175]:
pipe = make_pipeline(
    preprocessor,
    SVR())
pipe.fit(trainX, short_y_train)
print(pipe.predict(testX))
print(short_y_test.array)

[134.20115974 141.97467077 137.84293319 134.39529757 182.44430798
 182.13104907 140.20352379 142.55686067 139.3722675  146.13975821
 139.20891787 139.77374891 136.86688055 136.52717382 182.49779804
 147.32367126 171.18364818 134.85348607 139.64776856 136.52717382
 145.97943728 145.50468164 171.35512124 135.71528418 134.72032143
 150.5870323  181.45081703 134.85255594 137.49347046 142.6577931
 142.55686067 139.80094399 145.60991746 135.10172026 134.73133714
 183.02300183 137.44599236 135.44558709 137.44599236 135.70949538]
<PandasArray>
[   54.19,   209.88,    103.8,    52.74,   1271.0,  1667.44,   165.71,
   207.67,   148.03,   288.99,   139.39,    148.2,   129.89,   121.63,
  1497.05,   324.45,   3025.0,    58.85,  147.215,   121.63,  307.985,
    282.1, 2726.239,    78.73,    55.44,  411.535,  1048.33,  62.9999,
   109.26,   209.55,   207.67,   161.84,   286.25,    78.47,     75.5,
   1276.0,    96.43,    21.49,    96.43,    98.18]
Length: 40, dtype: float64


Create predictions

Add predictions to database