Imports

In [56]:
import pandas as pd
import numpy as np
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import train_test_split
from os import path, getcwd
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer


Open file

In [57]:
# path is ../sql/db/__NAMEOFCSV__.csv
pathToParent = path.dirname(getcwd())
csvFile = pathToParent + "/sqlite/db/DummyFintech.csv"
completeDF = pd.read_csv(csvFile, header = 0)




Create Test and trianing sets

In [58]:
trainDF, testDF = train_test_split(completeDF, test_size=.2, random_state=435)


trainX = trainDF.drop(columns=['dayafter', 'monthafter', 'threemonthsafter'])

long_y_train = trainDF['threemonthsafter']
medium_y_train = trainDF['monthafter']
short_y_train = trainDF['dayafter']

testX = testDF.drop(columns=['dayafter', 'monthafter', 'threemonthsafter'])

long_y_test = testDF['threemonthsafter']
medium_y_test = testDF['monthafter']
short_y_test = testDF['dayafter']

print(trainX)

         date  daybefore                                         transcript
3  1681147907        3.0  transcript thisly is something bad text at lea...
8  1681147907      105.0  transcript transcript transcript transcript ca...
6  1681147907      102.0  transcript transcript transcript transcript ca...
0  1681147907        2.0  transcript this is my trancript cat cat dog do...
4  1681147907      101.8  transcript transcript transcript transcript ca...
1  1681147907        1.5  transcript this is this transcript what what c...
5  1681147907      101.1       transcript transcript this that cat dog that
2  1681147907       10.0  transcript this is your transcript transcript ...


Create Dummy models

In [59]:
shortDummy = DummyClassifier()
mediumDummy = DummyClassifier()
longDummy = DummyClassifier()

shortDummy.fit(trainX, short_y_train)
mediumDummy.fit(trainX, medium_y_train)
longDummy.fit(trainX, long_y_train)

print(shortDummy.predict(trainX))
print(mediumDummy.predict(testX))
print(longDummy.predict(testX))

['102.2' '102.2' '102.2' '102.2' '102.2' '102.2' '102.2' '102.2']
[3. 3.]
[4. 4.]


Create preprocessing

In [60]:
preprocessor = ColumnTransformer([('transcript', CountVectorizer(stop_words="english", strip_accents='ascii'),'transcript'),
                                ('daybefore/date', StandardScaler(), ['daybefore', 'date'])])


Create pipelines

In [64]:
pipe = make_pipeline(
    preprocessor, 
    LinearRegression())

pipe.fit(trainX, short_y_train)
print(pipe.predict(testX))
print(testX)
print(short_y_test)

[100.8303644  102.95789464]
         date  daybefore                                         transcript
7  1681147907      101.0  transcript transcript transcript transcript ca...
9  1681147907      103.0  transcript transcript transcript this that cat...
7      101.2
9    103.3.2
Name: dayafter, dtype: object


Test different models

Creating a grid search to find best hyperperameters. 

Create predictions

Add predictions to database