In [1]:
import pandas as pd
import numpy as np

In [2]:
dataset = pd.read_csv('dataset.csv')

print(dataset.head())


     price  target  sentiment
0  11873.4       1  -0.100000
1  11761.0       0   0.579167
2  11792.5       1   0.516667
3  11591.0       1   0.800000
4  11892.0       1  -0.376984


In [3]:
x = dataset.loc[:,['price','sentiment']]
print(x.head())

     price  sentiment
0  11873.4  -0.100000
1  11761.0   0.579167
2  11792.5   0.516667
3  11591.0   0.800000
4  11892.0  -0.376984


In [4]:
y = dataset.loc[:,'target']
print('0 implies price decreased in compared to the previous day. 1 implies price increased')
print(y.head())

0 implies price decreased in compared to the previous day. 1 implies price increased
0    1
1    0
2    1
3    1
4    1
Name: target, dtype: int64


In [5]:
# Checking for class imbalance.
print(y.value_counts())

1    122
0    103
Name: target, dtype: int64


In [6]:
from sklearn.model_selection import train_test_split

In [7]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.33)

In [8]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

In [9]:
from sklearn.linear_model import LogisticRegression

In [10]:
# Tested multiple solver-penalty combinations.  liblinear with l2 was found to be consitently better

tuned_parameter = [{'C': [0.1,0.5,1,5,10,20,50,100]}]

clf_LR = GridSearchCV(LogisticRegression(max_iter=50000, solver='liblinear', penalty='l2'),
                           tuned_parameter, scoring='f1_macro')

clf_LR.fit(x_train, y_train)
print("Best parameters set found on training set:\n")
print(clf_LR.best_params_)
print()
    
    
y_true, y_pred = y_test, clf_LR.predict(x_test)

print("Detailed classification report:\n")
print(classification_report(y_true, y_pred))
print()

Best parameters set found on training set:

{'C': 10}

Detailed classification report:

              precision    recall  f1-score   support

           0       0.53      0.34      0.42        29
           1       0.66      0.80      0.73        46

    accuracy                           0.63        75
   macro avg       0.59      0.57      0.57        75
weighted avg       0.61      0.63      0.61        75




In [11]:
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [12]:
# For SVM's, scaling of the data is required
scaler = StandardScaler()
x_scaled_train = scaler.fit_transform(x_train)
x_scaled_test = scaler.transform(x_test)

In [13]:
# SVC, NuSVC and LinearSVC were tested. SVC was found to be the best

tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1, 0.1, 0.01, 0.001, 1e-3, 1e-4],
                     'C': [0.1, 1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [0.1, 1, 10, 100, 1000]}]

clf_svm = GridSearchCV(SVC(), tuned_parameters, scoring='f1_macro')
clf_svm.fit(x_scaled_train, y_train)

print("Best parameters set found on development set:\n")
print(clf_svm.best_params_)
print()

print("Detailed classification report:\n")

y_true, y_pred = y_test, clf_svm.predict(x_scaled_test)
print(classification_report(y_true, y_pred))
print()

Best parameters set found on development set:

{'C': 10, 'gamma': 1, 'kernel': 'rbf'}

Detailed classification report:

              precision    recall  f1-score   support

           0       0.41      0.38      0.39        29
           1       0.62      0.65      0.64        46

    accuracy                           0.55        75
   macro avg       0.52      0.52      0.52        75
weighted avg       0.54      0.55      0.54        75




In [14]:
from sklearn.ensemble import RandomForestClassifier

In [15]:
clf_RF = RandomForestClassifier(max_depth=2, random_state=0)
clf_RF.fit(x_train, y_train)
print("Detailed classification report:\n")

y_true, y_pred = y_test, clf_RF.predict(x_test)
print(classification_report(y_true, y_pred))
print()

Detailed classification report:

              precision    recall  f1-score   support

           0       0.41      0.41      0.41        29
           1       0.63      0.63      0.63        46

    accuracy                           0.55        75
   macro avg       0.52      0.52      0.52        75
weighted avg       0.55      0.55      0.55        75




In [16]:
from tensorflow import keras
from keras.wrappers.scikit_learn import KerasClassifier

Using TensorFlow backend.


In [17]:
from sklearn.pipeline import Pipeline

In [18]:
def create_baseline():
    # create model
    optimizer = keras.optimizers.Nadam()
    accuracy = keras.metrics.BinaryAccuracy()
    model = keras.models.Sequential()    
    model.add(keras.layers.Dense(300, input_dim=2, activation="relu"))    
    model.add(keras.layers.Dropout(rate=0.2))
    model.add(keras.layers.Dense(100, activation="relu"))
    model.add(keras.layers.Dropout(rate=0.2))
    model.add(keras.layers.Dense(1, activation="sigmoid"))
    
    # Compile model
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=[accuracy])
    
    return model

In [19]:
from sklearn.preprocessing import MinMaxScaler

In [20]:
scaler = MinMaxScaler()
x_scaled_train = scaler.fit_transform(x_train)
x_scaled_test = scaler.transform(x_test)

In [21]:
estimators = []
estimators.append(('mlp', KerasClassifier(build_fn=create_baseline, epochs=5, batch_size=5, verbose=1)))
pipe = Pipeline(estimators)
pipe.fit(x_scaled_train, y_train)
y_res = pipe.predict(x_scaled_test)
print(classification_report(y_test, y_res))

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Train on 150 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
              precision    recall  f1-score   support

           0       0.56      0.17      0.26        29
           1       0.64      0.91      0.75        46

    accuracy                           0.63        75
   macro avg       0.60      0.54      0.51        75
weighted avg       0.61      0.63      0.56        75



In [24]:
estimators = []
estimators.append(('mlp', KerasClassifier(build_fn=create_baseline, epochs=10, batch_size=5, verbose=1)))
pipe = Pipeline(estimators)
pipe.fit(x_scaled_train, y_train)
y_res = pipe.predict(x_scaled_test)
print(classification_report(y_test, y_res))

Train on 150 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
              precision    recall  f1-score   support

           0       0.47      0.28      0.35        29
           1       0.64      0.80      0.71        46

    accuracy                           0.60        75
   macro avg       0.55      0.54      0.53        75
weighted avg       0.57      0.60      0.57        75

