In [2]:
import os
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

In [3]:
# Single seed shared by every stochastic step in this notebook; also seed
# NumPy's legacy global generator with it.
seed = 54
np.random.seed(seed)

# Name of the CSV file that read_and_clean() loads.
filename = 'minute_df_18Aug.csv'

In [4]:
# Column holding the label passed to read_and_clean().
target = 'up_down'

# Predictor columns selected from the data frame, one per line for easy diffing.
feature_col = [
    'tweets_per_min',
    'nlikes_per_min',
    'nreplies_per_min',
    'nretweets_per_min',
    'volume_avg',
    'n_trades_avg',
    'compound_avg',
]

In [18]:
def read_and_clean(location, filename, target, feature_col,
                   sample_frac=0.25, test_size=0.3, random_state=None):
    """Load the CSV, build the label vector and split into train/test sets.

    Parameters
    ----------
    location : str
        Directory that contains ``filename``.
    filename : str
        CSV file to read. It must contain a ``date`` column and every column
        named in ``feature_col``.
    target : str or None
        Name of the label column; ``y`` is its logical negation. The column
        must be boolean or contain only 0/1 values. When ``None``, a
        ``time_to_sell`` column is derived from ``percent_change_avg``
        (False where the change is negative, True otherwise), added to the
        frame and used as ``y``.
    feature_col : list of str
        Columns used as predictors.
    sample_frac : float, default 0.25
        Fraction of rows randomly kept after reading the file.
    test_size : float, default 0.3
        Fraction of the sampled rows held out for testing.
    random_state : int or None, default None
        Seed for the row sampling and the train/test split. ``None`` falls
        back to the notebook-level ``seed``.

    Returns
    -------
    tuple
        ``([X, X_train, X_test], [y, y_train, y_test], df)`` where ``df`` is
        the sampled frame indexed by ``date``.

    Raises
    ------
    ValueError
        If the ``target`` column is neither boolean nor made of 0/1 values.
    """
    if random_state is None:
        random_state = seed

    # Build the path rather than calling os.chdir(): changing the working
    # directory affects the whole kernel and every later relative path.
    csv_path = os.path.join(location, filename)

    df = pd.read_csv(csv_path, engine='python').sample(
        frac=sample_frac, random_state=random_state
    )
    df['date'] = pd.to_datetime(df['date'])
    # 'Unnamed: 0' is the stray index column of a CSV saved with its index.
    df = df.drop(columns='Unnamed: 0', errors='ignore').set_index('date')

    if target is None:
        df['time_to_sell'] = ~(df['percent_change_avg'] < 0)
        print(df.groupby('time_to_sell').count()['percent_change_avg'])
        y = np.array(df['time_to_sell'])
    else:
        labels = np.array(df[target])
        if labels.dtype != bool:
            # np.invert is a *bitwise* NOT on integers (~1 == -2), so 0/1
            # labels have to be converted to booleans before negating.
            if not np.isin(labels, (0, 1)).all():
                raise ValueError(
                    f"target column {target!r} must be boolean or contain only 0/1 values"
                )
            labels = labels.astype(bool)
        y = np.logical_not(labels)

    X = df[feature_col]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    return [X, X_train, X_test], [y, y_train, y_test], df

In [19]:
# Load the data from the current working directory. X and Y are the
# [full, train, test] lists returned by read_and_clean().
X, Y, df = read_and_clean(
    location=os.getcwd(),
    filename=filename,
    target=target,
    feature_col=feature_col,
)


In [20]:
def get_performance_scores(y_test, y_predict, y_predict_prob, eps=1e-15, beta=0.5):
    """Compute a dictionary of binary-classification scores.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_predict : array-like
        Predicted labels.
    y_predict_prob : array-like, two-dimensional
        Predicted probabilities; column 1 is used as the positive-class
        probability.
    eps : float, default 1e-15
        Probabilities are clipped to ``[eps, 1 - eps]`` before the log loss
        is computed, so that exact 0 or 1 predictions stay finite.
    beta : float, default 0.5
        Weight passed to ``fbeta_score``.

    Returns
    -------
    dict
        Keys, in order: ``accuracy``, ``precision``, ``recall``, ``f1``,
        ``fbeta``, ``log_loss``, ``AUC``.
    """
    # Imported locally so the function keeps working even if a notebook cell
    # rebinds the module-level name `metrics`.
    from sklearn import metrics

    positive_prob = np.asarray(y_predict_prob)[:, 1]
    # Newer scikit-learn releases no longer accept log_loss(eps=...), so the
    # clipping that argument used to perform is applied here instead.
    clipped_prob = np.clip(positive_prob, eps, 1 - eps)

    return {
        "accuracy": metrics.accuracy_score(y_test, y_predict),
        "precision": metrics.precision_score(y_test, y_predict),
        "recall": metrics.recall_score(y_test, y_predict),
        "f1": metrics.f1_score(y_test, y_predict),
        "fbeta": metrics.fbeta_score(y_test, y_predict, beta=beta),
        "log_loss": metrics.log_loss(y_test, clipped_prob),
        "AUC": metrics.roc_auc_score(y_test, positive_prob),
    }

In [21]:
# Linear-kernel SVM. probability=True enables predict_proba(), which
# get_performance_scores() needs for log loss and AUC. The probability
# calibration shuffles the data internally, so pass the notebook seed
# explicitly: the result then no longer depends on the global NumPy RNG state
# (and therefore on the order in which cells were executed).
# NOTE(review): the features are fed in unscaled; SVMs are usually
# scale-sensitive, so confirm whether standardisation is wanted.
svclassifier = SVC(kernel='linear',
                   probability=True,
                   random_state=seed)

In [22]:
# Train the classifier on the training partition (index 1 of the
# [full, train, test] lists returned by read_and_clean).
svclassifier.fit(X=X[1], y=Y[1])

In [13]:
# Class-balance check: for each label array in Y (full, train, test) print
# the distinct label values and how many times each occurs.
for y in Y:
    unique, counts = np.unique(y, return_counts=True)
    print('Y:', unique, counts)


[array([ True, False,  True, ...,  True,  True,  True]), array([ True,  True,  True, ...,  True,  True,  True]), array([ True, False,  True, ..., False,  True,  True])]
Y: [False  True] [12887 12663]
Y: [False  True] [8993 8892]
Y: [False  True] [3894 3771]
