In [4]:
import numpy as np
import pandas as pd
from collections import Counter

In [10]:
def process_data_for_labels(ticker, days=7, csv_path='sp500all.csv'):
    """Load the price table and append forward-return columns for ``ticker``.

    For every horizon ``i`` in ``1..days`` a column named ``'<ticker>_<i>d'``
    is added holding ``(price[t + i] - price[t]) / price[t]``.

    Parameters
    ----------
    ticker : str
        Name of the column in the CSV whose forward returns are computed.
    days : int, optional
        Number of look-ahead horizons to generate (default 7).
    csv_path : str, optional
        CSV file to read; its first column is used as the index
        (default ``'sp500all.csv'``).

    Returns
    -------
    (pandas.DataFrame, list)
        The frame with the extra return columns, and the list of column names
        that were present in the CSV before those columns were added.

    Raises
    ------
    KeyError
        If ``ticker`` is not a column of the CSV.

    Notes
    -----
    Missing values are replaced with 0 *before* the returns are computed, so a
    0 price in the denominator produces ``inf`` (non-zero numerator) or NaN
    (0/0, subsequently filled with 0). The last ``days`` rows have no future
    price for some horizons; those returns are also filled with 0.
    """
    df = pd.read_csv(csv_path, index_col=0)
    tickers = df.columns.values.tolist()
    if ticker not in tickers:
        raise KeyError('{!r} is not a column of {!r}'.format(ticker, csv_path))

    df = df.fillna(0)
    price = df[ticker]  # loop-invariant: the price column itself is never modified below
    for i in range(1, days + 1):
        # shift(-i) aligns the price i rows ahead with the current row.
        df['{}_{}d'.format(ticker, i)] = (price.shift(-i) - price) / price

    # Fill the NaNs introduced by shifting past the end of the frame (and by 0/0).
    df = df.fillna(0)
    return df, tickers
        

In [11]:
def buy_sell_hold(*args, requirement=0.02):
    """Map a sequence of fractional price changes to a buy/sell/hold label.

    The changes are scanned in the order given and the first one whose
    magnitude exceeds ``requirement`` decides the label.

    Parameters
    ----------
    *args : float
        Fractional price changes (e.g. 0.03 for +3%), earliest horizon first.
    requirement : float, keyword-only, optional
        Threshold a change must strictly exceed in absolute value
        (default 0.02, i.e. 2%).

    Returns
    -------
    int
        ``1`` (buy) if the first change beyond the threshold is positive,
        ``-1`` (sell) if it is negative, ``0`` (hold) if no change exceeds it.
        NaN values compare false on both checks and are therefore skipped.
    """
    for change in args:
        if change > requirement:
            return 1
        if change < -requirement:
            return -1
    return 0


In [12]:
def extract_feature_sets(ticker):
    """Build the feature matrix and buy/sell/hold labels for ``ticker``.

    Parameters
    ----------
    ticker : str
        Column name of the instrument to label.

    Returns
    -------
    X : numpy.ndarray
        Row-over-row percentage change of every original ticker column, with
        infinities and NaNs replaced by 0.
    y : numpy.ndarray
        Labels produced by ``buy_sell_hold`` from the ``'<ticker>_<i>d'``
        forward-return columns.
    df : pandas.DataFrame
        The cleaned frame (including the ``'<ticker>_target'`` column) from
        which ``X`` and ``y`` were derived.

    Raises
    ------
    KeyError
        If no ``'<ticker>_1d'`` column was produced by
        ``process_data_for_labels``.

    Side effects
    ------------
    Prints the label distribution. It is computed before rows containing
    infinities are dropped, so it can differ from the distribution of ``y``.
    """
    df, tickers = process_data_for_labels(ticker)
    target_col = '{}_target'.format(ticker)

    # Discover the forward-return columns by name rather than spelling out a
    # fixed number of them, so this stays in step with however many horizons
    # process_data_for_labels generated.
    horizon_cols = []
    horizon = 1
    while '{}_{}d'.format(ticker, horizon) in df.columns:
        horizon_cols.append('{}_{}d'.format(ticker, horizon))
        horizon += 1
    if not horizon_cols:
        raise KeyError('{}_1d'.format(ticker))

    # map() walks the horizon columns in lock-step, i.e. one call per row.
    df[target_col] = list(map(buy_sell_hold, *(df[col] for col in horizon_cols)))

    # Keys are stringified so the printed Counter shows '1' / '0' / '-1'.
    label_counts = Counter(str(label) for label in df[target_col].values.tolist())
    print('Data spread: ', label_counts)

    # Rows holding +/-inf (e.g. from a 0 price in a denominator) are discarded.
    df = df.fillna(0).replace([np.inf, -np.inf], np.nan).dropna()

    # Features: percentage change of every original ticker column.
    df_vals = (
        df[tickers]
        .pct_change()
        .replace([np.inf, -np.inf], np.nan)
        .fillna(0)
    )

    X = df_vals.values
    y = df[target_col].values
    return X, y, df

In [13]:
# Smoke-test the feature pipeline on ticker 'A'; the cell displays the returned (X, y, df) tuple.
extract_feature_sets('A')

Data spread:  Counter({'1': 862, '-1': 650, '0': 383})


(array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.02078633,  0.01982699,  0.        , ...,  0.0673139 ,
          0.02342341,  0.        ],
        [-0.00116061,  0.03806564, -0.00825739, ..., -0.01569215,
          0.02711269,  0.        ],
        ...,
        [-0.00155022,  0.00129789, -0.00070458, ..., -0.03965854,
          0.01339603,  0.00431156],
        [-0.02024845,  0.01177356, -0.01349714, ...,  0.00435186,
         -0.01548198, -0.01912381],
        [ 0.00924404, -0.00672571, -0.01133348, ...,  0.00322668,
         -0.00288229,  0.00974837]]),
 array([1, 1, 1, ..., 0, 0, 0]),
                    MMM        ABT       ABBV        ABMD         ACN  \
 Date                                                                   
 2012-12-31   76.014526  27.044691   0.000000   13.440000   56.649178   
 2013-01-02   77.594589  27.580906  25.678120   13.690000   58.829960   
 2013-01-03   77.504532  28.630791  25.466085   

In [16]:
from sklearn import svm, model_selection as cross_validation, neighbors
from sklearn.ensemble import VotingClassifier, RandomForestClassifier

In [30]:
def do_ml(ticker, test_size=0.25, random_state=None):
    """Train a voting ensemble on ``ticker``'s features and report its accuracy.

    Parameters
    ----------
    ticker : str
        Column name of the instrument to model.
    test_size : float, optional
        Fraction of samples held out for evaluation (default 0.25).
    random_state : int or None, optional
        Seed forwarded to the train/test split and to the estimators that
        accept one. The default ``None`` leaves them unseeded, so results vary
        between runs; pass an integer for a reproducible accuracy.

    Returns
    -------
    float
        Accuracy of the ensemble on the held-out samples.

    Side effects
    ------------
    Prints the accuracy and the distribution of predicted labels (in addition
    to whatever ``extract_feature_sets`` prints).
    """
    X, y, df = extract_feature_sets(ticker)

    # NOTE(review): train_test_split shuffles by default; on time-ordered rows
    # this mixes past and future samples -- confirm that is intended.
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    clf = VotingClassifier([
        ('lsvc', svm.LinearSVC(random_state=random_state)),
        ('knn', neighbors.KNeighborsClassifier()),
        ('rfor', RandomForestClassifier(random_state=random_state)),
    ])

    clf.fit(X_train, y_train)
    confidence = clf.score(X_test, y_test)
    print('Accuracy: ', confidence)
    predictions = clf.predict(X_test)
    print('Predicted spread: ', Counter(predictions))

    return confidence

In [32]:
# Train and evaluate the voting ensemble for ticker 'MMM'; the cell displays the returned accuracy.
do_ml('MMM')

Data spread:  Counter({'0': 700, '1': 673, '-1': 522})
Accuracy:  0.41983122362869196
Predicted spread:  Counter({0: 228, 1: 163, -1: 83})


0.41983122362869196