# Global Stock Selection Strategy

Subset of 1500 stocks - [Q1500US](https://www.quantopian.com/posts/the-q500us-and-q1500us) and [here](https://www.quantopian.com/posts/q1500us-q500us-default-universe-update-minimum-market-cap).


Let us consider a global stock selection model that predicts 1-month stock returns across the Q1500 US universe, using the 14 risk factors shown below. 

**14 [Fundamental](https://www.quantopian.com/help/fundamentals#asset-classification) Risk Factors**

* Price-to-book ratio
* Gross profit / Total assets
* ROE
* Net margin
* Asset  turnover
* Gearing
* Forward earnings yield
* Cash flow yield
* Dividend yield
* Market capitalisation
* Volatility
* Price Momentum 1 month
* Earnings quality
* Price Oscillator

## References

[130/30](https://www.math.nyu.edu/faculty/avellane/Lo13030.pdf)

In [1]:
from quantopian.research import run_pipeline
from quantopian.pipeline import Pipeline
from quantopian.pipeline.factors import Latest
from quantopian.pipeline.data.builtin import USEquityPricing
from quantopian.pipeline.data import morningstar
from quantopian.pipeline import CustomFactor
from quantopian.pipeline.factors import Returns
from quantopian.pipeline.classifiers.morningstar import Sector
from quantopian.pipeline.filters import Q500US, Q1500US
from quantopian.pipeline.data.zacks import EarningsSurprises

import pandas as pd
import numpy as np
from time import time
import math

import matplotlib.pyplot as plt
from sklearn import ensemble, preprocessing, metrics
from sklearn.linear_model import LogisticRegression

In [2]:
n_fwd_days = 21

In [3]:
bs = morningstar.balance_sheet
cfs = morningstar.cash_flow_statement
is_ = morningstar.income_statement
or_ = morningstar.operation_ratios
er = morningstar.earnings_report
v = morningstar.valuation
vr = morningstar.valuation_ratios

def make_pipeline():

    base_universe = Q1500US()

    class StdDev(CustomFactor):
        '''
        3 months realized volatility
        '''
        def compute(self, today, asset_ids, out, values):
            # Calculates the column-wise standard deviation, ignoring NaNs
            out[:] = np.nanstd(values, axis=0)

    class Momentum(CustomFactor):
        # Default inputs
        inputs = [USEquityPricing.close]
        """
        1-Month Price Momentum:
        1-month closing price rate of change.
        https://www.pnc.com/content/dam/pnc-com/pdf/personal/wealth-investments/WhitePapers/FactorAnalysisFeb2014.pdf # NOQA
        Notes:
        High value suggests momentum (shorter term)
        Equivalent to analysis of returns (1-month window)
        """
        # Compute momentum
        def compute(self, today, assets, out, close):
            out[:] = close[-1] / close[0]

    class Mean_Reversion_1M(CustomFactor):
        inputs = [Returns(window_length=21)]
        window_length = 252

        def compute(self, today, assets, out, monthly_rets):
            out[:] = (monthly_rets[-1] - np.nanmean(monthly_rets, axis=0)) / \
                np.nanstd(monthly_rets, axis=0)

    class Price_Oscillator(CustomFactor):
        """
        4/52-Week Price Oscillator:
        Average close prices over 4-weeks divided by average close
        prices over 52-weeks all less 1.
        https://www.math.nyu.edu/faculty/avellane/Lo13030.pdf
        Notes:
        High value suggests momentum
        """
        inputs = [USEquityPricing.close]
        window_length = 252

        def compute(self, today, assets, out, close):
            four_week_period = close[-20:]
            out[:] = (np.nanmean(four_week_period, axis=0) /
                      np.nanmean(close, axis=0)) - 1.

    def Earnings_Quality():
        return cfs.operating_cash_flow.latest / \
               EarningsSurprises.eps_act.latest
    
    Price_Momentum_1M = Momentum(window_length=21)      
    std_dev = StdDev(inputs=[USEquityPricing.close], window_length=63, mask=base_universe)
    Price_Oscillator = Price_Oscillator()
    
    return Pipeline(
        columns={'pb_ratio': vr.pb_ratio.latest,
                 'gp_ta': is_.gross_profit.latest / bs.total_assets.latest,
                 'roe': or_.roe.latest,
                 'net_margin': or_.net_margin.latest,
                 'assets_turnover': or_.assets_turnover.latest,
                 'gearing': bs.total_debt.latest / bs.total_equity.latest,
                 'forward_earning_yield': vr.forward_earning_yield.latest,
                 'cf_yield': vr.cf_yield.latest,
                 'dividend_yield': vr.dividend_yield.latest,
                 'market_cap': v.market_cap.latest,
                 'vol': std_dev,
                 'Price_Momentum_1M': Price_Momentum_1M,
                 'Earnings_Quality': cfs.operating_cash_flow.latest / EarningsSurprises.eps_act.latest,
                 'Price_Oscillator': Price_Oscillator,
                 
                },
        screen=base_universe
  )

In [4]:
start = pd.Timestamp("2015-01-01")
end = pd.Timestamp("2015-06-01")
end_1m = pd.Timestamp("2015-07-15")
test_start = pd.Timestamp("2015-05-01")

In [5]:
start_timer = time()
results = run_pipeline(make_pipeline(), start, end)
end_timer = time()
print "Time to run pipeline %.2f secs" % (end_timer - start_timer)



Time to run pipeline 45.60 secs


In [6]:
results.index.names = ['date', 'security']

In [7]:
results

Unnamed: 0_level_0,Unnamed: 1_level_0,Earnings_Quality,Price_Momentum_1M,Price_Oscillator,assets_turnover,cf_yield,dividend_yield,forward_earning_yield,gearing,gp_ta,market_cap,net_margin,pb_ratio,roe,vol
date,security,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2015-01-02 00:00:00+00:00,Equity(2 [ARNC]),,0.913295,0.097341,0.169928,0.0630,0.0076,0.0656,0.725073,0.035963,1.868430e+10,0.023882,1.5144,0.012437,0.840949
2015-01-02 00:00:00+00:00,Equity(24 [AAPL]),9.330986e+09,0.963444,0.225832,0.185417,0.0867,0.0164,0.0697,0.316414,0.069052,6.599120e+11,0.201007,5.9160,0.072838,6.343392
2015-01-02 00:00:00+00:00,Equity(41 [ARCB]),8.183514e+07,1.060856,0.171412,0.639185,0.1212,0.0032,0.0556,0.257198,0.408964,1.205120e+09,0.027581,2.1868,0.036211,4.553984
2015-01-02 00:00:00+00:00,Equity(62 [ABT]),1.937097e+09,0.995797,0.111621,0.121540,0.0578,0.0193,0.0515,0.362679,0.065226,6.879960e+10,0.105408,3.0059,0.023145,1.616591
2015-01-02 00:00:00+00:00,Equity(67 [ADSK]),1.133333e+09,0.988969,0.110709,0.130929,0.0452,0.0004,0.0151,0.329292,0.112312,1.376730e+10,0.017314,6.0689,0.004701,3.173699
2015-01-02 00:00:00+00:00,Equity(76 [TAP]),9.844898e+08,0.973093,0.122957,0.076075,0.0860,0.0198,0.0572,0.411348,0.033802,1.387820e+10,-0.029452,1.6526,-0.003963,2.240364
2015-01-02 00:00:00+00:00,Equity(110 [ACXM]),1.432889e+08,1.019608,-0.176652,0.200592,0.0840,0.0063,0.0324,0.449514,0.041604,1.581810e+09,-0.005938,2.3314,-0.002274,1.454109
2015-01-02 00:00:00+00:00,Equity(114 [ADBE]),1.817873e+09,0.989114,0.085783,0.100878,0.0323,0.0002,0.0346,0.223982,0.084589,3.645030e+10,0.068285,5.4151,0.010864,3.659694
2015-01-02 00:00:00+00:00,Equity(122 [ADI]),3.800913e+08,1.012582,0.110466,0.102853,0.0493,0.0266,0.0521,0.183440,0.070854,1.728900e+10,0.133491,3.6337,0.022262,4.480362
2015-01-02 00:00:00+00:00,Equity(128 [ADM]),4.248148e+09,0.975239,0.137061,0.443224,0.1371,0.0182,0.0659,0.273954,0.035150,3.404990e+10,0.041232,1.6835,0.036971,3.211012


In [8]:
start_timer = time()
assets = results.index.levels[1].unique()
pricing = get_pricing(assets, start_date=start, end_date=end_1m, fields='price')
end_timer = time()
print "Time to extract prices %.2f secs" % (end_timer - start_timer)

Time to extract prices 1.11 secs


In [9]:
pricing.head()

Unnamed: 0,Equity(2 [ARNC]),Equity(24 [AAPL]),Equity(41 [ARCB]),Equity(53 [ABMD]),Equity(62 [ABT]),Equity(67 [ADSK]),Equity(76 [TAP]),Equity(110 [ACXM]),Equity(114 [ADBE]),Equity(122 [ADI]),...,Equity(48073 [CRC]),Equity(48091 [VA]),Equity(48104 [PGRE]),Equity(48126 [HABT]),Equity(48169 [KLXI]),Equity(48220 [LC]),Equity(48317 [JUNO]),Equity(48384 [QRVO]),Equity(48486 [BOX]),Equity(48543 [SHAK])
2015-01-02 00:00:00+00:00,15.817,108.418,45.733,37.3,44.233,59.53,73.055,19.605,72.33,54.827,...,5.416,43.8,18.541,31.49,40.24,24.91,48.24,,,
2015-01-05 00:00:00+00:00,14.91,105.392,45.055,37.09,44.252,58.66,72.64,19.425,71.99,53.83,...,4.982,41.61,18.718,30.9,40.63,24.01,46.0,67.62,,
2015-01-06 00:00:00+00:00,15.0,105.373,43.012,36.13,43.73,57.5,72.274,19.08,70.52,52.566,...,4.753,41.0,18.837,30.64,40.8,22.89,51.49,64.66,,
2015-01-07 00:00:00+00:00,15.398,106.88,41.935,37.28,44.095,57.37,74.579,19.33,71.12,53.119,...,4.628,39.72,19.182,32.59,40.5,22.89,52.78,66.65,,
2015-01-08 00:00:00+00:00,15.856,110.967,42.922,38.96,44.991,58.8,76.883,19.79,72.91,54.057,...,4.767,41.05,19.242,33.19,41.62,23.2,52.75,67.69,,


Calculate the 1 month forward returns

In [10]:
df = pricing.pct_change(n_fwd_days).shift(-n_fwd_days)
df = df[:end]
df

Unnamed: 0,Equity(2 [ARNC]),Equity(24 [AAPL]),Equity(41 [ARCB]),Equity(53 [ABMD]),Equity(62 [ABT]),Equity(67 [ADSK]),Equity(76 [TAP]),Equity(110 [ACXM]),Equity(114 [ADBE]),Equity(122 [ADI]),...,Equity(48073 [CRC]),Equity(48091 [VA]),Equity(48104 [PGRE]),Equity(48126 [HABT]),Equity(48169 [KLXI]),Equity(48220 [LC]),Equity(48317 [JUNO]),Equity(48384 [QRVO]),Equity(48486 [BOX]),Equity(48543 [SHAK])
2015-01-02 00:00:00+00:00,0.045963,0.085087,-0.116546,0.417962,0.025660,-0.077608,0.053617,-0.015047,-0.006083,-0.028088,...,0.128877,-0.195890,0.039372,-0.027628,0.010686,-0.185468,-0.146973,,,
2015-01-05 00:00:00+00:00,0.099531,0.125180,-0.133614,0.460771,0.010666,-0.037845,0.057737,-0.008494,0.003751,-0.013580,...,0.267162,-0.149483,0.018485,-0.005178,-0.020182,-0.164931,-0.161087,0.010352,,
2015-01-06 00:00:00+00:00,0.136200,0.133412,-0.101739,0.536120,0.033364,-0.022087,0.070205,-0.031971,0.036585,0.032302,...,0.407953,-0.151707,0.012582,-0.019909,-0.052941,-0.114024,-0.200427,0.042221,,
2015-01-07 00:00:00+00:00,0.073776,0.107934,-0.086753,0.554453,0.021454,-0.005229,0.023599,-0.003621,0.020107,0.022308,...,0.323250,-0.133182,-0.044729,-0.030684,-0.037284,-0.107471,-0.252747,-0.012003,,
2015-01-08 00:00:00+00:00,-0.015767,0.074391,-0.113811,0.458162,-0.020449,-0.024660,-0.018001,-0.065184,-0.007955,-0.003644,...,0.376757,-0.190743,-0.050826,-0.067792,-0.064632,-0.069397,-0.253081,-0.058502,,
2015-01-09 00:00:00+00:00,-0.010657,0.093766,-0.058038,0.502604,-0.001370,0.003775,-0.056707,-0.072663,0.012669,0.014974,...,0.230924,-0.198296,-0.039977,0.011254,-0.067119,-0.076238,-0.234936,-0.030705,,
2015-01-12 00:00:00+00:00,-0.042782,0.147290,-0.067597,0.528527,-0.009465,0.017982,-0.051457,-0.050203,0.026719,0.027134,...,0.393457,-0.181168,-0.047745,-0.038643,-0.049480,0.053333,-0.319350,-0.011132,,
2015-01-13 00:00:00+00:00,-0.008324,0.151543,-0.056947,0.543932,0.013448,0.030340,-0.031510,-0.021527,0.056665,0.059200,...,0.608354,-0.146133,-0.035179,-0.041732,-0.016841,0.016572,-0.285891,0.017074,,
2015-01-14 00:00:00+00:00,0.060820,0.161944,-0.008148,0.577406,0.040873,0.075519,-0.028957,-0.012416,0.094237,0.066556,...,0.587919,-0.113856,-0.034308,-0.005425,-0.009519,0.043272,-0.218515,0.085218,,
2015-01-15 00:00:00+00:00,0.053423,0.201108,-0.010993,0.633791,0.056222,0.097093,0.018378,0.065872,0.097169,0.115901,...,0.617456,-0.089461,-0.019377,0.039447,-0.001533,0.017655,-0.095826,0.079397,,


In [11]:
df1 = df.stack()
df1

2015-01-02 00:00:00+00:00  Equity(2 [ARNC])          0.045963
                           Equity(24 [AAPL])         0.085087
                           Equity(41 [ARCB])        -0.116546
                           Equity(53 [ABMD])         0.417962
                           Equity(62 [ABT])          0.025660
                           Equity(67 [ADSK])        -0.077608
                           Equity(76 [TAP])          0.053617
                           Equity(110 [ACXM])       -0.015047
                           Equity(114 [ADBE])       -0.006083
                           Equity(122 [ADI])        -0.028088
                           Equity(128 [ADM])        -0.063594
                           Equity(161 [AEP])         0.026646
                           Equity(166 [AES])        -0.080893
                           Equity(168 [AET])         0.063326
                           Equity(185 [AFL])        -0.032088
                           Equity(197 [AGCO])        0.087274
        

In [12]:
df1.index.names = ['date', 'security']

In [13]:
df1

date                       security              
2015-01-02 00:00:00+00:00  Equity(2 [ARNC])          0.045963
                           Equity(24 [AAPL])         0.085087
                           Equity(41 [ARCB])        -0.116546
                           Equity(53 [ABMD])         0.417962
                           Equity(62 [ABT])          0.025660
                           Equity(67 [ADSK])        -0.077608
                           Equity(76 [TAP])          0.053617
                           Equity(110 [ACXM])       -0.015047
                           Equity(114 [ADBE])       -0.006083
                           Equity(122 [ADI])        -0.028088
                           Equity(128 [ADM])        -0.063594
                           Equity(161 [AEP])         0.026646
                           Equity(166 [AES])        -0.080893
                           Equity(168 [AET])         0.063326
                           Equity(185 [AFL])        -0.032088
                    

In [14]:
df1.name = 'returns'

In [15]:
df_combined = results.join(df1)
df_combined

Unnamed: 0_level_0,Unnamed: 1_level_0,Earnings_Quality,Price_Momentum_1M,Price_Oscillator,assets_turnover,cf_yield,dividend_yield,forward_earning_yield,gearing,gp_ta,market_cap,net_margin,pb_ratio,roe,vol,returns
date,security,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2015-01-02 00:00:00+00:00,Equity(2 [ARNC]),,0.913295,0.097341,0.169928,0.0630,0.0076,0.0656,0.725073,0.035963,1.868430e+10,0.023882,1.5144,0.012437,0.840949,0.045963
2015-01-02 00:00:00+00:00,Equity(24 [AAPL]),9.330986e+09,0.963444,0.225832,0.185417,0.0867,0.0164,0.0697,0.316414,0.069052,6.599120e+11,0.201007,5.9160,0.072838,6.343392,0.085087
2015-01-02 00:00:00+00:00,Equity(41 [ARCB]),8.183514e+07,1.060856,0.171412,0.639185,0.1212,0.0032,0.0556,0.257198,0.408964,1.205120e+09,0.027581,2.1868,0.036211,4.553984,-0.116546
2015-01-02 00:00:00+00:00,Equity(62 [ABT]),1.937097e+09,0.995797,0.111621,0.121540,0.0578,0.0193,0.0515,0.362679,0.065226,6.879960e+10,0.105408,3.0059,0.023145,1.616591,0.025660
2015-01-02 00:00:00+00:00,Equity(67 [ADSK]),1.133333e+09,0.988969,0.110709,0.130929,0.0452,0.0004,0.0151,0.329292,0.112312,1.376730e+10,0.017314,6.0689,0.004701,3.173699,-0.077608
2015-01-02 00:00:00+00:00,Equity(76 [TAP]),9.844898e+08,0.973093,0.122957,0.076075,0.0860,0.0198,0.0572,0.411348,0.033802,1.387820e+10,-0.029452,1.6526,-0.003963,2.240364,0.053617
2015-01-02 00:00:00+00:00,Equity(110 [ACXM]),1.432889e+08,1.019608,-0.176652,0.200592,0.0840,0.0063,0.0324,0.449514,0.041604,1.581810e+09,-0.005938,2.3314,-0.002274,1.454109,-0.015047
2015-01-02 00:00:00+00:00,Equity(114 [ADBE]),1.817873e+09,0.989114,0.085783,0.100878,0.0323,0.0002,0.0346,0.223982,0.084589,3.645030e+10,0.068285,5.4151,0.010864,3.659694,-0.006083
2015-01-02 00:00:00+00:00,Equity(122 [ADI]),3.800913e+08,1.012582,0.110466,0.102853,0.0493,0.0266,0.0521,0.183440,0.070854,1.728900e+10,0.133491,3.6337,0.022262,4.480362,-0.028088
2015-01-02 00:00:00+00:00,Equity(128 [ADM]),4.248148e+09,0.975239,0.137061,0.443224,0.1371,0.0182,0.0659,0.273954,0.035150,3.404990e+10,0.041232,1.6835,0.036971,3.211012,-0.063594


# Remove Ticker Name

remove all the ticker names. We do not need them to build our model

In [16]:
res = df_combined.copy()

In [17]:
res = res.reset_index().set_index('date')
res.pop('security')
res

Unnamed: 0_level_0,Earnings_Quality,Price_Momentum_1M,Price_Oscillator,assets_turnover,cf_yield,dividend_yield,forward_earning_yield,gearing,gp_ta,market_cap,net_margin,pb_ratio,roe,vol,returns
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2015-01-02 00:00:00+00:00,,0.913295,0.097341,0.169928,0.0630,0.0076,0.0656,0.725073,0.035963,1.868430e+10,0.023882,1.5144,0.012437,0.840949,0.045963
2015-01-02 00:00:00+00:00,9.330986e+09,0.963444,0.225832,0.185417,0.0867,0.0164,0.0697,0.316414,0.069052,6.599120e+11,0.201007,5.9160,0.072838,6.343392,0.085087
2015-01-02 00:00:00+00:00,8.183514e+07,1.060856,0.171412,0.639185,0.1212,0.0032,0.0556,0.257198,0.408964,1.205120e+09,0.027581,2.1868,0.036211,4.553984,-0.116546
2015-01-02 00:00:00+00:00,1.937097e+09,0.995797,0.111621,0.121540,0.0578,0.0193,0.0515,0.362679,0.065226,6.879960e+10,0.105408,3.0059,0.023145,1.616591,0.025660
2015-01-02 00:00:00+00:00,1.133333e+09,0.988969,0.110709,0.130929,0.0452,0.0004,0.0151,0.329292,0.112312,1.376730e+10,0.017314,6.0689,0.004701,3.173699,-0.077608
2015-01-02 00:00:00+00:00,9.844898e+08,0.973093,0.122957,0.076075,0.0860,0.0198,0.0572,0.411348,0.033802,1.387820e+10,-0.029452,1.6526,-0.003963,2.240364,0.053617
2015-01-02 00:00:00+00:00,1.432889e+08,1.019608,-0.176652,0.200592,0.0840,0.0063,0.0324,0.449514,0.041604,1.581810e+09,-0.005938,2.3314,-0.002274,1.454109,-0.015047
2015-01-02 00:00:00+00:00,1.817873e+09,0.989114,0.085783,0.100878,0.0323,0.0002,0.0346,0.223982,0.084589,3.645030e+10,0.068285,5.4151,0.010864,3.659694,-0.006083
2015-01-02 00:00:00+00:00,3.800913e+08,1.012582,0.110466,0.102853,0.0493,0.0266,0.0521,0.183440,0.070854,1.728900e+10,0.133491,3.6337,0.022262,4.480362,-0.028088
2015-01-02 00:00:00+00:00,4.248148e+09,0.975239,0.137061,0.443224,0.1371,0.0182,0.0659,0.273954,0.035150,3.404990e+10,0.041232,1.6835,0.036971,3.211012,-0.063594


try replacing nan with 0. Alternative is to remove the data.

Prefer not to use mean as is the case for `sklean imputer` and it would distort the model

In [18]:
for e in res.columns:
    res[e]  = res[e].replace(np.nan, 0)
    res[e]  = res[e].replace([np.inf, -np.inf], 0)

In [19]:
res

Unnamed: 0_level_0,Earnings_Quality,Price_Momentum_1M,Price_Oscillator,assets_turnover,cf_yield,dividend_yield,forward_earning_yield,gearing,gp_ta,market_cap,net_margin,pb_ratio,roe,vol,returns
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2015-01-02 00:00:00+00:00,0.000000e+00,0.913295,0.097341,0.169928,0.0630,0.0076,0.0656,0.725073,0.035963,1.868430e+10,0.023882,1.5144,0.012437,0.840949,0.045963
2015-01-02 00:00:00+00:00,9.330986e+09,0.963444,0.225832,0.185417,0.0867,0.0164,0.0697,0.316414,0.069052,6.599120e+11,0.201007,5.9160,0.072838,6.343392,0.085087
2015-01-02 00:00:00+00:00,8.183514e+07,1.060856,0.171412,0.639185,0.1212,0.0032,0.0556,0.257198,0.408964,1.205120e+09,0.027581,2.1868,0.036211,4.553984,-0.116546
2015-01-02 00:00:00+00:00,1.937097e+09,0.995797,0.111621,0.121540,0.0578,0.0193,0.0515,0.362679,0.065226,6.879960e+10,0.105408,3.0059,0.023145,1.616591,0.025660
2015-01-02 00:00:00+00:00,1.133333e+09,0.988969,0.110709,0.130929,0.0452,0.0004,0.0151,0.329292,0.112312,1.376730e+10,0.017314,6.0689,0.004701,3.173699,-0.077608
2015-01-02 00:00:00+00:00,9.844898e+08,0.973093,0.122957,0.076075,0.0860,0.0198,0.0572,0.411348,0.033802,1.387820e+10,-0.029452,1.6526,-0.003963,2.240364,0.053617
2015-01-02 00:00:00+00:00,1.432889e+08,1.019608,-0.176652,0.200592,0.0840,0.0063,0.0324,0.449514,0.041604,1.581810e+09,-0.005938,2.3314,-0.002274,1.454109,-0.015047
2015-01-02 00:00:00+00:00,1.817873e+09,0.989114,0.085783,0.100878,0.0323,0.0002,0.0346,0.223982,0.084589,3.645030e+10,0.068285,5.4151,0.010864,3.659694,-0.006083
2015-01-02 00:00:00+00:00,3.800913e+08,1.012582,0.110466,0.102853,0.0493,0.0266,0.0521,0.183440,0.070854,1.728900e+10,0.133491,3.6337,0.022262,4.480362,-0.028088
2015-01-02 00:00:00+00:00,4.248148e+09,0.975239,0.137061,0.443224,0.1371,0.0182,0.0659,0.273954,0.035150,3.404990e+10,0.041232,1.6835,0.036971,3.211012,-0.063594


# Splitting Training and Testing Datasets

In this case, we will hold back last month as our test dataset.

The test data date need to be changed manually. The number of data points vary from day to day. Difficult to set automatically.


In [20]:
X = res.copy()
Y = X.pop('returns')

In [21]:
X

Unnamed: 0_level_0,Earnings_Quality,Price_Momentum_1M,Price_Oscillator,assets_turnover,cf_yield,dividend_yield,forward_earning_yield,gearing,gp_ta,market_cap,net_margin,pb_ratio,roe,vol
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2015-01-02 00:00:00+00:00,0.000000e+00,0.913295,0.097341,0.169928,0.0630,0.0076,0.0656,0.725073,0.035963,1.868430e+10,0.023882,1.5144,0.012437,0.840949
2015-01-02 00:00:00+00:00,9.330986e+09,0.963444,0.225832,0.185417,0.0867,0.0164,0.0697,0.316414,0.069052,6.599120e+11,0.201007,5.9160,0.072838,6.343392
2015-01-02 00:00:00+00:00,8.183514e+07,1.060856,0.171412,0.639185,0.1212,0.0032,0.0556,0.257198,0.408964,1.205120e+09,0.027581,2.1868,0.036211,4.553984
2015-01-02 00:00:00+00:00,1.937097e+09,0.995797,0.111621,0.121540,0.0578,0.0193,0.0515,0.362679,0.065226,6.879960e+10,0.105408,3.0059,0.023145,1.616591
2015-01-02 00:00:00+00:00,1.133333e+09,0.988969,0.110709,0.130929,0.0452,0.0004,0.0151,0.329292,0.112312,1.376730e+10,0.017314,6.0689,0.004701,3.173699
2015-01-02 00:00:00+00:00,9.844898e+08,0.973093,0.122957,0.076075,0.0860,0.0198,0.0572,0.411348,0.033802,1.387820e+10,-0.029452,1.6526,-0.003963,2.240364
2015-01-02 00:00:00+00:00,1.432889e+08,1.019608,-0.176652,0.200592,0.0840,0.0063,0.0324,0.449514,0.041604,1.581810e+09,-0.005938,2.3314,-0.002274,1.454109
2015-01-02 00:00:00+00:00,1.817873e+09,0.989114,0.085783,0.100878,0.0323,0.0002,0.0346,0.223982,0.084589,3.645030e+10,0.068285,5.4151,0.010864,3.659694
2015-01-02 00:00:00+00:00,3.800913e+08,1.012582,0.110466,0.102853,0.0493,0.0266,0.0521,0.183440,0.070854,1.728900e+10,0.133491,3.6337,0.022262,4.480362
2015-01-02 00:00:00+00:00,4.248148e+09,0.975239,0.137061,0.443224,0.1371,0.0182,0.0659,0.273954,0.035150,3.404990e+10,0.041232,1.6835,0.036971,3.211012


In [22]:
X.describe()

Unnamed: 0,Earnings_Quality,Price_Momentum_1M,Price_Oscillator,assets_turnover,cf_yield,dividend_yield,forward_earning_yield,gearing,gp_ta,market_cap,net_margin,pb_ratio,roe,vol
count,153892.0,153892.0,153892.0,153892.0,153892.0,153892.0,153892.0,153892.0,153892.0,153892.0,153892.0,153892.0,153892.0,153892.0
mean,564325400.0,1.013826,0.053471,0.24103,0.080779,0.016856,0.106258,1.501582,0.113323,15831580000.0,-5.750417,10.903957,0.080525,2.935473
std,7164344000.0,0.089293,0.186413,1.499594,0.108993,0.031126,2.04354,45.92821,1.491631,39329970000.0,152.666098,75.692961,2.689894,4.015758
min,-55958330000.0,0.0,-0.74617,0.0,-0.6248,0.0,-0.6107,-1532.063444,-0.726025,500177000.0,-10093.75,0.0,-43.854726,0.04093
25%,40295880.0,0.968769,-0.033795,0.071713,0.0419,0.0,0.0382,0.219125,0.030702,2549150000.0,0.021695,1.8076,0.009265,1.164419
50%,159372600.0,1.008329,0.064911,0.157153,0.0648,0.0104,0.0571,0.617158,0.05969,4873520000.0,0.075278,3.11645,0.029595,1.993462
75%,479038500.0,1.052823,0.145538,0.267769,0.1026,0.0232,0.0736,1.205746,0.103935,12272100000.0,0.155856,5.7981,0.053484,3.349752
max,454850000000.0,2.31663,1.936554,65.122877,2.0565,0.6667,93.9755,2183.589744,65.122877,774691000000.0,8.833056,3107.2443,90.322895,83.758538


In [23]:
Y

date
2015-01-02 00:00:00+00:00    0.045963
2015-01-02 00:00:00+00:00    0.085087
2015-01-02 00:00:00+00:00   -0.116546
2015-01-02 00:00:00+00:00    0.025660
2015-01-02 00:00:00+00:00   -0.077608
2015-01-02 00:00:00+00:00    0.053617
2015-01-02 00:00:00+00:00   -0.015047
2015-01-02 00:00:00+00:00   -0.006083
2015-01-02 00:00:00+00:00   -0.028088
2015-01-02 00:00:00+00:00   -0.063594
2015-01-02 00:00:00+00:00    0.026646
2015-01-02 00:00:00+00:00   -0.080893
2015-01-02 00:00:00+00:00    0.063326
2015-01-02 00:00:00+00:00   -0.032088
2015-01-02 00:00:00+00:00    0.087274
2015-01-02 00:00:00+00:00    0.037381
2015-01-02 00:00:00+00:00   -0.020496
2015-01-02 00:00:00+00:00   -0.084858
2015-01-02 00:00:00+00:00   -0.028550
2015-01-02 00:00:00+00:00    0.212642
2015-01-02 00:00:00+00:00    0.080417
2015-01-02 00:00:00+00:00    0.189756
2015-01-02 00:00:00+00:00   -0.081680
2015-01-02 00:00:00+00:00   -0.060168
2015-01-02 00:00:00+00:00    0.024119
2015-01-02 00:00:00+00:00   -0.040879
2015-01

In [24]:
Y = Y > 0

In [25]:
Y

date
2015-01-02 00:00:00+00:00     True
2015-01-02 00:00:00+00:00     True
2015-01-02 00:00:00+00:00    False
2015-01-02 00:00:00+00:00     True
2015-01-02 00:00:00+00:00    False
2015-01-02 00:00:00+00:00     True
2015-01-02 00:00:00+00:00    False
2015-01-02 00:00:00+00:00    False
2015-01-02 00:00:00+00:00    False
2015-01-02 00:00:00+00:00    False
2015-01-02 00:00:00+00:00     True
2015-01-02 00:00:00+00:00    False
2015-01-02 00:00:00+00:00     True
2015-01-02 00:00:00+00:00    False
2015-01-02 00:00:00+00:00     True
2015-01-02 00:00:00+00:00     True
2015-01-02 00:00:00+00:00    False
2015-01-02 00:00:00+00:00    False
2015-01-02 00:00:00+00:00    False
2015-01-02 00:00:00+00:00     True
2015-01-02 00:00:00+00:00     True
2015-01-02 00:00:00+00:00     True
2015-01-02 00:00:00+00:00    False
2015-01-02 00:00:00+00:00    False
2015-01-02 00:00:00+00:00     True
2015-01-02 00:00:00+00:00    False
2015-01-02 00:00:00+00:00    False
2015-01-02 00:00:00+00:00    False
2015-01-02 00:0

In [26]:
X_train = X[X.index < test_start]
X_test = X[X.index >= test_start]
Y_train = Y[Y.index < test_start]
Y_test = Y[Y.index >= test_start]

In [27]:
X_train.shape, Y_train.shape

((122535, 14), (122535,))

In [28]:
X_test.shape, Y_test.shape

((31357, 14), (31357,))

# Training our Logistic ML

In [29]:
start = time()
scaler = preprocessing.StandardScaler()
clf = LogisticRegression(random_state=0)
X_train_trans = scaler.fit_transform(X_train)
clf.fit(X_train_trans, Y_train)
end = time()

In [30]:
print "Time taken to train ML: %0.2f secs" % (end - start)

Time taken to train ML: 0.38 secs


# Evaluating our ML Classifier

In [31]:
# Transform test data
X_test_trans = scaler.transform(X_test)

In [32]:
Y_pred = clf.predict(X_test_trans)

In [33]:
from sklearn.metrics import confusion_matrix, accuracy_score

In [34]:
cm = confusion_matrix(Y_test, Y_pred)
print(accuracy_score(Y_test, Y_pred))

0.500589979909


In [35]:
print(cm)

[[  921 15018]
 [  642 14776]]


***

***