In [1]:
import sys
import project_helper
import project_tests
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (14, 8)
import os
from zipline.data import bundles
# Point zipline at its data directory and register the CSV-directory bundle
# under the name that `bundles.load` uses later in this notebook.
# NOTE(review): the ZIPLINE_ROOT value is an absolute, machine-specific path and
# has to be edited before the notebook can run anywhere else.
os.environ['ZIPLINE_ROOT'] = os.path.join('C:/Users/ltjsu/.zipline')
ingest_func = bundles.csvdir.csvdir_equities(['daily'], 'custom-csvdir-bundle')
bundles.register('custom-csvdir-bundle', ingest_func)
from zipline.pipeline import Pipeline
from zipline.pipeline.factors import AverageDollarVolume
from zipline.utils.calendars import get_calendar
from zipline.data.data_portal import DataPortal
from zipline.pipeline.factors import CustomFactor, DailyReturns, Returns, SimpleMovingAverage, AnnualizedVolatility
from zipline.pipeline.data import USEquityPricing
from scipy.stats import spearmanr
from IPython.display import display
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import alphalens as al
import abc
from sklearn.ensemble import VotingClassifier
from sklearn.base import clone
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import Bunch
# Dictionary mapping each value of the `Sector_i` column to the matching value
# of the `Sector` column, read from data/project_7_sector/labels.csv under the
# current working directory.
sector_lookup = pd.read_csv(
    os.path.join(os.getcwd(), 'data', 'project_7_sector', 'labels.csv'),
    index_col='Sector_i')['Sector'].to_dict()

  from pandas.core import datetools


In [2]:
def get_pricing(data_portal, trading_calendar, assets, start_date, end_date, field='close'):
    """
    Fetch a daily history window of one pricing field for a set of assets.

    Parameters
    ----------
    data_portal : object
        Must provide ``get_history_window``; its result is returned unchanged.
    trading_calendar : object
        ``trading_calendar.closes.index`` must contain both (UTC, midnight)
        session labels derived from ``start_date`` and ``end_date``.
    assets : sequence
        Assets forwarded to ``get_history_window``.
    start_date, end_date : datetime-like
        Objects with ``strftime``; only their calendar date is used.
    field : str, optional
        Pricing field to request (default ``'close'``).

    Returns
    -------
    The value returned by ``data_portal.get_history_window``.

    Raises
    ------
    KeyError
        If either date is not a label of ``trading_calendar.closes.index``.
    """
    # The long-deprecated ``offset`` keyword of ``pd.Timestamp`` is deliberately
    # not passed: it only tagged the timestamp with a frequency and is rejected
    # by newer pandas releases.
    end_dt = pd.Timestamp(end_date.strftime('%Y-%m-%d'), tz='UTC')
    start_dt = pd.Timestamp(start_date.strftime('%Y-%m-%d'), tz='UTC')

    sessions = trading_calendar.closes.index
    end_loc = sessions.get_loc(end_dt)
    start_loc = sessions.get_loc(start_dt)

    # bar_count is the positional distance between the two sessions, so the
    # window ends on end_dt and holds (end_loc - start_loc) bars.
    return data_portal.get_history_window(
        assets=assets,
        end_dt=end_dt,
        bar_count=end_loc - start_loc,
        frequency='1d',
        field=field,
        data_frequency='daily')

def momentum_1yr(window_length, universe, sector):
    """
    Build the momentum factor: returns over ``window_length`` restricted to
    ``universe``, demeaned within each ``sector`` group, then ranked and
    z-scored.
    """
    window_returns = Returns(window_length=window_length, mask=universe)
    sector_neutral = window_returns.demean(groupby=sector)
    return sector_neutral.rank().zscore()

def mean_reversion_5day_sector_neutral_smoothed(window_length, universe, sector):
    """
    Build the smoothed, sector-neutral mean-reversion factor.

    The raw factor is the negated z-scored rank of sector-demeaned returns
    over ``window_length``; it is then averaged over the same window length
    and ranked / z-scored again.
    """
    sector_neutral_score = Returns(window_length=window_length, mask=universe) \
        .demean(groupby=sector) \
        .rank() \
        .zscore()
    # The sign flip applies to the finished z-score (unary minus binds more
    # loosely than the method chain).
    unsmoothed_factor = -sector_neutral_score

    smoothed = SimpleMovingAverage(inputs=[unsmoothed_factor], window_length=window_length)
    return smoothed.rank().zscore()

class CTO(Returns):
    """
    Close-to-open (overnight) return, per the hypothesis from
    https://papers.ssrn.com/sol3/papers.cfm?abstract_id=2554010
    """
    inputs = [USEquityPricing.open, USEquityPricing.close]

    def compute(self, today, assets, out, opens, closes):
        """
        Write (last open - first close) / first close into ``out``.

        ``opens`` and ``closes`` hold one row per day of the window with the
        most recent day last, so ``opens[-1]`` is the latest open and
        ``closes[0]`` is the earliest close in the window.
        """
        latest_open = opens[-1]
        earliest_close = closes[0]
        out[:] = (latest_open - earliest_close) / earliest_close
        
class TrailingOvernightReturns(Returns):
    """
    Sum of the trailing overnight returns over the factor's window.
    """
    window_safe = True

    def compute(self, today, asset_ids, out, cto):
        """Sum ``cto`` down the day axis (axis 0), treating NaN as zero."""
        trailing_total = np.nansum(cto, axis=0)
        out[:] = trailing_total

def overnight_sentiment_smoothed(cto_window_length, trail_overnight_returns_window_length, universe):
    """
    Build the smoothed overnight-sentiment factor.

    Overnight returns (``CTO``) are summed over the trailing window, ranked
    and z-scored, then averaged over that same trailing window and ranked /
    z-scored once more.
    """
    trailing_window = trail_overnight_returns_window_length

    overnight_returns = CTO(mask=universe, window_length=cto_window_length)
    trailing_sum = TrailingOvernightReturns(inputs=[overnight_returns], window_length=trailing_window)
    unsmoothed_factor = trailing_sum.rank().zscore()

    smoothed = SimpleMovingAverage(inputs=[unsmoothed_factor], window_length=trailing_window)
    return smoothed.rank().zscore()
        
class MarketDispersion(CustomFactor):
    """
    Root-mean-square deviation of the input returns from their overall mean,
    broadcast to every asset.
    """
    inputs = [DailyReturns()]
    window_length = 1
    window_safe = True

    def compute(self, today, assets, out, returns):
        """``returns`` has days in rows and assets across columns."""
        overall_mean = np.nanmean(returns)
        squared_deviation = (returns - overall_mean)**2
        out[:] = np.sqrt(np.nanmean(squared_deviation))
        
class MarketVolatility(CustomFactor):
    """
    Volatility of the cross-sectional mean return over the window, scaled by
    sqrt(260), broadcast to every asset.
    """
    inputs = [DailyReturns()]
    window_length = 1
    window_safe = True

    def compute(self, today, assets, out, returns):
        """Average returns across assets per day, then take the scaled std."""
        mkt_returns = np.nanmean(returns, axis=1)
        centred = mkt_returns - np.nanmean(mkt_returns)
        out[:] = np.sqrt(260. * np.nanmean(centred**2))
        
def sp(group, col1_name, col2_name):
    """Return the Spearman rank correlation between two columns of ``group``."""
    rho, _p_value = spearmanr(group[col1_name], group[col2_name])
    return rho

def train_test_split(all_x, all_y, Test_Date):
    """
    Split samples into "everything before ``Test_Date``" and "``Test_Date``".

    Parameters
    ----------
    all_x : pd.DataFrame
        Features whose first index level holds the sample dates. Rows are
        assumed to be ordered by date, so each date forms a contiguous run.
    all_y : pd.Series
        Targets aligned row-for-row with ``all_x``.
    Test_Date : datetime-like
        Date, comparable with the first index level, that forms the test set.

    Returns
    -------
    x_train, x_test, y_train, y_test
        Train parts are all rows positioned before the first ``Test_Date``
        row; test parts run from the first to the last ``Test_Date`` row.

    Raises
    ------
    IndexError
        If ``Test_Date`` does not occur in the first index level.
    """
    # Read the dates straight from the index: this avoids building
    # reset_index() copies and works whether or not the levels are named.
    dates = all_x.index.get_level_values(0)
    hits = np.flatnonzero(dates == Test_Date)
    if hits.size == 0:
        raise IndexError(
            'Test_Date {!r} is not present in the first index level'.format(Test_Date))

    first_pos = int(hits[0])
    last_pos = int(hits[-1])

    x_train = all_x.iloc[:first_pos]
    x_test = all_x.iloc[first_pos:last_pos + 1]
    y_train = all_y.iloc[:first_pos]
    y_test = all_y.iloc[first_pos:last_pos + 1]
    return x_train, x_test, y_train, y_test

def get_IC(factor_data):
    """
    Collect the information coefficient of every factor and show its tear sheet.

    ``factor_data`` maps a factor name to the cleaned alphalens frame for that
    factor. For each entry the first column of
    ``al.performance.factor_information_coefficient`` is stored under the
    factor name, a banner is printed and the full tear sheet is rendered.

    Returns a DataFrame with one IC column per factor.
    """
    banner = '*************************************\n#####################################\n'
    ic_by_factor = pd.DataFrame()

    for name, clean_frame in factor_data.items():
        ic_frame = al.performance.factor_information_coefficient(clean_frame)
        ic_by_factor[name] = ic_frame.iloc[:,0]
        print(banner, name, '\n')
        al.tears.create_full_tear_sheet(clean_frame)

    return ic_by_factor

def Alpha_Score(data, samples, classifier, factors,max_loss=0.35):
    """
    Turn class probabilities into a signed score.

    The probabilities from ``classifier.predict_proba(samples)`` are weighted
    by -1 (first class) and +1 (second class) and summed per sample.
    ``data``, ``factors`` and ``max_loss`` are accepted but not used.
    """
    class_weights = np.array([-1, 1])
    probabilities = classifier.predict_proba(samples)
    return probabilities.dot(class_weights)

def non_overlapping_samples(x, y, n_skip_samples, start_i=0):
    """
    Keep every ``n_skip_samples + 1``-th date of the samples.

    Parameters
    ----------
    x : pd.DataFrame
        Features indexed by a MultiIndex whose first level is the date.
    y : pd.Series
        Targets indexed like ``x``.
    n_skip_samples : int
        Number of dates to skip between two kept dates.
    start_i : int, optional
        Position (among the dates present in ``x``) of the first kept date.

    Returns
    -------
    non_overlapping_x, non_overlapping_y
        The rows of ``x`` and ``y`` that fall on the kept dates.
    """
    assert len(x.shape) == 2
    assert len(y.shape) == 1

    # ``index.levels`` keeps labels that were removed by earlier slicing or
    # dropna, so the stride must be taken over the pruned levels; otherwise
    # dates that no longer exist in ``x`` would shift which dates are kept.
    present_dates = x.index.remove_unused_levels().levels[0].tolist()
    subsampled_idx = present_dates[start_i :: n_skip_samples + 1]

    non_overlapping_x = x.loc[subsampled_idx]
    non_overlapping_y = y.loc[subsampled_idx]

    return non_overlapping_x, non_overlapping_y

class NoOverlapVoterAbstract(VotingClassifier):
    """
    Voting ensemble whose members are each trained on a different
    non-overlapping subsample of the data.

    Subclasses supply the two hooks ``_calculate_oob_score`` and
    ``_non_overlapping_estimators``; ``fit`` wires them together.
    """
    @abc.abstractmethod
    def _calculate_oob_score(self, classifiers):
        """Return an aggregate out-of-bag score for the fitted ``classifiers``."""
        raise NotImplementedError

    @abc.abstractmethod
    def _non_overlapping_estimators(self, x, y, classifiers, n_skip_samples):
        """Fit ``classifiers`` on non-overlapping subsets of ``x``/``y`` and return them."""
        raise NotImplementedError

    def __init__(self, estimator, voting='soft', n_skip_samples=9):
        """
        Parameters
        ----------
        estimator : estimator object
            Template classifier; one entry per subsample is created from it.
        voting : str, optional
            Voting mode forwarded to ``VotingClassifier`` (default ``'soft'``).
        n_skip_samples : int, optional
            Number of dates skipped between kept dates; ``n_skip_samples + 1``
            ensemble members are created.
        """
        # One (name, estimator) pair per subsample offset; the same template
        # object is reused here and cloned later in ``fit``.
        estimators = [('clf'+str(i), estimator) for i in range(n_skip_samples + 1)]

        self.n_skip_samples = n_skip_samples
        # ``voting`` is passed by keyword: current scikit-learn releases accept
        # only ``estimators`` positionally.
        super().__init__(estimators, voting=voting)

    def fit(self, X, y, sample_weight=None):
        """
        Fit one clone of the template per non-overlapping subsample.

        Sets ``le_``, ``classes_``, ``estimators_``, ``named_estimators_`` and
        ``oob_score_`` and returns ``self``. ``sample_weight`` is accepted for
        signature compatibility but is not used.
        """
        estimator_names, clfs = zip(*self.estimators)
        self.le_ = LabelEncoder().fit(y)
        self.classes_ = self.le_.classes_

        clone_clfs = [clone(clf) for clf in clfs]
        self.estimators_ = self._non_overlapping_estimators(X, y, clone_clfs, self.n_skip_samples)
        self.named_estimators_ = Bunch(**dict(zip(estimator_names, self.estimators_)))
        self.oob_score_ = self._calculate_oob_score(self.estimators_)

        return self
    
def calculate_oob_score(classifiers):
    """
    Return the mean of the ``oob_score_`` attributes of ``classifiers``.
    """
    total_oob = sum(clf.oob_score_ for clf in classifiers)
    return total_oob / len(classifiers)

def non_overlapping_estimators(x, y, classifiers, n_skip_samples):
    """
    Fit each classifier on its own non-overlapping subsample.

    The classifier at position ``k`` is fitted on the samples returned by
    ``non_overlapping_samples(x, y, n_skip_samples, k)``. Returns the list of
    values returned by each ``fit`` call, in the same order.
    """
    def _fit_on_offset(offset, clf):
        x_sub, y_sub = non_overlapping_samples(x, y, n_skip_samples, offset)
        return clf.fit(x_sub, y_sub)

    return [_fit_on_offset(offset, clf) for offset, clf in enumerate(classifiers)]

class NoOverlapVoter(NoOverlapVoterAbstract):
    """
    Concrete voter that implements both hooks of ``NoOverlapVoterAbstract`` by
    delegating to the module-level functions of the same names.
    """
    def _calculate_oob_score(self, classifiers):
        """Delegate to ``calculate_oob_score``."""
        return calculate_oob_score(classifiers)
        
    def _non_overlapping_estimators(self, x, y, classifiers, n_skip_samples):
        """Delegate to ``non_overlapping_estimators``."""
        return non_overlapping_estimators(x, y, classifiers, n_skip_samples)
    


In [None]:
# Load the 'XSHG' trading calendar and the registered CSV bundle, then build
# the pipeline engine from them via the project helper.
trading_calendar = get_calendar('XSHG')
bundle_data = bundles.load('custom-csvdir-bundle')
engine = project_helper.build_pipeline_engine(bundle_data, trading_calendar)
# Date range of the factor computation (both timestamps are UTC).
universe_end_date = pd.Timestamp('2020-10-27', tz='UTC')
# NOTE(review): the two lines below look like a list of end dates used for
# other runs — confirm their meaning before relying on them.
# 2020-10-27;2020-09-28;2020-08-28;2020-07-28;2020-06-24;2020-05-28;2020-04-28;2020-03-27;2020-02-28(3);2020-01-23(4);
# 2019-12-27;2019-11-28;2019-10-28;2019-09-27;2019-08-28;2019-07-26
# factor_start_date = universe_end_date - pd.DateOffset(years=6, days=1)
factor_start_date = pd.Timestamp('2014-07-22', tz='UTC')

# Bare attribute access; its value is not stored or displayed here.
bundle_data.equity_daily_bar_reader

# Universe: top 500 assets by 120-day average dollar volume.
universe = AverageDollarVolume(window_length=120).top(500)
sector = project_helper.Sector()

# One pipeline column per factor / feature; the universe is used as screen.
pipeline = Pipeline(screen=universe)
# Alpha factors.
pipeline.add(momentum_1yr(252, universe, sector),'Momentum_1YR')
pipeline.add(mean_reversion_5day_sector_neutral_smoothed(20, universe, sector),'Mean_Reversion_Sector_Neutral_Smoothed')
pipeline.add(overnight_sentiment_smoothed(2, 10, universe),'Overnight_Sentiment_Smoothed')
# Per-asset features, each ranked and z-scored.
pipeline.add(AnnualizedVolatility(window_length=20, mask=universe).rank().zscore(), 'volatility_20d')
pipeline.add(AnnualizedVolatility(window_length=120, mask=universe).rank().zscore(), 'volatility_120d')
pipeline.add(AverageDollarVolume(window_length=20, mask=universe).rank().zscore(), 'adv_20d')
pipeline.add(AverageDollarVolume(window_length=120, mask=universe).rank().zscore(), 'adv_120d')
pipeline.add(sector, 'sector_code')
# Market-wide features: smoothed dispersion and market volatility.
pipeline.add(SimpleMovingAverage(inputs=[MarketDispersion(mask=universe)], window_length=20), 'dispersion_20d')
pipeline.add(SimpleMovingAverage(inputs=[MarketDispersion(mask=universe)], window_length=120), 'dispersion_120d')
pipeline.add(MarketVolatility(window_length=20), 'market_vol_20d')
pipeline.add(MarketVolatility(window_length=120), 'market_vol_120d')
# 10-day return split into 2 quantiles; shifted later to form the target.
pipeline.add(Returns(window_length=10, mask=universe).quantiles(2), 'return_10d')

# Run the pipeline over the whole date range.
all_factors = engine.run_pipeline(pipeline, factor_start_date, universe_end_date)
# Calendar features derived from the date level (level 0) of the index.
sample_dates = all_factors.index.get_level_values(0)
# NOTE: the misspelt column name 'is_Janaury' is kept on purpose because the
# feature list further down in this cell refers to it by that exact name.
all_factors['is_Janaury'] = sample_dates.month == 1
all_factors['is_December'] = sample_dates.month == 12
all_factors['weekday'] = sample_dates.weekday
all_factors['quarter'] = sample_dates.quarter
all_factors['qtr_yr'] = all_factors.quarter.astype('str') + '_' + sample_dates.year.astype('str')

# Flag rows that fall on a business month/quarter end or start between the
# factor start date and the universe end date.
for flag_name, freq in [('month_end', 'BM'), ('month_start', 'BMS'),
                        ('qtr_end', 'BQ'), ('qtr_start', 'BQS')]:
    all_factors[flag_name] = sample_dates.isin(
        pd.date_range(start=factor_start_date, end=universe_end_date, freq=freq))

# Quantile label -1 is replaced by NaN. The result is assigned back
# explicitly: an in-place replace on a column selection is chained assignment
# and is not guaranteed to modify ``all_factors``.
all_factors['return_10d'] = all_factors['return_10d'].replace(to_replace=-1, value=np.nan)

# One boolean indicator column per sector.
sector_columns = []
for sector_i, sector_name in sector_lookup.items():
    sector_column = '{}行业'.format(sector_name)
    sector_columns.append(sector_column)
    all_factors[sector_column] = (all_factors['sector_code'] == sector_i)

# Target: each asset's (index level 1) return_10d label from 10 rows ahead.
all_factors['target_10D'] = all_factors.groupby(level=1)['return_10d'].shift(-10)

# Columns fed to the classifier: factors, market/asset features, calendar
# flags and the per-sector indicator columns.
features = [
    'Mean_Reversion_Sector_Neutral_Smoothed', 'Momentum_1YR',
    'Overnight_Sentiment_Smoothed', 'adv_120d', 'adv_20d',
    'dispersion_120d', 'dispersion_20d', 'market_vol_120d',
    'market_vol_20d', 'volatility_20d',
    'is_Janaury', 'is_December', 'weekday',
    'month_end', 'month_start', 'qtr_end', 'qtr_start'] + sector_columns
target_label = 'target_10D'
# Rows with a NaN in any column (not just the selected features) are dropped
# before X and y are taken.
temp = all_factors.dropna().copy()
X = temp[features]
y = temp[target_label].astype(int)

factor_names = [
    'Mean_Reversion_Sector_Neutral_Smoothed',
    'Momentum_1YR',
    'Overnight_Sentiment_Smoothed'
    ]

# Random-forest configuration.
clf_random_state = 0
n_days = 10
n_stocks = 500
clf_parameters = {
    'criterion': 'entropy','min_samples_leaf': n_stocks * n_days,
    'oob_score': True,'n_jobs': -1, 'random_state': clf_random_state}
n_trees = 500
n_skip_samples = 9

# Walk-forward configuration.
# NOTE(review): 1223 is a hard-coded position in ``Dates``; it presumably
# corresponds to the first out-of-sample date — confirm against the data.
first_test_pos = 1223
# A refit happens once 22 dates have been queued since the previous refit
# (i.e. on every 23rd date) and always on the last date.
max_days_between_refits = 22

AlphaScores = []
Dates = X.reset_index()['level_0'].drop_duplicates().values

# NOTE(review): every refit trains on all rows strictly before ``Date``, while
# the batch it scores also contains the rows of up to 22 earlier dates, which
# are part of that training set. Confirm this is intended.
pending_test_frames = []
days_since_refit = 0
for Date in Dates[first_test_pos:]:
    X_train, X_test, y_train, y_test = train_test_split(X, y, Date)
    pending_test_frames.append(X_test)

    is_refit_day = (days_since_refit >= max_days_between_refits) or (Date == Dates[-1])
    if not is_refit_day:
        days_since_refit += 1
        continue

    print(Date)
    days_since_refit = 0

    clf = RandomForestClassifier(n_estimators=n_trees, **clf_parameters)
    clf_nov = NoOverlapVoter(clf, n_skip_samples=n_skip_samples)
    clf_nov.fit(X_train, y_train)

    # Score all dates queued since the previous refit in one go; building the
    # batch with a single concat avoids repeated DataFrame.append calls.
    X_test_F = pd.concat(pending_test_frames)
    AlphaScore = Alpha_Score(all_factors, X_test_F, clf_nov, factor_names)
    AlphaScores = np.hstack((AlphaScores, AlphaScore))
    pending_test_frames = []


In [None]:
def Get_Date(all_x, all_y, Start_Date):
    """
    Return the samples from ``Start_Date`` onwards.

    Parameters
    ----------
    all_x : pd.DataFrame
        Features whose first index level holds the sample dates, ordered by
        date.
    all_y : pd.Series
        Targets aligned row-for-row with ``all_x``.
    Start_Date : datetime-like or str
        First date to keep; must occur in the first index level.

    Returns
    -------
    X_Train, y_Train
        All rows of ``all_x`` / ``all_y`` positioned at or after the first
        ``Start_Date`` row.

    Raises
    ------
    IndexError
        If ``Start_Date`` does not occur in the first index level.
    """
    # Read the dates from the index directly instead of building
    # reset_index() copies; this also works when the index levels are named.
    dates = all_x.index.get_level_values(0)
    hits = np.flatnonzero(dates == Start_Date)
    if hits.size == 0:
        raise IndexError(
            'Start_Date {!r} is not present in the first index level'.format(Start_Date))

    start_pos = int(hits[0])
    X_Train = all_x.iloc[start_pos:]
    y_Train = all_y.iloc[start_pos:]
    return X_Train, y_Train

# Data portal over the loaded bundle: daily bars only (no minute reader),
# using the bundle's asset finder and adjustment reader.
data_portal = DataPortal(
    bundle_data.asset_finder,
    trading_calendar=trading_calendar,
    first_trading_day=bundle_data.equity_daily_bar_reader.first_trading_day,
    equity_minute_reader=None,
    equity_daily_reader=bundle_data.equity_daily_bar_reader,
    adjustment_reader=bundle_data.adjustment_reader)

def get_pricing(data_portal, trading_calendar, assets, start_date, end_date, field='close'):
    """
    Fetch a daily history window of one pricing field for a set of assets.

    This re-defines the function of the same name from an earlier cell; once
    this cell has run, this definition is the one in effect.

    Parameters
    ----------
    data_portal : object
        Must provide ``get_history_window``; its result is returned unchanged.
    trading_calendar : object
        ``trading_calendar.closes.index`` must contain both (UTC, midnight)
        session labels derived from ``start_date`` and ``end_date``.
    assets : sequence
        Assets forwarded to ``get_history_window``.
    start_date, end_date : datetime-like
        Objects with ``strftime``; only their calendar date is used.
    field : str, optional
        Pricing field to request (default ``'close'``).

    Returns
    -------
    The value returned by ``data_portal.get_history_window``.

    Raises
    ------
    KeyError
        If either date is not a label of ``trading_calendar.closes.index``.
    """
    # The long-deprecated ``offset`` keyword of ``pd.Timestamp`` is deliberately
    # not passed: it only tagged the timestamp with a frequency and is rejected
    # by newer pandas releases.
    end_dt = pd.Timestamp(end_date.strftime('%Y-%m-%d'), tz='UTC')
    start_dt = pd.Timestamp(start_date.strftime('%Y-%m-%d'), tz='UTC')

    sessions = trading_calendar.closes.index
    end_loc = sessions.get_loc(end_dt)
    start_loc = sessions.get_loc(start_dt)

    # bar_count is the positional distance between the two sessions, so the
    # window ends on end_dt and holds (end_loc - start_loc) bars.
    return data_portal.get_history_window(
        assets=assets,
        end_dt=end_dt,
        bar_count=end_loc - start_loc,
        frequency='1d',
        field=field,
        data_frequency='daily')
# Pricing (default field 'close') for every asset listed in the second index
# level of ``all_factors`` over the full factor date range.
all_assets = all_factors.index.levels[1].values.tolist()
all_pricing = get_pricing(
    data_portal,
    trading_calendar,
    all_assets,
    factor_start_date,
    universe_end_date)

def get_IC(factor_data):
    """
    Collect the information coefficient of every factor and show its tear sheet.

    This re-defines the function of the same name from an earlier cell.
    ``factor_data`` maps a factor name to the cleaned alphalens frame for that
    factor. For each entry the first column of
    ``al.performance.factor_information_coefficient`` is stored under the
    factor name, a banner is printed and the full tear sheet is rendered.

    Returns a DataFrame with one IC column per factor.
    """
    banner = '*************************************\n#####################################\n'
    ic_by_factor = pd.DataFrame()

    for name, clean_frame in factor_data.items():
        ic_frame = al.performance.factor_information_coefficient(clean_frame)
        ic_by_factor[name] = ic_frame.iloc[:,0]
        print(banner, name, '\n')
        al.tears.create_full_tear_sheet(clean_frame)

    return ic_by_factor

def build_factor_data(factor_data, pricing,max_loss=0.35,periods=[1,5,10]):
    """
    Build the cleaned alphalens input for every factor column.

    Parameters
    ----------
    factor_data : pd.DataFrame
        One column per factor.
    pricing : object
        Prices forwarded as ``prices`` to alphalens.
    max_loss : float, optional
        Forwarded as ``max_loss`` to alphalens.
    periods : sequence of int, optional
        Forward-return periods forwarded to alphalens. The default list is
        only read, never mutated.

    Returns
    -------
    dict
        Factor name -> result of
        ``al.utils.get_clean_factor_and_forward_returns`` for that column.
    """
    # ``DataFrame.items()`` replaces ``iteritems()``, which newer pandas no
    # longer provides; both yield (column name, column Series) pairs.
    return {factor_name: al.utils.get_clean_factor_and_forward_returns(factor=data, prices=pricing,max_loss=max_loss, periods=periods)
        for factor_name, data in factor_data.items()}

def show_sample_results(data, samples, factors,alpha_score, pricing=all_pricing ,max_loss=0.35):
    """
    Evaluate the model's alpha score next to the individual factors.

    The rows of ``data`` selected by ``samples.index`` get ``alpha_score``
    attached as an extra column; the chosen ``factors`` plus that column are
    cleaned for alphalens with a single 10-period horizon. Sharpe ratios are
    printed, factor returns and rank autocorrelation are plotted, and the
    information coefficients are computed via ``get_IC``.

    Returns the IC DataFrame produced by ``get_IC``.
    """
    alpha_score_label = '增强_ALPHA'

    # Attach the alpha score to the sampled factor rows.
    scored_factors = data.loc[samples.index].copy()
    scored_factors[alpha_score_label] = alpha_score

    # Prepare the alphalens input.
    print('Cleaning Data...\n')
    columns_to_evaluate = factors + [alpha_score_label]
    factor_data = build_factor_data(
        scored_factors[columns_to_evaluate],
        pricing,
        max_loss=max_loss,
        periods=[10])
    print('\n-----------------------\n')

    # Factor returns and their Sharpe ratios.
    factor_returns = project_helper.get_factor_returns(factor_data)
    sharpe_ratio = project_helper.sharpe_ratio(
        factor_returns, annualization_factor=np.sqrt(26))

    # Report.
    print('             Sharpe Ratios')
    print(sharpe_ratio.round(2))
    project_helper.plot_factor_returns(factor_returns, period=10)
    project_helper.plot_factor_rank_autocorrelation(factor_data)

    factor_IC = get_IC(factor_data)
    print(factor_IC.mean())
    return factor_IC


# Evaluate the alpha scores on all samples from the given start date onwards.
# NOTE(review): ``AlphaScores`` is assigned row-wise to these samples, so its
# length must equal the number of rows of ``X_Final`` — confirm that this start
# date matches the first date scored by the training loop.
X_Final, y_Final= Get_Date(X, y, '2019-07-26T00:00:00.000000000')
factor_IC = show_sample_results(all_factors, X_Final, factor_names,alpha_score = AlphaScores)

In [4]:
# Persist the computed alpha scores to disk.
np.save("AlphaScores_10D_M.npy",AlphaScores)

In [11]:
# Row position in X of the first sample dated 2019-07-26.
X.reset_index()['level_0'].drop_duplicates().loc[X.reset_index()['level_0'].drop_duplicates() == '2019-07-26', ].index[0]

435382

In [2]:
# Reload previously saved alpha scores.
# NOTE(review): this reads "AlphaScores.npy", whereas the save cell in this
# notebook writes "AlphaScores_10D_M.npy" — confirm which file is intended.
AlphaScores = np.load("AlphaScores.npy")