In [1]:
import itertools
import os

import matplotlib
matplotlib.use("agg")    # must select backend before importing pyplot
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import numpy as np
import pandas as pd

In [2]:
import dimod
from dwave.system import DWaveCliqueSampler

In [3]:
# Define MI calculations
def prob(dataset):
    """Estimate the joint probability table of the columns of ``dataset``.

    Every column gets one histogram axis with as many bins as the column has
    distinct values; the resulting multi-dimensional counts are normalized so
    that the whole table sums to 1.
    """
    _, column_count = dataset.shape

    # One bin count per column, equal to that column's number of unique values.
    bins_per_column = []
    for column in range(column_count):
        bins_per_column.append(len(np.unique(dataset[:, column])))

    counts, _edges = np.histogramdd(dataset, bins_per_column)
    return counts / np.sum(counts)


def shannon_entropy(p):
    """Return the Shannon entropy H(X), in bits, of probability table ``p``.

    The table is flattened and -P*log2(P) is accumulated over its entries;
    zero entries are skipped.
    """
    accumulated = 0
    for value in p.flatten():
        # Zero-probability cells are skipped: log2(0) is undefined.
        if value:
            accumulated += value * np.log2(value)
    return -accumulated


def conditional_shannon_entropy(p, *conditional_indices):
    """Conditional Shannon entropy H(X|Y) = H(X,Y) - H(Y).

    ``conditional_indices`` name the axes of ``p`` that are conditioned on
    (the Y part). Every other axis is summed out to obtain P(Y).
    """
    # Catch stale indices: if a dimension was already marginalized out of the
    # table but the caller did not shift its conditional indices, an index
    # can point past the last remaining axis.
    assert all(index < p.ndim for index in conditional_indices)

    # Axes to sum away are exactly the ones that are NOT conditioned on.
    summed_axes = tuple(axis for axis in range(p.ndim)
                        if axis not in conditional_indices)

    joint_entropy = shannon_entropy(p)
    conditioning_entropy = shannon_entropy(np.sum(p, axis=summed_axes))
    return joint_entropy - conditioning_entropy


def mutual_information(prob, j):
    """Mutual information I(X; Y) = H(X) - H(X|Y) from a joint table.

    Axis ``j`` of ``prob`` plays the role of Y; the remaining axes form X.
    """
    # H(X): entropy of the table with Y (axis j) summed out.
    entropy_x = shannon_entropy(np.sum(prob, axis=j))
    # H(X|Y): entropy of the full table conditioned on axis j.
    entropy_x_given_y = conditional_shannon_entropy(prob, j)
    return entropy_x - entropy_x_given_y


def conditional_mutual_information(p, j, *conditional_indices):
    """Conditional mutual information I(X;Y|Z) = H(X|Z) - H(X|Y,Z).

    Axis ``j`` of ``p`` is Y and ``conditional_indices`` are the Z axes.
    """
    # Summing out axis j removes one dimension from the table, so every
    # conditional index that sat after j has to move down by one to keep
    # pointing at the same variable in the marginal table.
    shifted_indices = tuple(index - 1 if index > j else index
                            for index in conditional_indices)

    table_without_y = np.sum(p, axis=j)
    entropy_given_z = conditional_shannon_entropy(table_without_y, *shifted_indices)
    entropy_given_yz = conditional_shannon_entropy(p, j, *conditional_indices)
    return entropy_given_z - entropy_given_yz


def maximum_energy_delta(bqm):
    """Conservative bound on the energy change from flipping one variable.

    For each variable the bound is |linear bias| plus the sum of |quadratic
    bias| over its neighbors; the largest per-variable bound is returned.
    """
    per_variable_bounds = []
    for variable in bqm.iter_variables():
        coupling_total = 0
        for neighbor in bqm.iter_neighbors(variable):
            coupling_total += abs(bqm.get_quadratic(variable, neighbor))
        per_variable_bounds.append(abs(bqm.get_linear(variable)) + coupling_total)
    return max(per_variable_bounds)


def mutual_information_bqm(dataset, features, target):
    """Build a BQM that maximizes MI between the target and a subset of features.

    Each feature gets a linear bias equal to the negated mutual information
    computed from the joint table of (target, feature). Each ordered pair
    (f0, f1) contributes a quadratic bias equal to the negated conditional
    mutual information computed from the joint table of (target, f0, f1),
    with f0 as the variable and f1 as the condition.
    """
    def linear_bias(feature):
        joint = prob(dataset[[target, feature]].values)
        return -mutual_information(joint, 1)

    def quadratic_bias(first, second):
        joint = prob(dataset[[target, first, second]].values)
        return -conditional_mutual_information(joint, 1, 2)

    linear_terms = ((feature, linear_bias(feature)) for feature in features)
    # permutations (not combinations): both orderings of a pair are emitted.
    quadratic_terms = ((first, second, quadratic_bias(first, second))
                       for first, second in itertools.permutations(features, 2))
    return dimod.BinaryQuadraticModel(linear_terms, quadratic_terms, 0, dimod.BINARY)


def add_combination_penalty(bqm, k, penalty):
    """Return a new BQM combining ``bqm`` with a k-combinations penalty.

    A penalty model over ``bqm``'s variables is generated with strength
    ``penalty`` and then updated with the terms of ``bqm``.
    """
    penalized = dimod.generators.combinations(bqm.variables, k, strength=penalty)
    penalized.update(bqm)
    return penalized


def mutual_information_feature_selection(dataset, features, target, num_reads=5000):
    """Run the MIFS algorithm on a QPU solver.

    Returns a ``(len(features), len(features))`` array. Row ``k-1`` holds the
    sample set's ``first`` sample for the problem constrained to ``k``
    features, with one column per entry of ``features`` (in that order).
    """
    feature_count = len(features)

    # QPU sampler that embeds to a fully-connected graph of all the variables
    clique_sampler = DWaveCliqueSampler()

    base_bqm = mutual_information_bqm(dataset, features, target)

    # Penalty strength taken from the single-flip energy bound, so that the
    # solution will satisfy the k-feature constraint.
    strength = maximum_energy_delta(base_bqm)

    selected_features = np.zeros((feature_count, feature_count))

    # For each number of features, k, penalize selection of fewer or more features
    for row, k in enumerate(range(1, feature_count + 1)):
        constrained_bqm = add_combination_penalty(base_bqm, k, strength)
        sampleset = clique_sampler.sample(constrained_bqm,
                                          label='Example - MI Feature Selection',
                                          num_reads=num_reads)
        best_sample = sampleset.first.sample
        for column, feature in enumerate(features):
            selected_features[row, column] = best_sample[feature]
    return selected_features

In [4]:
def run_demo1(dataset, target):
    """Rank every non-target column by its mutual information with ``target``.

    Args:
        dataset: DataFrame containing the ``target`` column and the candidate
            feature columns.
        target: Name of the target column.

    Returns:
        A ``(labels, values, scores)`` tuple. ``scores`` maps each feature
        name to its MI with the target; ``labels`` and ``values`` are the
        feature names and their scores sorted by descending score. When there
        are no feature columns, ``labels`` and ``values`` are empty tuples.
    """
    # Walk the columns in DataFrame order instead of through a set: set
    # iteration order for strings changes between interpreter runs, which made
    # the dict order (and therefore the order of tied scores after the stable
    # sort below) non-reproducible.
    feature_columns = [column for column in dataset.columns if column != target]

    scores = {feature: mutual_information(prob(dataset[[target, feature]].values), 0)
              for feature in feature_columns}

    # zip(*[]) yields nothing to unpack, so handle the no-feature case here.
    if not scores:
        return (), (), scores

    ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
    labels, values = zip(*ranked)
    return labels, values, scores
    # The MI ranking returned above is plotted separately by plotting_1().
    # The Titanic dataset provides a familiar, intuitive example available in the public
    # domain. In itself, however, it is not a good fit for solving by sampling. Run naively on
    # this dataset, sampling finds numerous good solutions but is unlikely to find the exact
    # optimal solution. There are many techniques for reformulating problems for the D-Wave
    # system that can improve performance on various metrics, some of which can help narrow
    # down good solutions to approach an optimal solution more closely.
    # The demo below solves the problem for just the highest-scoring features.
def run_demo2(dataset, scores, target, keep=8):
    """Run MI feature selection on the ``keep`` highest-scoring features.

    Args:
        dataset: DataFrame containing the ``target`` column and every feature
            named in ``scores``.
        scores: Mapping of feature name to its MI score with the target.
        target: Name of the target column.
        keep: How many top-ranked features to pass on to the sampler
            (default 8, the value that used to be hard-coded). ``None``
            keeps every scored feature.

    Returns:
        A ``(features, selected_features)`` tuple: the alphabetically sorted
        names of the retained features, and the selection array returned by
        ``mutual_information_feature_selection`` for them.
    """
    ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
    top_features = [name for name, _score in ranked[:keep]]

    # Restrict the frame to the retained features plus the target; a new name
    # is used so the caller's argument is not rebound inside the function.
    reduced = dataset[top_features + [target]]
    features = sorted(set(reduced.columns) - {target})

    selected_features = mutual_information_feature_selection(reduced, features, target)
    return features, selected_features


In [10]:
def plotting_1(labels, values, target):
    """Save a bar chart of per-feature mutual information.

    Args:
        labels: Feature names, one per bar (used as x tick labels).
        values: Bar heights, in the same order as ``labels``.
        target: Target name; used in the y-axis label and the file name.

    The chart is written to ``plots_MI_<target>.png`` in the current working
    directory. The figure is closed afterwards, so it is no longer available
    through pyplot once this function returns.
    """
    positions = np.arange(len(labels))
    name = "plots_MI_" + target + ".png"

    fig, ax = plt.subplots()
    try:
        # Reserve the lower half of the figure for the rotated tick labels.
        fig.subplots_adjust(bottom=0.5)
        ax.bar(positions, values)
        ax.set_title("Mutual Information")
        ax.set_ylabel('MI Between ' + target + ' and Feature')
        ax.set_xticks(positions)
        ax.set_xticklabels(labels, rotation=90, fontsize=11)
        fig.savefig(name)
    finally:
        # pyplot keeps a reference to every figure until it is closed; without
        # this, each call leaves one more figure alive.
        plt.close(fig)

def plotting2(selected_features,features,target):
    """Save a grid image of the selected feature set per number of features.

    Args:
        selected_features: 2-D array drawn with ``imshow``; rows are labelled
            1..len(features) and columns are labelled with ``features``.
        features: Feature names used for the x tick labels.
        target: Target name; used in the output file name.

    The image is written to ``plots_slecetedFeatures_<target>.png`` in the
    current working directory (the spelling of the file name is kept as-is so
    existing consumers of that path keep working). The figure is closed
    afterwards.
    """
    tick_positions = np.arange(len(features))
    grid_positions = np.arange(-0.5, len(features))
    name = "plots_slecetedFeatures_" + target + ".png"

    fig = plt.figure()
    try:
        # Reserve the lower half of the figure for the rotated tick labels.
        fig.subplots_adjust(bottom=0.5)
        ax2 = fig.add_subplot(1, 1, 1)
        ax2.set_title("Best Feature Selection")
        ax2.set_ylabel('Number of Selected Features')
        ax2.set_xticks(tick_positions)
        ax2.set_xticklabels(features, rotation=90)
        ax2.set_yticks(tick_positions)
        ax2.set_yticklabels(np.arange(1, len(features)+1))
        # Minor ticks sit on the cell borders so the grid outlines each cell.
        ax2.set_xticks(grid_positions, minor=True)
        ax2.set_yticks(grid_positions, minor=True)
        ax2.grid(which='minor', color='black')
        # Two-colour map: 0 is drawn white, 1 is drawn red.
        ax2.imshow(selected_features, cmap=colors.ListedColormap(['white', 'red']))
        fig.savefig(name)
    finally:
        # pyplot keeps a reference to every figure until it is closed; without
        # this, each call leaves one more figure alive.
        plt.close(fig)

In [6]:
def select_feats(df,target):
    """Rank features by MI, run feature selection, save both plots.

    Returns the feature names reported by ``run_demo2``.
    """
    # Step 1: MI score of every column against the target.
    mi_labels, mi_values, mi_scores = run_demo1(df, target)

    # Step 2: sampler-based selection among the top-ranked features.
    chosen = run_demo2(df, mi_scores, target)
    features = chosen[0]
    selection_grid = chosen[1]

    # Step 3: write both figures to disk.
    plotting_1(mi_labels, mi_values, target)
    plotting2(selection_grid, features, target)
    return features

def calc_col_score(df,col,df_lookup):
    """Score how often consecutive rows of ``df[col]`` move in the expected direction.

    For every pair of adjacent rows the difference ``x = row[i] - row[i+1]``
    is taken (by position). A pair counts when

    * the lookup code is ``'a'`` and ``x == abs(x)`` (x is non-negative), or
    * the lookup code is ``'d'`` and ``x == -abs(x)`` (x is non-positive).

    Any other lookup code never counts. NaN differences never count.
    # NOTE(review): 'a'/'d' presumably stand for ascending/descending; confirm
    # against the lookup file, since the meaning depends on the row order.

    Args:
        df: DataFrame holding column ``col``. Rows are compared in their
            stored order; the index labels are not used.
        col: Name of the column to score; also the row label looked up in
            ``df_lookup``.
        df_lookup: DataFrame indexed by feature name whose column ``2`` holds
            the direction code.

    Returns:
        Number of counted pairs divided by ``(number of pairs + 1e-7)``;
        ``0.0`` when there are fewer than two rows.
    """
    look = df_lookup[2].loc[col]
    values = df[col].to_numpy()
    num = len(values) - 1

    # Fewer than two rows: there is no pair to compare.
    if num <= 0:
        return 0.0

    # Positional differences between each row and the next one. Working on the
    # raw array avoids both the per-element .loc lookups and the hidden
    # requirement that the index be exactly 0..n-1.
    diffs = values[:-1] - values[1:]

    if look == 'a':
        score_col = np.count_nonzero(diffs == np.abs(diffs))
    elif look == 'd':
        score_col = np.count_nonzero(diffs == -np.abs(diffs))
    else:
        score_col = 0

    # The small epsilon in the denominator is kept from the original formula.
    return float(score_col) / (num + 1e-7)

def scoring(features,df0,cutoff_date,lookup_path='light_featureDef.csv'):
    """Compute a per-ticker score over the selected features.

    Rows of ``df0`` whose ``calendardate`` is after ``cutoff_date`` are
    grouped by ``ticker``. For each ticker, ``calc_col_score`` is evaluated
    for every feature and the results are averaged.

    Args:
        features: List of feature column names to score.
        df0: DataFrame with ``ticker``, ``calendardate`` and the feature
            columns.
        cutoff_date: Cutoff as a ``yyyymmdd`` string.
        lookup_path: Header-less CSV whose first column is the feature name;
            it is passed to ``calc_col_score`` as the direction lookup.
            Defaults to the previously hard-coded ``light_featureDef.csv``.

    Returns:
        DataFrame indexed by ticker with a single ``score`` column, sorted by
        ascending score.

    Raises:
        ValueError: if ``cutoff_date`` does not match ``%Y%m%d``. (Previously
            ``errors='ignore'`` passed the raw string through to the
            comparison instead of failing at the parse.)
    """
    # Parse strictly: errors='ignore' is deprecated in pandas and would hand
    # an unparsed string to the date comparison below.
    cutoff = pd.to_datetime(cutoff_date, format='%Y%m%d')
    recent = df0[pd.to_datetime(df0['calendardate']) > cutoff]
    df_scoring = recent[['ticker', 'calendardate'] + list(features)]

    # First CSV column becomes the index; remaining columns keep their
    # integer labels (1, 2, ...).
    df_lookup = pd.read_csv(lookup_path, header=None).set_index(0)

    df_finScore = pd.DataFrame(index=df_scoring['ticker'].unique(), columns=['score'])

    for ticker, group in df_scoring.groupby('ticker'):
        # calc_col_score compares consecutive rows, so give it a clean
        # 0..n-1 index for this ticker's rows.
        df_work = group.reset_index(drop=True)
        score = 0
        for col in features:
            score += calc_col_score(df_work, col, df_lookup)
        score /= len(features)
        # Single .loc assignment on the frame itself: the chained form
        # df['score'].loc[...] = ... writes to a temporary under pandas
        # Copy-on-Write and leaves the frame unchanged.
        df_finScore.loc[ticker, 'score'] = score

    return df_finScore.sort_values('score')

In [15]:
def generate_portfolio(cutoff_date,target,file_name,data_path='data/fundamentals_normalized.csv'):
    """Select features for ``target``, score every ticker, and save the scores.

    Args:
        cutoff_date: Cutoff as a ``yyyymmdd`` string; only rows with a later
            ``calendardate`` are used.
        target: Target column, ``'price'`` or ``'log_ret'``. Whichever of the
            two is not the target is dropped before feature selection (any
            value other than ``'price'`` drops ``'price'``).
        file_name: Path of the CSV file the score table is written to.
        data_path: CSV file with the input data. Defaults to the previously
            hard-coded ``data/fundamentals_normalized.csv``.

    Returns:
        The score DataFrame produced by ``scoring``.

    Raises:
        ValueError: if ``cutoff_date`` does not match ``%Y%m%d``.
    """
    dataset0 = pd.read_csv(data_path)

    # Columns excluded from feature selection; the two original lists differed
    # only in which of price / log_ret was dropped.
    other_target = 'log_ret' if target == 'price' else 'price'
    drop_list = ['ticker', 'ticker.1', 'Unnamed: 1', 'index', 'None', 'dimension',
                 'calendardate', 'datekey', 'reportperiod', 'lastupdated', other_target]

    # Parse strictly: errors='ignore' is deprecated in pandas and would hand
    # an unparsed string to the date comparison below.
    cutoff = pd.to_datetime(cutoff_date, format='%Y%m%d')
    after_cutoff = pd.to_datetime(dataset0['calendardate']) > cutoff

    df_target = dataset0[after_cutoff].dropna().drop(drop_list, axis=1)
    features = select_feats(df_target, target)
    df_score = scoring(features, dataset0, cutoff_date)
    df_score.to_csv(file_name, sep=",")

    return df_score

In [16]:
# Build the score table for rows dated after 2020-10-31 with price as the
# target, and write it to data/scores.csv.
score = generate_portfolio(
    cutoff_date='20201031',
    target='price',
    file_name='data/scores.csv',
)