In [3]:
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb
from scipy.stats import norm
import datetime

def rev_ema(x):
    """Exponentially weighted mean of ``x`` computed from the last element backwards.

    The input is reversed, smoothed with ``ewm(alpha=0.98).mean()`` and then
    reversed again, so position ``k`` of the result is a weighted average of
    ``x[k]`` and the elements that follow it.

    Parameters
    ----------
    x : iterable
        Values to smooth; materialised with ``list(x)``.

    Returns
    -------
    pandas.Series
        Smoothed values in the original order, on a fresh default index.
    """
    backwards = pd.Series(list(x)[::-1])
    smoothed = backwards.ewm(alpha=0.98).mean()
    # Flip back so the output lines up positionally with the input.
    return pd.Series(list(smoothed)[::-1])


def hist_count(l1, l2, nums):
    """Count co-occurrences of the pairs ``(l1[k], l2[k])`` in a square matrix.

    Parameters
    ----------
    l1, l2 : sequences of equal length
        Paired values; each is converted with ``int(...)`` and used as a
        row / column index respectively.
    nums : int
        Side length of the returned matrix.

    Returns
    -------
    numpy.ndarray
        ``nums x nums`` float array where entry ``[a, b]`` holds the number of
        positions at which ``l1 == a`` and ``l2 == b``; all other entries are 0.
    """
    pairs = pd.DataFrame({"a": l1, "b": l2})
    pairs['c'] = 0  # dummy column so that count() has something to tally
    pair_counts = pairs.groupby(['a', 'b'])['c'].count()

    w = np.zeros([nums, nums])
    for (a_value, b_value), occurrences in pair_counts.items():
        w[int(a_value), int(b_value)] = occurrences
    return w


def train_op(X_train, y_train, params, num_rounds, X_v, y_v, early):
    """Train an XGBoost booster while monitoring a validation set.

    Parameters
    ----------
    X_train, y_train
        Training features and labels, wrapped in an ``xgb.DMatrix``.
    params : dict
        Booster parameters forwarded to ``xgb.train``.
    num_rounds : int
        Passed as ``num_boost_round``.
    X_v, y_v
        Validation features and labels, wrapped in an ``xgb.DMatrix``.
    early : int
        Passed as ``early_stopping_rounds``.

    Returns
    -------
    The booster returned by ``xgb.train``.
    """
    train_matrix = xgb.DMatrix(X_train, label=y_train)
    valid_matrix = xgb.DMatrix(X_v, label=y_v)
    # NOTE(review): XGBoost documents that early stopping monitors the last
    # entry of `evals` (here the validation set) -- confirm for the installed version.
    watchlist = [(train_matrix, 'train'), (valid_matrix, 'eval')]
    return xgb.train(
        params,
        train_matrix,
        num_boost_round=num_rounds,
        evals=watchlist,
        early_stopping_rounds=early,
    )


def f_importance(bst, f_to_use):
    """Return each feature's share of the booster's total f-score.

    Parameters
    ----------
    bst
        Object exposing ``get_fscore()`` that returns a ``{feature: score}``
        mapping (e.g. a trained XGBoost booster).
    f_to_use : iterable
        Feature names to include as keys of the result.

    Returns
    -------
    dict
        ``{feature: [score / total_score]}``. Features in ``f_to_use`` that do
        not appear in ``get_fscore()`` keep an empty list.

    Raises
    ------
    KeyError
        If ``get_fscore()`` reports a feature that is not in ``f_to_use``.
    """
    importance_dic = {f: [] for f in f_to_use}

    # Query the booster once: the score mapping does not change between
    # iterations, so re-fetching it inside the loop is wasted work.
    fscore = bst.get_fscore()
    total = sum(fscore.values())

    for feature_name, feature_score in fscore.items():
        importance_dic[feature_name].append(feature_score / total)
    return importance_dic


def f_importance_bar(importance_dic, top=20):
    """Draw a horizontal bar chart of the features with the largest mean importance.

    Parameters
    ----------
    importance_dic : dict
        ``{feature: list_of_importances}``. The mean of each list is plotted;
        an empty list (or a NaN mean) counts as 0.
    top : int, optional
        Maximum number of features to show, ranked by mean importance
        (default 20).

    Returns
    -------
    None
        The chart is drawn on the current matplotlib axes. Nothing is drawn
        when there is no feature to show.
    """
    name = np.array(list(importance_dic))

    importance_mean = []
    for importance in importance_dic.values():
        scores = np.asarray(importance, dtype=float)
        # Skip np.mean on empty input: it would emit a RuntimeWarning and
        # return NaN, which is mapped to 0 anyway.
        mean_score = float(scores.mean()) if scores.size else 0.0
        importance_mean.append(0.0 if np.isnan(mean_score) else mean_score)

    # sorted() is stable, so ties keep the dictionary's insertion order.
    ranked = sorted(range(len(importance_mean)),
                    key=lambda idx: importance_mean[idx],
                    reverse=True)
    top_index = ranked[:max(top, 0)]
    if not top_index:
        # Nothing to plot (empty dict or top <= 0); indexing the empty
        # ranking previously failed with an IndexError.
        return

    top_values = np.array([importance_mean[idx] for idx in top_index])
    sns.barplot(y=name[top_index], x=top_values, orient='h', alpha=0.8, color='red')
    plt.ylabel('Factor', fontsize=10)
    plt.xlabel('Importance', fontsize=10)
    plt.xticks(rotation='horizontal')
    plt.yticks(fontsize=17)



def add_return(re_list, turnover=None, cost_rate=0.002):
    """Accumulate per-period returns additively into a net-worth path.

    Parameters
    ----------
    re_list : sequence of float
        Per-period returns, indexed positionally with ``re_list[i]``.
    turnover : sequence of float, optional
        Per-period turnover, same length as ``re_list``. ``None`` (default)
        means zero turnover in every period.
    cost_rate : float, optional
        Cost charged per unit of turnover (default 0.002).

    Returns
    -------
    list
        Net worth starting at 1, with ``len(re_list) + 1`` entries. Each step
        adds the period return and subtracts ``cost_rate * turnover``.
    """
    # `is None` rather than `== None`: comparing a NumPy array or pandas
    # Series with `==` is element-wise and cannot be used as an `if` condition.
    if turnover is None:
        turnover = [0] * len(re_list)

    net_worth = [1]
    for i in range(len(re_list)):
        net_worth.append(net_worth[-1] + re_list[i] - cost_rate * turnover[i])
    return net_worth


def nshift(df, gb_col, col, n):
    """Add within-group lagged copies of ``col`` to ``df``.

    For every lag ``k`` in ``1 .. n-1`` a column named ``col + str(k)`` is
    written, holding ``col`` shifted by ``k`` rows inside each ``gb_col`` group.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend; modified in place.
    gb_col : str
        Column that defines the groups.
    col : str
        Column to lag.
    n : int
        Exclusive upper bound on the lag (``n - 1`` columns are created).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the lag columns added.
    """
    for lag in range(1, n):
        df[col + str(lag)] = df.groupby([gb_col])[col].shift(lag)
    return df


def norm_pcol(df, col, adjust_col, n):
    """Rescale the lagged columns ``col1 .. col(n-1)`` by an adjusted ``close``.

    Each lag column ``col + str(k)`` is divided by
    ``close - adjust_col + adjust_col<k>`` and overwritten with the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``'close'``, ``adjust_col`` and, for each lag ``k``, the
        columns ``col + str(k)`` and ``adjust_col + str(k)``. Modified in place.
    col : str
        Prefix of the lag columns to normalise.
    adjust_col : str
        Name of the adjustment column (and prefix of its lag columns).
    n : int
        Exclusive upper bound on the lag.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    for lag in range(1, n):
        suffix = str(lag)
        target = col + suffix
        numerator = df[target]
        denominator = df['close'] - df[adjust_col] + df[adjust_col + suffix]
        df[target] = numerator / denominator
    return df


def norm_vcol(df, col, n):
    """Express the lagged columns ``col1 .. col(n-1)`` as ratios to ``col``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``col`` and ``col + str(k)`` for every lag ``k`` in
        ``1 .. n-1``. Modified in place.
    col : str
        Base column; also the prefix of the lag columns.
    n : int
        Exclusive upper bound on the lag.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    lag_columns = [col + str(lag) for lag in range(1, n)]
    for lag_column in lag_columns:
        df[lag_column] = df[lag_column] / df[col]
    return df



def max_decline_acc(r_path):
    """Relative drawdown from the running peak at every step of ``r_path``.

    Parameters
    ----------
    r_path : sequence of numbers
        Value path, indexed positionally with ``r_path[i]``; must be non-empty.

    Returns
    -------
    numpy.ndarray
        One row ``[drawdown, peak_position]`` per processed step. ``drawdown``
        is 0 at a new peak and ``(value - peak) / peak`` otherwise;
        ``peak_position`` is the index of the running peak. Values that compare
        neither greater than nor less-or-equal to the peak (e.g. NaN) produce
        no row.
    """
    peak_value, peak_pos = r_path[0], 0
    rows = [[0, 0]]
    for pos in range(1, len(r_path)):
        value = r_path[pos]
        if value > peak_value:
            # New high: drawdown resets and the peak moves here.
            peak_value, peak_pos = value, pos
            rows.append([0, pos])
        elif value <= peak_value:
            rows.append([(value - peak_value) / peak_value, peak_pos])
    return np.array(rows)


def max_decline_add(r_path):
    """Absolute drawdown from the running peak at every step of ``r_path``.

    Parameters
    ----------
    r_path : sequence of numbers
        Value path, indexed positionally with ``r_path[i]``; must be non-empty.

    Returns
    -------
    numpy.ndarray
        One row ``[drawdown, peak_position]`` per processed step. ``drawdown``
        is 0 at a new peak and ``value - peak`` otherwise; ``peak_position`` is
        the index of the running peak. Values that compare neither greater than
        nor less-or-equal to the peak (e.g. NaN) produce no row.
    """
    peak_value, peak_pos = r_path[0], 0
    rows = [[0, 0]]
    for pos in range(1, len(r_path)):
        value = r_path[pos]
        if value > peak_value:
            # New high: drawdown resets and the peak moves here.
            peak_value, peak_pos = value, pos
            rows.append([0, pos])
        elif value <= peak_value:
            rows.append([value - peak_value, peak_pos])
    return np.array(rows)


def rank2norm(df, col1, col2):
    """Map ``col2`` to normal scores via its within-group rank.

    Within each ``col1`` group the rank of ``col2`` is divided by
    ``count + 1`` and the resulting fraction is passed through
    ``InverseNormalCDF``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. The helper columns ``"rank"``, ``"cont"`` and ``"norm"``
        are written to it (overwriting any existing columns of those names).
    col1 : str
        Grouping column.
    col2 : str
        Column to transform.

    Returns
    -------
    pandas.Series
        The ``"norm"`` column of ``df``.
    """
    df["rank"] = df.groupby([col1])[col2].transform(pd.Series.rank)
    # count + 1 keeps rank / cont strictly below 1.
    df["cont"] = df.groupby([col1])[col2].transform(pd.Series.count) + 1
    df["norm"] = df["rank"] / df["cont"]
    df["norm"] = df["norm"].apply(InverseNormalCDF)
    return df["norm"]



def InverseNormalCDF(quantile):
    """Approximate the inverse of the standard normal CDF.

    Three regimes are used:

    * ``0.5 <= quantile <= 0.92``: a rational function in
      ``(quantile - 0.5) ** 2``.
    * ``0.92 < quantile < 1``: a polynomial in ``log(-log(1 - quantile))``.
    * ``0 < quantile < 0.5``: the negated value at ``1 - quantile``.

    NOTE(review): the coefficients look like those of the
    Beasley-Springer-Moro approximation -- confirm against a reference.

    Parameters
    ----------
    quantile : float
        Probability level.

    Returns
    -------
    float
        The approximate normal quantile, or ``nan`` when ``quantile`` is not
        strictly between 0 and 1 (including when it is NaN).
    """
    # Written as a negated conjunction so that NaN input is rejected too.
    if not (quantile < 1 and quantile > 0):
        return np.nan

    a = [2.50662823884, -18.61500062529, 41.39119773534, -25.44106049637]
    b = [-8.47351093090, 23.08336743743, -21.06224101826, 3.13082909833]
    c = [0.3374754822726147, 0.9761690190917186, 0.1607979714918209,
         0.0276438810333863, 0.0038405729373609, 0.0003951896511919,
         0.0000321767881768, 0.0000002888167364, 0.0000003960315187]

    if 0.5 <= quantile <= 0.92:
        y = quantile - 0.5
        r = y * y
        num = 0
        denom = 0
        # Horner evaluation, highest-order coefficient first.
        for a_k, b_k in zip(reversed(a), reversed(b)):
            num = num * r + a_k
            denom = denom * r + b_k
        return (y * num / (denom * r + 1))

    if 0.92 < quantile < 1:
        r = np.log(-np.log(1 - quantile))
        num = 0
        for c_k in reversed(c):
            num = num * r + c_k
        return num

    # Lower half: use the symmetry of the normal distribution.
    return -1.0 * InverseNormalCDF(1 - quantile)


In [4]:
def time_series_desribe(x, shiftn, col):
    """Standard deviation across the lagged entries ``col1 .. col(shiftn-1)`` of ``x``.

    Parameters
    ----------
    x : mapping-like
        Object indexable by the keys ``col + str(k)`` (for example a DataFrame
        row or a dict).
    shiftn : int
        Exclusive upper bound on the lag.
    col : str
        Prefix of the lagged keys.

    Returns
    -------
    The result of ``np.std`` over the collected entries.
    """
    lagged_values = [x[col + str(lag)] for lag in range(1, shiftn)]
    return np.std(lagged_values)
    

In [5]:
def plot_conditional_expectation(x, y, quantile_n = 20):
    """Plot the mean of ``y`` against the mean of ``x`` within quantile bins of ``x``.

    Parameters
    ----------
    x, y : array-like
        Paired observations with the same number of elements. Anything
        ``np.asarray`` accepts (ndarray, Series, list); multi-dimensional
        input is flattened.
    quantile_n : int, optional
        Number of equal-probability bins requested for ``x`` (default 20).
        Fewer bins result when several percentile edges coincide.

    Returns
    -------
    pandas.DataFrame
        One row per bin with the columns ``bin_cut``, ``x`` and ``y``
        (the bin interval and the bin means).
    """
    x_flat = np.asarray(x).ravel()
    y_flat = np.asarray(y).ravel()

    # linspace yields exactly quantile_n + 1 percentile levels ending at 100.
    # np.arange with a fractional step may emit an extra level just below the
    # end point, which would create a spurious sliver bin.
    levels = np.linspace(0, 100, int(quantile_n) + 1)
    bin_cut = np.unique(np.percentile(x_flat, levels))

    tmp = pd.DataFrame({'x': x_flat, 'y': y_flat})
    tmp['bin_cut'] = pd.cut(tmp['x'], bin_cut, include_lowest=True)
    # observed=False is stated explicitly so that every bin keeps a row,
    # independent of the pandas default for categorical groupers.
    groupby_mean = tmp.groupby('bin_cut', observed=False).mean().reset_index()

    plt.figure(figsize=(12,4))
    plt.plot(groupby_mean['x'], groupby_mean['y'], '-o')
    # Dash-dot reference lines through the origin.
    plt.axvline(x=0, color='r', ls='-.')
    plt.axhline(y=0, color='r', ls='-.')
    plt.show()

    return groupby_mean