## Basic data analysis 

##### Loading the file

In [None]:
import logging
import os

import numpy as np
import pandas as pd

# Directory that holds the input file (the current working directory).
INPUT_DIR = os.getcwd()
file_name = "data.csv"

# os.path.join inserts the path separator that plain string concatenation
# leaves out (os.getcwd() has no trailing separator).
load_csv = os.path.join(INPUT_DIR, file_name)
# The first column becomes the index, is parsed as day-first dates, and the
# rows are sorted in ascending index order.
df = pd.read_csv(load_csv, index_col=0, parse_dates=True, dayfirst=True).sort_index(ascending=True)

##### General analysis / Visualization

In [None]:
#print head of data, to show an example
df.head()
#print feature data type
df.dtypes
#print data sample number
print("We have {:,} market samples in the training dataset.".format(df.shape[0]))
#print price value (the f prefix is required for the placeholder to be evaluated)
print(f"Average Standard deviation of price change within a day in {df['price'].mean():.4f}.")
#check nan value
df.isna().sum()
#check unique number, e.g. 'time' is one of the features
df['time'].nunique()
#check feature value equal to unknown
df[df['time'] == 'unknown'].size
#get detailed information about a feature
df['time'].describe()
#filter data through a specific value condition
outliers = df[(df['time']>1)|(df['time']<-1)]
#groupby. Let the 'time' feature be the key; the other features in the table become values
df.groupby('time')
#limit time period
new_df = df.loc[df['time']>='2010-01-01 22:00:00+0000']
#value_counts, pick top 10.
df['feature'].value_counts().head(10)
#plot a horizontal bar plot
ax = df.plot.barh(x='lab', y='val')
#the plot kind has to be passed by keyword for a Series
df['time'].plot(kind='barh')
#plot a (vertical) bar plot
df['time'].plot(kind='bar')

##### Feature analysis (category part)

In [None]:
#onehot encode with scikit-learn
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
data = ['warm', 'cold', 'hot']
values = np.array(data)
#integer encode: each distinct label is mapped to an integer 0,1,2,3,...
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(values)

#binary encode
#The encoder's default sparse result is densified with .toarray(); this avoids
#the `sparse=` constructor keyword, which newer scikit-learn releases reject.
onehot_encoder = OneHotEncoder()
#the encoder expects a 2-D column, one row per sample
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
onehot_encoded = onehot_encoder.fit_transform(integer_encoded).toarray()

#invert oneHot to label
#here, [0,:] means the first example
inverted = label_encoder.inverse_transform([np.argmax(onehot_encoded[0,:])])

##### Feature analysis (text part)

##### Feature analysis (Time series part)

In [None]:
#log return
def log_returns(x, lag=1):
    """Return the difference between log(x) and log(x shifted by ``lag``).

    ``x`` must provide ``.shift`` (e.g. a pandas Series); positions without a
    shifted counterpart come out as NaN.
    """
    current_log = np.log(x)
    lagged_log = np.log(x.shift(lag))
    return current_log - lagged_log

In [None]:
#date process
def to_date(date_column):
    """Format every value of ``date_column`` as a 'dd-mm-YYYY' string.

    Args:
        date_column: object with ``.apply`` (e.g. a pandas Series) whose
            values ``pd.to_datetime`` can parse.

    Returns:
        The result of ``apply``: one formatted string per input value.
    """
    # The body previously referenced an undefined name (`data_column`).
    return date_column.apply(lambda x: pd.to_datetime(x).strftime('%d-%m-%Y'))

In [None]:
#time frequency
#resample and interpolate
#two types of resampling: upsampling and downsampling.




In [None]:
#ETL extract, clean, transform, load
#extract, combine available feature into one table
def extract():
    """Combine the available feature series into one table and return it.

    Reads ``data['lme_stocks']`` and the names ``lme_prices``, ``ted_``,
    ``ted_spread_``, ``bdi_``, ``vix_``, ``gsci_`` and ``shfe_price`` from the
    enclosing (module) scope, concatenates them column-wise and labels the
    columns.

    Returns:
        The combined DataFrame with the eight named columns.
    """
    lme_stocks = data['lme_stocks']
    frames = [lme_stocks, lme_prices, ted_, ted_spread_, bdi_, vix_, gsci_, shfe_price]
    df = pd.concat(frames, axis=1)
    df.columns = ['lme_stocks', 'lme_prices', 'ted', 'ted_spread', 'bdi', 'vix', 'gsci', 'shfe_price']
    # Without a return the table built above is lost when the function exits.
    return df
    
def clean():
    """Clean the module-level ``df`` in place and return it.

    Rows whose ``'shfe_price'`` is missing are dropped. If any missing value
    remains afterwards, +/-inf is turned into NaN and gaps are filled with the
    last preceding value in the same column.

    Every step mutates ``df`` in place. Rebinding ``df`` inside the function
    would make the name local and break the first access to it.

    Returns:
        The cleaned module-level DataFrame.
    """
    logging.warning('Starting to clean data')
    # Remove rows where the label is null.
    df.dropna(axis=0, how='any', subset=['shfe_price'], inplace=True)

    # Alternative (not used): fill NaN with the column means instead, e.g.
    #     df.fillna(value=df.mean(), inplace=True)
    if df.isnull().values.any():
        # Treat infinities as missing, then carry the last value forward.
        df.replace([np.inf, -np.inf], np.nan, inplace=True)
        df.ffill(inplace=True)
    return df
    
def transform():
    """Add a log-return column to the module-level ``df`` and trim it.

    The frame is cut to the span between the first and the last index label
    at which every column is non-null. The trimmed frame is rebound to the
    module-level ``df`` and also returned.

    Returns:
        The transformed DataFrame.
    """
    # `df` is rebound below, so it must be declared global; otherwise the
    # name is local to the function and the first read of it fails.
    global df
    logging.warning('Performing data transformation')
    # Make the new column (log return or percentage change).
    # NOTE(review): this reads 'log_return_price' as its own input, which only
    # works if that column already holds prices. The source was presumably
    # meant to be a price column such as 'shfe_price' -- confirm.
    df['log_return_price'] = log_returns(df['log_return_price'])

    # Find first and last dates for which all features are available.
    complete_index = df.loc[df.notnull().all(axis=1)].index
    start_date = complete_index.min()
    end_date = complete_index.max()
    df = df.loc[start_date:end_date, :]

    # Optional, maybe not needed: normalise every column to zero mean and
    # unit variance, e.g. with sklearn.preprocessing.scale.
    return df

In [None]:
#check mean, std, and plot to show the result.
#draw several figures on one 2x2 grid
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(10,10))
#(column to draw, panel title), listed in row-major panel order
panels = [
    ('shfe_price', 'Cu_Shfe_price'),
    ('lme_stocks', 'Cu_lme_stocks'),
    ('lme_prices', 'Cu_lme_prices'),
    ('ted', 'ted'),
]
for panel_axis, (column_name, panel_title) in zip(axes.flat, panels):
    panel_axis.plot(cu_log_return.index, cu_log_return[[column_name]])
    panel_axis.set_title(panel_title)
plt.show()

In [None]:
#plot autocorrelation and partial autocorrelation to show stationary.
import statsmodels.api as sm  
def plot_acf_pacf(input_data):
    """Show the ACF (top) and PACF (bottom) of ``input_data`` in one figure.

    Both plots use 40 lags and alpha=0.05. A fixed caption is printed before
    the figure is shown.
    """
    canvas = plt.figure(figsize=(12, 8))
    upper_axis = canvas.add_subplot(211)
    # The plotting helpers hand back a figure, which is used for the next step.
    canvas = sm.graphics.tsa.plot_acf(input_data, lags=40, alpha=.05, ax=upper_axis)
    lower_axis = canvas.add_subplot(212)
    canvas = sm.graphics.tsa.plot_pacf(input_data, lags=40, alpha=.05, ax=lower_axis)
    print("ACF and PACF of input_data")
    plt.show()
    
#or use Dickey-Fuller test to check stationary.
from pandas import tseries
from statsmodels.tsa.stattools import adfuller
def adf_test(input_data, feature_name):
    """Run the Augmented Dickey-Fuller test on one column and report it.

    Args:
        input_data: table indexable by a list of column names (e.g. a pandas
            DataFrame).
        feature_name: name of the column to test.

    Returns:
        A pandas Series labelling the first four entries of the ``adfuller``
        result plus one 'Critical Value (<level>)' entry per critical value.
    """
    # perform Augmented Dickey Fuller test
    print('Results of Augmented Dickey-Fuller test:', feature_name)
    test_data = input_data[[feature_name]]
    # First (only) selected column as a flat array.
    y = test_data.values[:, 0]
    dftest = adfuller(y, autolag='AIC')
    dfoutput = pd.Series(dftest[0:4], index=['test statistic', 'p-value', '# of lags', '# of observations'])
    for key, value in dftest[4].items():
        dfoutput['Critical Value ({})'.format(key)] = value
    print(dftest)
    # Hand the labelled summary back instead of discarding it.
    return dfoutput
adf_test(al_log_return, 'shfe_price')

##### Feature selection 

In [None]:
#feature cross correlation
#method 1: numpy and pandas
corr = df.corr()
corr.style.background_gradient()
#also: same coloured table, values shown with 2 decimals.
#background_gradient must be called before chaining, and Styler.format(precision=...)
#replaces set_precision, which newer pandas no longer has.
corr.style.background_gradient().format(precision=2)

#method 2:
import matplotlib.pyplot as plt
#matplotlib.style.use('ggplot')
#the colour-map keyword is `cmap`
plt.imshow(X.corr(), cmap=plt.cm.Reds, interpolation='nearest')
plt.colorbar()
tick_marks = list(range(len(X.columns)))
plt.xticks(tick_marks, X.columns, rotation='vertical')
plt.yticks(tick_marks, X.columns)
plt.show()

##### Unbalanced data set solution

In [None]:
#set weights?

## Modeling 

##### XGBoost

In [3]:
from xgboost import XGBClassifier
from sklearn import model_selection
from sklearn.metrics import accuracy_score

# Hold out 10% of the samples for testing; the seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(X, Y, test_size=0.1, random_state=99)

xgb = XGBClassifier(n_jobs=4, n_estimators=300, max_depth=6, eta=0.15)
xgb.fit(X_train, Y_train)
# accuracy_score takes (y_true, y_pred); the closing parenthesis of print()
# was missing, which made the cell a syntax error.
print("Accuracy Score: ", accuracy_score(Y_test, xgb.predict(X_test)))

##### Gaussian Process Regression

In [None]:
def gaussian_process(X, Y, x_pred):
    """Fit a Gaussian-process regressor on (X, Y) and predict for ``x_pred``.

    Args:
        X: training features; must expose ``.values`` (e.g. a DataFrame).
        Y: training targets; must expose ``.values``.
        x_pred: rows to predict for; must expose ``.values``.

    Returns:
        The predicted mean for the first row of ``x_pred`` only.
    """
    # Imported locally: no visible cell brings these names into scope.
    from sklearn.gaussian_process import GaussianProcessRegressor
    from sklearn.gaussian_process.kernels import DotProduct

    X = X.values
    Y = Y.values
    x_pred = x_pred.values

    kernels = 1.0 * DotProduct(sigma_0=1.0, sigma_0_bounds=(1e-05, 100000.0))
    # Further kernels can be added to the sum above (import them first), e.g.:
    #     + 1.0 * RationalQuadratic(length_scale=1.0)
    #     + 1.0 * WhiteKernel(noise_level=1e-1)
    #     + 1.0 * Matern(length_scale=1.0, nu=1.5)
    #     + 1.0 * RBF(length_scale=1.0, length_scale_bounds=(1e-05, 100000.0))
    #     + 1.0 * ExpSineSquared(length_scale=1.0, periodicity=1.0, length_scale_bounds=(1e-05, 100000.0), periodicity_bounds=(1e-05, 100000.0))
    #     + 1.0 * ConstantKernel(constant_value=1.0, constant_value_bounds=(1e-05, 100000.0))

    gp = GaussianProcessRegressor(kernel=kernels, n_restarts_optimizer=10, alpha=0)

    # Fit to data using Maximum Likelihood Estimation of the parameters:
    # learns the hyperparameters and scale of each kernel.
    gp.fit(X, Y)

    # Only the mean is used, so the standard deviation is not requested.
    y_pred = gp.predict(x_pred)[0]

    return y_pred

## Evaluation Metric

##### Time series evaluation

In [None]:
import sys
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from numpy.linalg import norm
from dtw import dtw
from scipy.stats import spearmanr

def evaluate(model_name, org_price, pred_price):
    """Collect the error metrics of one model into a one-row DataFrame.

    Args:
        model_name: label used as the index of the returned row.
        org_price: observed values; must support ``len`` and ``.reshape``
            (e.g. a 1-D NumPy array).
        pred_price: predicted values, same requirements as ``org_price``.

    Returns:
        DataFrame indexed by ``model_name`` with the columns 'MSE', 'MAE',
        'RMSE', 'DTW', 'Spearmanr' and 'p-value'.
    """
    # Both inputs are turned into single-column 2-D arrays.
    org_price = org_price.reshape(len(org_price), 1)
    pred_price = pred_price.reshape(len(pred_price), 1)

    mse = compute_mse_error(org_price, pred_price)
    mae = compute_mean_absolute_error(org_price, pred_price)
    rmse = compute_root_mean_square_error(org_price, pred_price)
    _, dtw_cost, _, _ = compute_dtw(org_price, pred_price)
    corr_rank, p_value = compute_spearmanr(org_price, pred_price)

    # Built in one step: DataFrame.set_value no longer exists in pandas, and
    # the p-value is stored under the declared 'p-value' column name.
    columns = ['MSE', 'MAE', 'RMSE', 'DTW', 'Spearmanr', 'p-value']
    metrics = {
        'MSE': mse,
        'MAE': mae,
        'RMSE': rmse,
        'DTW': dtw_cost[-1][-1],
        'Spearmanr': corr_rank,
        'p-value': p_value,
    }
    return pd.DataFrame(metrics, index=[model_name], columns=columns)

def compute_daily_error(org_price, pred_price):
    """Return ``org_price`` minus ``pred_price``."""
    residual = org_price - pred_price
    return residual

#MSE
def compute_mse_error(org_price, pred_price):
    """Return ``mean_squared_error`` of the observed vs. predicted values."""
    mse = mean_squared_error(org_price, pred_price)
    return mse

#MAE
def compute_mean_absolute_error(org_price, pred_price):
    """Return ``mean_absolute_error`` of the observed vs. predicted values."""
    mae = mean_absolute_error(org_price, pred_price)
    return mae

#RMSE
def compute_root_mean_square_error(org_price, pred_price):
    """Return the square root of the mean squared difference of the inputs.

    The inputs must support subtraction, ``**`` and ``.mean()`` (e.g. NumPy
    arrays).
    """
    squared_error = (org_price - pred_price) ** 2
    mean_squared = squared_error.mean()
    return np.sqrt(mean_squared)

#smaller dtw Accumulated Distortion(cost[-1][-1]) means more similar time series
def compute_dtw(x, y):   #Dynamic time warping
    """Run ``dtw`` on ``x`` and ``y`` with an L1-norm point distance.

    Returns the four values produced by ``dtw`` as a tuple, in the order
    (dist, cost, acc, path).
    """
    def l1_distance(a, b):
        return norm(a - b, ord=1)

    distance, cost_matrix, accumulated, warp_path = dtw(x, y, dist=l1_distance)
    return distance, cost_matrix, accumulated, warp_path

def compute_spearmanr(org_price, pred_price):
    """Return the first two entries of ``spearmanr(org_price, pred_price)``.

    These are returned as a ``(correlation, p_value)`` pair.
    """
    outcome = spearmanr(org_price, pred_price)
    rank_correlation = outcome[0]
    p_value = outcome[1]
    return rank_correlation, p_value