# In [1]:
import numpy as np
import pandas as pd
import datetime
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold, TimeSeriesSplit
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb
import joblib


### Feature scaling switch: 1 = standardize the features, 0 = use them as-is
USE_STANDARDIZATION = 0
### Cross-validation scheme: 1 = KFold, 2 = chronological split
CV_TYPE = 2
### Number of folds
FOLD_NUM = 5

### Training set file
dataset = dataset_s = pd.read_csv('./test/RB99_1m_Turnover_31000_12120_591.csv_tz80_Train_10877_p.csv')

### Number of training rows; used further down as the offset of the test
### rows inside the prediction vector
num_xunlian = dataset_s.shape[0]
### Number of test rows
data_1_size = 1213
### Number of months covered by the test
m_size = 25
### Direction flags: buy = long, sell = short
buy, sell = 1, 0
### Coefficient applied to the cumulative volume
rrr = 0.25
### Total capital
m = 1000


### Per-dimension result rows, filled by the training loop.
### res1..res7 collect the back-test metrics, resP/resR/resF the
### cross-validation metrics. Every name gets its own, independent list.
res1, res2, res3, res4, res5, res6, res7 = ([] for _ in range(7))
resP, resR, resF = ([] for _ in range(3))


### Cross-validation splitter:
###   CV_TYPE == 1 -> shuffled KFold with a fixed seed
###   otherwise    -> TimeSeriesSplit (chronological order)
cv = (
    KFold(n_splits=FOLD_NUM, shuffle=True, random_state=369)
    if CV_TYPE == 1
    else TimeSeriesSplit(n_splits=FOLD_NUM)
)

### ---------------------------------------------------------------------------
### PCA-dimension sweep.
### For every number of PCA components j in 1..20 the loop
###   1. tunes an XGBoost regressor on the PCA-reduced training features,
###   2. records cross-validated MSE / RMSE / R2,
###   3. persists the fitted pipeline and predicts the test rows,
###   4. turns the predictions into signals and back-tests them,
###   5. saves charts and appends the summary numbers to res1..res7 and
###      resP / resR / resF.
### Everything that does not depend on j is prepared once, ahead of the loop.
### ---------------------------------------------------------------------------

### Training features / target
X = dataset.drop('A0', axis=1)
y = dataset['A0']

### Optional standardization
if USE_STANDARDIZATION == 1:
    scaler = StandardScaler()
    X_processed = scaler.fit_transform(X)  ### standardized features
else:
    scaler = None  ### nothing to apply or persist
    X_processed = X.copy()  ### raw features

### Hyper-parameter grid searched for every PCA dimension
param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 5]
}

### Test set. drop(..., errors='ignore') always returns a new frame, so the
### 'prediction_label' column written to `data` inside the loop can never end
### up among the model inputs.
data = pd.read_csv('./test/RB99_1m_Turnover_31000_12120_591.csv_tz80_Test_1213_PCA.csv')
X_test = data.drop(columns='A0', errors='ignore')

### Frame the signals are attached to. Its untouched 'low' column is also the
### reference the generated signal is compared with ('show_lab').
show_df = pd.read_csv('./temp/Show.csv')

### Tick positions shared by all charts
tick_positions = range(0, data_1_size, int(data_1_size / m_size))

for j in range(1, 21):   ### j = number of PCA components
    num = j   ### alias kept in case later code still refers to `num`
    out_prefix = './temp/' + str(j)

    ### PCA reduction
    pca = PCA(n_components=j, random_state=369)
    X_pca = pca.fit_transform(X_processed)

    ### Grid search with the configured splitter. With the default refit=True,
    ### best_estimator_ is refitted on the complete training set.
    grid_search = GridSearchCV(
        xgb.XGBRegressor(random_state=369),
        param_grid,
        cv=cv,
        scoring='neg_root_mean_squared_error',
        n_jobs=-1,
    )
    grid_search.fit(X_pca, y)
    best_model = grid_search.best_estimator_

    ### Cross-validated metrics. cross_val_score accepts any splitter (KFold
    ### and TimeSeriesSplit alike) and fits clones of the estimator, so
    ### best_model keeps the full-data fit. Refitting best_model itself fold by
    ### fold would leave the persisted model trained on the last fold's
    ### training subset only.
    cv_mse = -cross_val_score(best_model, X_pca, y, cv=cv, scoring='neg_mean_squared_error')
    cv_r2 = cross_val_score(best_model, X_pca, y, cv=cv, scoring='r2')
    cv_rmse = np.sqrt(cv_mse)

    mse_mean = cv_mse.mean()
    rmse_mean = cv_rmse.mean()
    r2_mean = cv_r2.mean()

    metrics_df = pd.DataFrame({
        'Metric': ['MSE', 'RMSE', 'R2'],
        'Mean': [mse_mean, rmse_mean, r2_mean],
        'Std': [cv_mse.std(), cv_rmse.std(), cv_r2.std()]
    })
    metrics_df.to_csv(out_prefix + 'r.csv', index=False)

    ### Persist everything needed to reproduce the inference
    pipeline = {
        'use_standardization': USE_STANDARDIZATION,  ### whether scaling was applied
        'scaler': scaler,  ### None when scaling is switched off
        'pca': pca,
        'model': best_model,
        'cv_type': CV_TYPE,  ### cross-validation scheme used
        'fold_num': FOLD_NUM  ### number of folds used
    }
    joblib.dump(pipeline, out_prefix + 'x.pkl')

    ### Inference runs through the reloaded artefact, which exercises the
    ### saved file exactly the way a later consumer would.
    pipeline = joblib.load(out_prefix + 'x.pkl')
    if pipeline['use_standardization'] == 1:
        X_test_processed = pipeline['scaler'].transform(X_test)
    else:
        X_test_processed = X_test
    predictions = pipeline['model'].predict(pipeline['pca'].transform(X_test_processed))

    ### Keep the data_1_size predictions that start at row num_xunlian
    data['prediction_label'] = predictions
    n_preds = data['prediction_label'].iloc[num_xunlian:(num_xunlian + data_1_size)]
    n_preds = n_preds.reset_index(drop=True)
    if len(n_preds) != data_1_size:
        raise ValueError(
            'expected ' + str(data_1_size) + ' test predictions starting at row '
            + str(num_xunlian) + ', got ' + str(len(n_preds))
        )

    ### Save the predictions. The file is overwritten ('w'): in append mode a
    ### re-run would leave old predictions at the top of the file, and those
    ### are the rows that get aligned with the signal frame below.
    with open(out_prefix + 'x.txt', 'w') as Note:
        Note.writelines(str(value) + '\n' for value in n_preds)

    ### ------------------------------------------------------------------
    ### Signals. 'low' is reused as the signal column:
    ###   1 = prediction > 0, 0 = prediction < 0, 2 = filtered out (no position)
    df = show_df.copy()
    df2 = pd.read_csv(out_prefix + 'x.txt', header=None, names=['state_x'])
    df['state_x'] = df2['state_x']
    state = df['state_x']

    df.loc[state > 0, 'low'] = 1
    df.loc[state < 0, 'low'] = 0

    ### Signal filter (optional): predictions within [-ft, ft] are treated as
    ### "no position". Remove this section to trade every non-zero prediction.
    ft = 0.6
    df.loc[state > ft, 'low'] = 1
    df.loc[(0 < state) & (state <= ft), 'low'] = 2
    df.loc[state < -ft, 'low'] = 0
    df.loc[(0 > state) & (state >= -ft), 'low'] = 2

    df.to_csv(out_prefix + 'x.csv', index=False)
    data_1_new = pd.read_csv(out_prefix + 'x.csv')

    ### Sign the per-row 'volume' by the signal, for the first data_1_size rows.
    ### buy == 0 inverts the signal (label 1 is the one that flips the sign).
    ### Filtered rows (2) hold no position, so they are zeroed in both modes.
    in_test = data_1_new.index < data_1_size
    flip_label = 1 if buy == 0 else 0
    flip_rows = in_test & (data_1_new['low'] == flip_label)
    flat_rows = in_test & (data_1_new['low'] == 2)
    data_1_new.loc[flip_rows, 'volume'] = -data_1_new.loc[flip_rows, 'volume']
    data_1_new.loc[flat_rows, 'volume'] = 0

    ### 'high' = cumulative signed volume, 'open' = that sum scaled by rrr / m
    data_1_new['high'] = data_1_new['volume'].cumsum()
    data_1_new['open'] = rrr * data_1_new['high'] / m

    ### ------------------------------------------------------------------
    ### Win rate
    wp_win = data_1_new['volume'] > 0
    wp_lost = data_1_new['volume'] < 0
    wp_nothing = data_1_new['volume'] == 0

    wp_win_a = wp_win.sum()
    wp_lost_a = wp_lost.sum()
    wp_nothing_a = wp_nothing.sum()

    ### Profit / loss totals
    rrr_win = data_1_new.loc[wp_win, 'volume'].sum()
    rrr_lost = data_1_new.loc[wp_lost, 'volume'].sum()

    ### Drawdown curve (<= 0) and its area
    data_1_new['cum_max_open'] = data_1_new['open'].cummax()
    data_1_new['down'] = (data_1_new['open'] - data_1_new['cum_max_open']).clip(upper=0)
    downarea = data_1_new['down'].sum()

    ### ------------------------------------------------------------------
    ### Reserved for a second-level model: realised direction and hit labels
    data_1_new['re'] = data_1_new['close'].pct_change() * 100
    data_1_new['real'] = (data_1_new['close'] >= data_1_new['close'].shift(1)).astype(int)
    data_1_new.loc[0, 'real'] = 0

    signal_matches_real = data_1_new['low'] == data_1_new['real']
    if buy == 0:
        signal_matches_real = ~signal_matches_real  ### inverted mode: a mismatch is the good case
    data_1_new['real_lab'] = np.where(signal_matches_real, 'G', 'N')
    data_1_new.loc[0, 'real_lab'] = 'G'

    data_1_new['show'] = show_df['low']
    data_1_new['show_lab'] = np.where(
        data_1_new['low'] == data_1_new['show'],
        'G', 'N'
    )
    data_1_new.loc[0, 'show_lab'] = 'G'

    ### Signed per-row return: negated for the label that is traded short,
    ### and 0 for filtered rows, which hold no position and earn nothing.
    short_label = 0 if sell == 0 else 1
    signed_re = np.where(
        data_1_new['low'] == short_label,
        -data_1_new['re'],
        data_1_new['re']
    )
    data_1_new['re_real'] = np.where(data_1_new['low'] == 2, 0.0, signed_re)
    data_1_new.loc[0, 're_real'] = 0

    ### Sharpe and Sortino ratios (the first row has no return)
    re_real = data_1_new['re_real'].iloc[1:]
    mean_re = re_real.mean()
    std_re = re_real.std()
    sharpe = round(mean_re / std_re * 100 if std_re != 0 else 0, 4)

    neg_re = re_real[re_real < 0]
    std_neg_re = neg_re.std() if not neg_re.empty else 0
    sortino = round(mean_re / std_neg_re * 100 if std_neg_re != 0 else 0, 4)

    data_1_new.to_csv(out_prefix + 'x.csv', index=False)

    ### ------------------------------------------------------------------
    ### Maximum drawdown: the trough is where the gap to the running maximum
    ### is largest, the peak is the highest point before that trough.
    drawdown = data_1_new['cum_max_open'] - data_1_new['open']
    trough = np.argmax(drawdown)
    peak = 0 if trough == 0 else np.argmax(data_1_new['open'].iloc[:trough])

    maxdrawdown = data_1_new['open'].iloc[peak] - data_1_new['open'].iloc[trough]
    drawdown_days = trough - peak  ### number of rows between peak and trough

    ### Headline numbers, computed once and reused for the chart and the tables
    final_open = data_1_new['open'].iloc[-1]
    yearly_pct = final_open / m_size * 100 * 12
    traded = wp_win_a + wp_lost_a
    max_all = round(final_open * 100, 2)
    max_CalmarY = round(yearly_pct / (maxdrawdown * 100), 2)
    max_wp = round(wp_win_a / traded * 100, 2)
    max_rrr = round(rrr_win / (rrr_win + abs(rrr_lost)) * 100, 2)

    peak_eob = data_1_new['eob'].iloc[peak]
    trough_eob = data_1_new['eob'].iloc[trough]

    summary_label = (
        f"All:{max_all}%"
        f"   {m_size}m"
        f"   Year:{round(yearly_pct, 2)}%"
        f"   CalmarY:{max_CalmarY}"
        f"   WP:{max_wp}%"
        f"   RRR:{max_rrr}% / {round(rrr_win / abs(rrr_lost), 2)}"
        f"   T/N:{traded} / {wp_nothing_a}"
        f"   Sharpe:{sharpe}"
        f"   Sortino:{sortino}"
        f"   MSE:{round(mse_mean, 4)}"
        f"   RMSE:{round(rmse_mean, 4)}"
        f"   R2:{round(r2_mean, 4)}"
    )
    drawdown_label = (
        f"MD:{round(maxdrawdown * 100, 2)}%"
        f"   DA:{round(downarea, 4)}%"
        f"   MDT:{drawdown_days}"
        f"   Date:{peak_eob} - {trough_eob}"
    )

    ### Chart 1: scaled cumulative curve, max-drawdown segment, drawdown area
    fig = plt.figure(figsize=(20, 11))
    plt.plot(data_1_new['eob'], data_1_new['open'])
    plt.plot(
        [data_1_new.index[trough], data_1_new.index[peak]],
        [data_1_new['open'].iloc[trough], data_1_new['open'].iloc[peak]],
        linestyle='--', color='r'
    )
    plt.xticks(tick_positions)
    ### The two labels go to the two lines drawn so far (curve, drawdown segment)
    plt.legend([summary_label, drawdown_label], loc='upper left', fontsize=11)

    plt.plot(data_1_new['eob'], data_1_new['down'], color='#ec700a')
    plt.fill_between(data_1_new['eob'], data_1_new['down'], 0,
                     where=(data_1_new['down'] < 0), facecolor='#FF0000', alpha=0.1)
    plt.xticks(tick_positions)

    fig.autofmt_xdate()
    plt.grid(True)
    plt.savefig("./temp/" + str(j) + "sy.jpg")
    plt.close()

    ### Chart 2: raw cumulative signed volume
    fig = plt.figure(figsize=(20, 10))
    plt.plot(data_1_new['eob'], data_1_new['high'])
    plt.xticks(tick_positions)
    fig.autofmt_xdate()
    plt.grid(True)
    plt.savefig("./temp/" + str(j) + "p.jpg")
    plt.close()

    ### ------------------------------------------------------------------
    ### Cross-validation metrics per dimension.
    ### NOTE: 'min_MAE' actually carries the mean MSE; the key is left as it is
    ### because the ranking code after the loop sorts by that name.
    resP.append({'MSE_no': j, 'min_MAE': mse_mean})
    resR.append({'RMSE_no': j, 'min_RMSE': rmse_mean})
    resF.append({'R2_no': j, 'max_R2': r2_mean})

    ### Back-test metrics per dimension
    max_no = j
    res1.append({'All_no': max_no, 'max_All': max_all})
    res2.append({'CalmarY_no': max_no, 'max_CalmarY': max_CalmarY})
    res3.append({'Downarea_no': max_no, 'min_Downarea': downarea})
    res4.append({'WP_no': max_no, 'max_WP': max_wp})
    res5.append({'RRR_no': max_no, 'max_RRR': max_rrr})
    res6.append({'Sharpe_no': max_no, 'max_Sharpe': sharpe})
    res7.append({'Sortino_no': max_no, 'max_Sortino': sortino})

   ##############################################################################################

### Cross-validation leaderboard. Each metric is ranked on its own (best
### first) and the rankings are placed side by side, so one output row can
### refer to a different dimension in every column pair.
aaaP, aaaR, aaaF = (pd.DataFrame(rows) for rows in (resP, resR, resF))

bbbP = aaaP.sort_values(by="min_MAE", ascending=True).reset_index(drop=True)     ### smallest first
bbbR = aaaR.sort_values(by="min_RMSE", ascending=True).reset_index(drop=True)    ### smallest first
bbbF = aaaF.sort_values(by="max_R2", ascending=False).reset_index(drop=True)     ### largest first

### Same column order as adding the RMSE and R2 columns to bbbP one by one
bbbP = pd.concat([bbbP, bbbR, bbbF], axis=1)

bbbP.to_csv("./temp/Best_2.csv",index = False)

   ##############################################################################################

### Back-test leaderboard. Every metric is sorted from largest to smallest
### independently; the seven rankings are then laid out side by side.
aaa1, aaa2, aaa3, aaa4, aaa5, aaa6, aaa7 = (
    pd.DataFrame(rows) for rows in (res1, res2, res3, res4, res5, res6, res7)
)

bbb1, bbb2, bbb3, bbb4, bbb5, bbb6, bbb7 = (
    frame.sort_values(by=column, ascending=False).reset_index(drop=True)
    for frame, column in (
        (aaa1, "max_All"),
        (aaa2, "max_CalmarY"),
        (aaa3, "min_Downarea"),
        (aaa4, "max_WP"),
        (aaa5, "max_RRR"),
        (aaa6, "max_Sharpe"),
        (aaa7, "max_Sortino"),
    )
)

### Same column order as attaching each ranking's two columns to bbb1 in turn
bbb1 = pd.concat([bbb1, bbb2, bbb3, bbb4, bbb5, bbb6, bbb7], axis=1)

bbb1.to_csv("./temp/Best_1.csv",index = False)