### This is a simple LGB baseline. You can work for feature engineering.
### The seed is 42, which will bring good luck!


In [1]:
import os
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
#import lightgbm as lgb
import time
import datetime
from numba import jit
#from lightgbm import LGBMRegressor
from multiprocessing import Pool
from sklearn.model_selection import TimeSeriesSplit

import pickle
import gc

from tqdm import tqdm

n_fold = 10
group_gap = 31
seed = 42

#Chris' paths:
#TRAIN_MARKET_PATH = '/content/drive/MyDrive/hku_qis/hku-qids-2023-quantitative-investment-competition/first_round_train_market_data.csv'
#TRAIN_FUNADMENTAL_PATH = '/content/drive/MyDrive/hku_qis/hku-qids-2023-quantitative-investment-competition/first_round_train_fundamental_data.csv'
#TRAIN_RETURN_PATH = '/content/drive/MyDrive/hku_qis/hku-qids-2023-quantitative-investment-competition/first_round_train_return_data.csv'

#TEST_MARKET_PATH = '/content/drive/MyDrive/hku_qis/hku-qids-2023-quantitative-investment-competition/first_round_test_market_data.csv'
#TEST_FUNADMENTAL_PATH = '/content/drive/MyDrive/hku_qis/hku-qids-2023-quantitative-investment-competition/first_round_test_fundamental_data.csv'

#Freya's paths:
TRAIN_MARKET_PATH = '/Users/75717/Downloads/273_Washu/first_round_train_market_data.csv'
TRAIN_FUNADMENTAL_PATH = '/Users/75717/Downloads/273_Washu/first_round_train_fundamental_data.csv'
TRAIN_RETURN_PATH = '/Users/75717/Downloads/273_Washu/first_round_train_return_data.csv'

TEST_MARKET_PATH = '/Users/75717/Downloads/273_Washu/first_round_test_market_data.csv'
TEST_FUNADMENTAL_PATH = '/Users/75717/Downloads/273_Washu/first_round_test_fundamental_data.csv'

#Cynthia's paths:
#TRAIN_MARKET_PATH = '/content/drive/MyDrive/hku_qis/hku-qids-2023-quantitative-investment-competition/first_round_train_market_data.csv'
#TRAIN_FUNADMENTAL_PATH = '/content/drive/MyDrive/hku_qis/hku-qids-2023-quantitative-investment-competition/first_round_train_fundamental_data.csv'
#TRAIN_RETURN_PATH = '/content/drive/MyDrive/hku_qis/hku-qids-2023-quantitative-investment-competition/first_round_train_return_data.csv'

#TEST_MARKET_PATH = '/content/drive/MyDrive/hku_qis/hku-qids-2023-quantitative-investment-competition/first_round_test_market_data.csv'
#TEST_FUNADMENTAL_PATH = '/content/drive/MyDrive/hku_qis/hku-qids-2023-quantitative-investment-competition/first_round_test_fundamental_data.csv'


pd.set_option('display.max_rows', 6)
pd.set_option('display.max_columns', 350)

In [2]:
#read data
df_train_market = pd.read_csv(TRAIN_MARKET_PATH)
df_train_return = pd.read_csv(TRAIN_RETURN_PATH)
df_train_fundamental = pd.read_csv(TRAIN_FUNADMENTAL_PATH)

df_test_market = pd.read_csv(TEST_MARKET_PATH)
df_test_fundamental = pd.read_csv(TEST_FUNADMENTAL_PATH)


In [3]:
#merge train dataset and test dataset
def split_time(x):
    df1 = x['date_time'].str.split('d', expand=True)
    df1.columns=['code','s']
    code = df1['code']
    df1 = df1['s'].str.split('p', expand=True)
    df1.columns=['day','time_step']
    df2 = x['date_time'].str.rsplit('p', expand=True)
    df2.columns=['day_s','s']
    df1['day_s'] = df2['day_s']
    df1['code'] = code
    x = pd.concat([x,df1],axis=1)
    
    return x

df_train_market = split_time(df_train_market)
df = pd.merge(df_train_fundamental,df_train_market, left_on='date_time',right_on='day_s')  
df = pd.merge(df,df_train_return, left_on='day_s',right_on='date_time')  

df_test_market = split_time(df_test_market)
test = pd.merge(df_test_fundamental,df_test_market, left_on='date_time',right_on='day_s')  


In [4]:
#drop duplicates
df = df.drop_duplicates(subset='day_s', keep='last').reset_index(drop=True)
test = test.drop_duplicates(subset='day_s', keep='last').reset_index(drop=True)

In [5]:
def correlation(a, train_data):
    
    b = train_data.get_label()
    
    a = np.ravel(a)
    b = np.ravel(b)

    len_data = len(a)
    mean_a = np.sum(a) / len_data
    mean_b = np.sum(b) / len_data
    var_a = np.sum(np.square(a - mean_a)) / len_data
    var_b = np.sum(np.square(b - mean_b)) / len_data

    cov = np.sum((a * b))/len_data - mean_a*mean_b
    corr = cov / np.sqrt(var_a * var_b)

    return 'corr', corr, True

# For CV score calculation
def corr_score(pred, valid):
    len_data = len(pred)
    mean_pred = np.sum(pred) / len_data
    mean_valid = np.sum(valid) / len_data
    var_pred = np.sum(np.square(pred - mean_pred)) / len_data
    var_valid = np.sum(np.square(valid - mean_valid)) / len_data

    cov = np.sum((pred * valid))/len_data - mean_pred*mean_valid
    corr = cov / np.sqrt(var_pred * var_valid)

    return corr

# For CV score calculation
def wcorr_score(pred, valid, weight):
    len_data = len(pred)
    sum_w = np.sum(weight)
    mean_pred = np.sum(pred * weight) / sum_w
    mean_valid = np.sum(valid * weight) / sum_w
    var_pred = np.sum(weight * np.square(pred - mean_pred)) / sum_w
    var_valid = np.sum(weight * np.square(valid - mean_valid)) / sum_w

    cov = np.sum((pred * valid * weight)) / sum_w - mean_pred*mean_valid
    corr = cov / np.sqrt(var_pred * var_valid)

    return corr

In [6]:
df.columns

Index(['date_time_x', 'turnoverRatio', 'transactionAmount', 'pe_ttm', 'pe',
       'pb', 'ps', 'pcf', 'date_time_y', 'open', 'close', 'high', 'low',
       'volume', 'money', 'day', 'time_step', 'day_s', 'code', 'date_time',
       'return'],
      dtype='object')

In [7]:
test.columns

Index(['date_time_x', 'turnoverRatio', 'transactionAmount', 'pe_ttm', 'pe',
       'pb', 'ps', 'pcf', 'date_time_y', 'open', 'close', 'high', 'low',
       'volume', 'money', 'day', 'time_step', 'day_s', 'code'],
      dtype='object')

In [18]:
#for normalizing data
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(0, 1))
#because these time variables are not number, they cannot be scaled
col_train=[i for i in df.columns if i not in ['date_time_x', 'date_time_y', 'day', 'time_step', 'day_s', 'code', 'date_time','return']]
timer_train=df.loc[:,['date_time_x', 'date_time_y', 'day', 'time_step', 'day_s', 'code', 'date_time','return']]
#scale those x variables in training dataset
scaled_df=scaler.fit_transform(df[col_train])
scaled_df=pd.DataFrame(scaled_df,columns=['turnoverRatio', 'transactionAmount', 'pe_ttm', 'pe', 'pb', 'ps', 'pcf','open', 'close', 'high', 'low', 'volume', 'money'])
#add timer back to the df
new_df=pd.merge(scaled_df,timer_train,how='outer',left_index=True,right_index=True)

#same process for test dataset
col_test=[i for i in test.columns if i not in ['date_time_x', 'date_time_y', 'day', 'time_step', 'day_s', 'code', 'date_time']]
timer_test=test.loc[:,['date_time_x', 'date_time_y', 'day', 'time_step', 'day_s', 'code']]
scaled_test=scaler.fit_transform(test[col_test])
scaled_test=pd.DataFrame(scaled_test,columns=['turnoverRatio', 'transactionAmount', 'pe_ttm', 'pe', 'pb', 'ps', 'pcf','open', 'close', 'high', 'low', 'volume', 'money'])
new_test=pd.merge(scaled_test,timer_test,how='outer',left_index=True,right_index=True)

In [21]:
new_df

Unnamed: 0,turnoverRatio,transactionAmount,pe_ttm,pe,pb,ps,pcf,open,close,high,low,volume,money,date_time_x,date_time_y,day,time_step,day_s,code,date_time,return
0,0.093616,0.051792,0.025331,0.046411,0.163548,0.083384,0.948034,0.024360,0.024333,0.024332,0.024362,0.004363,0.009410,s0d1,s0d1p50,1,50,s0d1,s0,s0d1,-0.026877
1,0.063802,0.011141,0.025208,0.046202,0.168140,0.065664,0.951514,0.015426,0.015374,0.015532,0.015375,0.001816,0.002596,s1d1,s1d1p50,1,50,s1d1,s1,s1d1,-0.052674
2,0.032330,0.015439,0.025523,0.046811,0.158080,0.090780,0.951040,0.007680,0.007667,0.007667,0.007681,0.002155,0.001721,s2d1,s2d1p50,1,50,s2d1,s2,s2d1,-0.002691
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53889,0.028856,0.015569,0.024882,0.045675,0.019734,0.011614,0.951489,0.001900,0.001897,0.001897,0.001900,0.004846,0.001592,s51d998,s51d998p50,998,50,s51d998,s51,s51d998,-0.052286
53890,0.013962,0.022720,0.025208,0.046273,0.197897,0.081012,0.948013,0.029243,0.029194,0.029194,0.029245,0.001526,0.003895,s52d998,s52d998p50,998,50,s52d998,s52,s52d998,-0.015559
53891,0.032522,0.029102,0.024792,0.045511,0.043472,0.004854,0.952153,0.008670,0.008655,0.008655,0.008671,0.004737,0.004162,s53d998,s53d998p50,998,50,s53d998,s53,s53d998,-0.003662


In [9]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM

In [13]:
def train_and_evaluate(train,test):
    # Hyperparammeters (just basic)
    params = {
      'objective': 'rmse',  
      'boosting_type': 'gbdt',
      'n_jobs': -1,
      'verbose': -1
    }
    
    # Split features and target
    
    x = train[[i for i in df.columns if i not in ['date_time_x', 'date_time_y', 'day', 'time_step', 'day_s', 'code', 'date_time','return']]]
    y = train['return']
    
    x_test = test[[i for i in df.columns if i not in ['date_time_x', 'date_time_y', 'day', 'time_step', 'day_s', 'code', 'date_time','return']]]

    oof_predictions = np.zeros(x.shape[0])
    test_predictions = np.zeros(x_test.shape[0])
    scores = []

    # Create a KFold object
    gkf = TimeSeriesSplit(n_splits=n_fold,gap=group_gap)
    for fold, (trn_ind, val_ind) in enumerate(gkf.split(train['day'].values)):
    
        print(f'Training fold {fold + 1}')
        x_train, x_val = x.iloc[trn_ind], x.iloc[val_ind]
        y_train, y_val = y.iloc[trn_ind], y.iloc[val_ind]
        
        #这下面的用到lgb了
        train_dataset = lgb.Dataset(x_train, y_train)
        val_dataset = lgb.Dataset(x_val, y_val)
        model = lgb.train(params = params, 
                          train_set = train_dataset, 
                          valid_sets = [train_dataset, val_dataset], 
                          num_boost_round = 200, 
                          early_stopping_rounds = 20, 
                          verbose_eval = False,
                          feval = correlation)
        # Add predictions to the out of folds array
        
        oof_predictions[val_ind] = model.predict(x_val)
        
        rmspe_score = corr_score(y_val,oof_predictions[val_ind])
        print(f'Our out of folds corr_score is {rmspe_score}')
        scores.append(rmspe_score)
        test_predictions += model.predict(x_test) 
        
    rmspe_score = corr_score(y, oof_predictions)
    print(scores)
    print(f'Our out of folds corr score is {rmspe_score}')
    
    # Return test predictions
    return test_predictions

In [24]:
def train_and_evaluate(train,test):
    # Hyperparammeters (just basic)
    params = {
      'objective': 'rmse',  
      'boosting_type': 'gbdt',
      'n_jobs': -1,
      'verbose': -1
    }
    
    # Split features and target
    
    x = train[[i for i in df.columns if i not in ['date_time_x', 'date_time_y', 'day', 'time_step', 
                                                  'day_s', 'code', 'date_time','return']]]
    y = train['return']
    
    x_test = test[[i for i in df.columns if i not in ['date_time_x', 'date_time_y', 'day', 'time_step', 
                                                      'day_s', 'code', 'date_time','return']]]

    oof_predictions = np.zeros(x.shape[0])
    test_predictions = np.zeros(x_test.shape[0])
    scores = []

    # Create a KFold object
    gkf = TimeSeriesSplit(n_splits=n_fold,gap=group_gap)
    for fold, (trn_ind, val_ind) in enumerate(gkf.split(train['day'].values)):
    
        print(f'Training fold {fold + 1}')
        x_train, x_val = x.iloc[trn_ind], x.iloc[val_ind]
        y_train, y_val = y.iloc[trn_ind], y.iloc[val_ind]
        
        # create and fit the LSTM network
        model = Sequential()
        model.add(LSTM(units=50, return_sequences=True, input_shape=(x_train.shape[1],1)))
        model.add(LSTM(units=50))
        model.add(Dense(1))
 
        model.compile(loss='mean_squared_error', optimizer='adam', metrics='mae')
        model.fit(x_train, y_train, epochs=1, batch_size=1, verbose=2,validation_data=(x_val,y_val))
        
        #This train_pred must have the same shape as the dataset on which you fitted the scaler. 
        #To do the inverse_transform you can extract the needed attributes from your scaler 
        #and apply them to your prediction.
        scaler_pred=MinMaxScaler()
        scaler_pred.min_, scaler_pred.scale_ = scaler.min_[1], scaler.scale_[1]
        test_predictions += scaler_pred.inverse_transform(model.predict(x_test)) 
        
    rmspe_score = corr_score(y, oof_predictions)
    print(scores)
    print(f'Our out of folds corr score is {rmspe_score}')
    
    # Return test predictions
    return test_predictions

In [25]:
test_predictions = train_and_evaluate(new_df,new_test)

Training fold 1
4871/4871 - 94s - loss: 0.0017 - mae: 0.0307 - val_loss: 0.0076 - val_mae: 0.0667 - 94s/epoch - 19ms/step
Training fold 2
9770/9770 - 168s - loss: 0.0046 - mae: 0.0488 - val_loss: 0.0022 - val_mae: 0.0351 - 168s/epoch - 17ms/step


MemoryError: Unable to allocate 10.6 GiB for an array with shape (37800, 37800) and data type float64

In [None]:
# Save test predictions
test['return'] = test_predictions

prediction = test[['date_time_x','return']]
prediction.columns=['date_time','return']
prediction.to_csv('submission.csv',index = False)

In [None]:
import random 
random.seed(20230206)

SUBMISSION_PATH = '/kaggle/working/submission.csv'

POINT_PER_DAY = 50

class QIDS:
    def __init__(self) -> None:
        self.__submission_path = SUBMISSION_PATH
        self.__current_idx = 0
        self.__predict_idx = 0
        self.__num_of_stocks = 54
        self.__point_per_day = POINT_PER_DAY
        self.__end = False
        self.__current_fundamental_df = None

        self.__fundamental_df = pd.read_csv(TEST_FUNADMENTAL_PATH)
        self.__market_df = pd.read_csv(TEST_MARKET_PATH)
        
        if len(self.__fundamental_df) / self.__num_of_stocks != len(self.__market_df)/ self.__num_of_stocks / self.__point_per_day:
            raise ValueError('The length of fundamental data and market data is not equal.')
        self.__length = len(self.__fundamental_df) / self.__num_of_stocks

        with open(self.__submission_path, 'w') as f:
            f.write('date_time,return\n') 
        
        print('Environment is initialized.')
    
    def is_end(self):
        return self.__end

    # return the fun
    def get_current_market(self):
        if self.__end:
            raise ValueError('The environment has ended.')

        # check if the current index is equal to the predict index
        if self.__current_idx != self.__predict_idx:
            raise ValueError('The current index is not equal to the predict index.')

        # load data of the current day
        fundamental_df = self.__fundamental_df.iloc[self.__current_idx * self.__num_of_stocks: (self.__current_idx + 1) * self.__num_of_stocks]
        market_df = self.__market_df.iloc[self.__current_idx * self.__num_of_stocks * self.__point_per_day: (self.__current_idx + 1) * self.__num_of_stocks * self.__point_per_day]
        
        # update the current index
        self.__current_idx += 1
        self.__current_fundamental_df = fundamental_df.reset_index()
        
        return fundamental_df, market_df

    def input_prediction(self, predict_ds: pd.Series):
        if self.__end:
            raise ValueError('The environment has ended.')

        # check if the current index is equal to the predict index plus 1
        if self.__current_idx != self.__predict_idx + 1:
            raise ValueError('The current index is not equal to the predict index plus 1.')

        # check the length of the predict_ds
        if len(predict_ds) != self.__num_of_stocks:
            raise ValueError('The length of input decisions is wrong.')
        
        # check the type of the predict_ds
        if type(predict_ds) != pd.Series:
            raise TypeError('The type of input decisions is wrong.')
        
        # write the prediction to the submission file
        with open(self.__submission_path, 'a') as f:
            for idx in range(len(predict_ds)):
                f.write(f"{str(self.__current_fundamental_df['date_time'][idx])},{str(predict_ds.iloc[idx])}\n")

                # must follow the stock order
                # f.write(f"s{idx}d{self.__current_idx},{str(predict_ds.iloc[idx])}\n")
        
        self.__predict_idx += 1
        if self.__predict_idx == self.__length:
            self.__end = True
            print('Data Feeding is finished.')
        

# initialize the environment
def make_env():
    if random.random() == 0.8396457911824297:
        return QIDS()
    else:
        raise ImportError('You cannot make this environment twice.')

In [None]:
#from qids_package.qids import *

env = make_env()

import random 
random.seed(42)

while not env.is_end():
	fundamental_df, market_df = env.get_current_market()
	
	l = []
	for idx in range(54):
		l.append(random.random())
	predict_ds =pd.Series(1)
	
	env.input_prediction(predict_ds)