# 1. Data introduction

In [1]:
import polars as pl
import time
import numpy as np

In [3]:
# Load the two raw LU parquet exports: the quote table and the K-line table.
# NOTE(review): the second file name is spelled 'kabr' (not 'kbar') — confirm it
# matches the file on disk.
lu_data_pl = pl.read_parquet('lu_total.parquet')
lu_kbar_pl = pl.read_parquet('lu_kabr_total.parquet')

**The original data comes from a public source, Esunny 9.5 or Epolestar 9.5. The data columns are shown below. The instrument is a commodity, low-sulfur fuel oil. Its ticker on SHFE (or INE) is LU. The data is intraday and ranges from 2024/11/22 to 2024/12/3.**

In [51]:
# Print a per-column preview (name, dtype, first values) of the quote table.
lu_data_pl.glimpse()

Rows: 256873
Columns: 70
$ ContractNo                 <str> 'INE|F|LU|2503', 'INE|F|LU|2503', 'INE|F|LU|2503', 'INE|F|LU|2503', 'INE|F|LU|2503', 'INE|F|LU|2503', 'INE|F|LU|2503', 'INE|F|LU|2503', 'INE|F|LU|2503', 'INE|F|LU|2503'
$ UpdateTime                 <str> '20241122210001500', '20241122210001500', '20241122210002000', '20241122210002000', '20241122210002500', '20241122210002500', '20241122210003000', '20241122210003000', '20241122210003500', '20241122210003500'
$ PreClosingPrice            <f64> 3971.0, 3971.0, 3971.0, 3971.0, 3971.0, 3971.0, 3971.0, 3971.0, 3971.0, 3971.0
$ PreSettlePrice             <f64> 3972.0, 3972.0, 3972.0, 3972.0, 3972.0, 3972.0, 3972.0, 3972.0, 3972.0, 3972.0
$ PrePositionQty             <i64> 30535, 30535, 30535, 30535, 30535, 30535, 30535, 30535, 30535, 30535
$ OpeningPrice               <f64> 3959.0, 3959.0, 3959.0, 3959.0, 3959.0, 3959.0, 3959.0, 3959.0, 3959.0, 3959.0
$ LastPrice                  <f64> 3939.0, 3939.0, 3942.0, 3942.0, 3943.0, 3943.0

In [52]:
# Print a per-column preview (name, dtype, first values) of the K-line table.
lu_kbar_pl.glimpse()

Rows: 50422
Columns: 20
$ ContractNo                 <str> 'INE|F|LU|2503', 'INE|F|LU|2503', 'INE|F|LU|2503', 'INE|F|LU|2503', 'INE|F|LU|2503', 'INE|F|LU|2503', 'INE|F|LU|2503', 'INE|F|LU|2503', 'INE|F|LU|2503', 'INE|F|LU|2503'
$ KLineType                  <str> 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T'
$ KLineSlice                 <i64> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
$ TradeDate                  <str> '20241122', '20241125', '20241125', '20241125', '20241125', '20241125', '20241125', '20241125', '20241125', '20241125'
$ DateTimeStamp              <str> '20241122145950500', '20241122210000500', '20241122210001000', '20241122210001500', '20241122210002000', '20241122210002500', '20241122210003000', '20241122210003500', '20241122210004000', '20241122210004500'
$ TotalQty                   <i64> 15783, 114, 141, 187, 204, 219, 248, 260, 268, 275
$ PositionQty                <i64> 30535, 30614, 30636, 30639, 30647, 30648, 30644, 30649, 30651, 30653
$ LastPrice                  <f64> 3

# 2. Factor construction and feature engineering

**The next step is feature engineering, which may contain sensitive information, so the feature engineering part is censored here. I can, however, give a brief introduction to the high-frequency factors used. They fall into several groups: order book structure factors, order flow factors, trend factors, volatility factors, and some trivial factors as a supplement. The processed data contains 289 columns: 287 features as our X and 2 target features as our Y.**  

**The target features are the mean and standard deviation of the price distribution over a short future horizon.**
  
The processed data is as follows:

In [53]:
# Load the processed table (features plus targets) described in the text above.
data = pl.read_parquet('data.parquet')

In [55]:
# Show the column names of the processed table.
data.columns

['LastPrice',
 'midprice',
 'factor0',
 'factor1',
 'factor2',
 'factor3',
 'factor4',
 'factor5',
 'factor6',
 'factor7',
 'factor8',
 'factor9',
 'factor10',
 'factor11',
 'factor12',
 'factor13',
 'factor14',
 'factor15',
 'factor16',
 'factor17',
 'factor18',
 'factor19',
 'factor20',
 'factor21',
 'factor22',
 'factor23',
 'factor24',
 'factor25',
 'factor26',
 'factor27',
 'factor28',
 'factor29',
 'factor30',
 'factor31',
 'factor32',
 'factor33',
 'factor34',
 'factor35',
 'factor36',
 'factor37',
 'factor38',
 'factor39',
 'factor40',
 'factor41',
 'factor42',
 'factor43',
 'factor44',
 'factor45',
 'factor46',
 'factor47',
 'factor48',
 'factor49',
 'factor50',
 'factor51',
 'factor52',
 'factor53',
 'factor54',
 'factor55',
 'factor56',
 'factor57',
 'factor58',
 'factor59',
 'factor60',
 'factor61',
 'factor62',
 'factor63',
 'factor64',
 'factor65',
 'factor66',
 'factor67',
 'factor68',
 'factor69',
 'factor70',
 'factor71',
 'factor72',
 'factor73',
 'factor74',
 'factor

# 3. Standardize the data

**Define the data standardizer**

In [56]:
from sklearn.preprocessing import StandardScaler, RobustScaler

class datapreprocessor:
    """Fit one ``StandardScaler`` per selected column and build a scaled frame.

    Instance fields:
        raw_data: the polars DataFrame passed to the constructor.
        need_processing_list: names of the columns that should be standardized.
        scaler_dict: maps column name -> fitted ``StandardScaler``; filled by
            ``training_scale``.
        scaled_df: polars DataFrame with the same column order as ``raw_data``;
            listed columns are standardized, all others are carried over
            unchanged. Empty until ``training_scale`` has run.
    """

    def __init__(self,data,process_list):
        """Store the input frame and the list of column names to standardize."""
        self.raw_data = data
        self.need_processing_list = process_list

        self.scaler_dict = {}
        self.scaled_df = pl.DataFrame()

    def training_scale(self):
        """Fit a scaler on every listed column and rebuild ``scaled_df``.

        The method can be called repeatedly: ``scaler_dict`` and ``scaled_df``
        are rebuilt from scratch on each call and only replaced once every
        column has been processed.

        Names in ``need_processing_list`` that are not columns of ``raw_data``
        cannot be scaled; a ``UserWarning`` lists them so that a misspelt
        column name does not go unnoticed.

        Returns:
            None. Results are stored in ``scaler_dict`` and ``scaled_df``.
        """
        import warnings

        wanted = set(self.need_processing_list)
        present = set(self.raw_data.columns)
        missing = [tag for tag in self.need_processing_list if tag not in present]
        if missing:
            warnings.warn(
                f"columns requested for scaling but not found in data: {missing}",
                stacklevel=2,
            )

        scalers = {}
        columns = []
        for tag in self.raw_data.columns:
            column = self.raw_data[tag]
            if tag in wanted:
                # scikit-learn expects a 2-D (n_samples, 1) array per feature.
                values = column.to_numpy().reshape(-1, 1)
                scaler = StandardScaler().fit(values)
                scalers[tag] = scaler
                column = pl.Series(tag, scaler.transform(values).ravel())
            columns.append(column)

        self.scaler_dict = scalers
        # Build the frame once instead of inserting column by column.
        self.scaled_df = pl.DataFrame(columns)

    def inferrence_scale(self, data=None):
        """Standardize a frame with the scalers fitted by ``training_scale``.

        Args:
            data: polars DataFrame to transform. Defaults to ``raw_data``.

        Returns:
            A new polars DataFrame with the column order of the input. Columns
            that have a fitted scaler are transformed; all other columns are
            passed through unchanged (so the input comes back as-is when
            nothing has been fitted yet).
        """
        frame = self.raw_data if data is None else data
        return self._apply_scalers(frame, inverse=False)

    def reverse_scale(self, data=None):
        """Map standardized columns back to their original scale.

        Args:
            data: polars DataFrame holding standardized values. Defaults to
                ``scaled_df``.

        Returns:
            A new polars DataFrame with the column order of the input. Columns
            that have a fitted scaler are inverse-transformed; all other
            columns are passed through unchanged.
        """
        frame = self.scaled_df if data is None else data
        return self._apply_scalers(frame, inverse=True)

    def _apply_scalers(self, frame, inverse):
        """Apply each fitted scaler (forward or inverse) to its column of ``frame``."""
        columns = []
        for tag in frame.columns:
            column = frame[tag]
            scaler = self.scaler_dict.get(tag)
            if scaler is not None:
                values = column.to_numpy().reshape(-1, 1)
                convert = scaler.inverse_transform if inverse else scaler.transform
                column = pl.Series(tag, convert(values).ravel())
            columns.append(column)
        return pl.DataFrame(columns)


**Standardize the data**

In [62]:
# Columns handed to the preprocessor for standardization: both price columns,
# factor1 through factor284 except factor23, factor27 and factor28, and the two
# target columns. factor0 is not listed either, so it is left unscaled.
# NOTE(review): the 'traget_*' spelling must match the column names of `data`
# exactly — names that are not columns of `data` are never scaled. Confirm.
sd_list = (
    ['LastPrice', 'midprice']
    + [f'factor{idx}' for idx in range(1, 285) if idx not in (23, 27, 28)]
    + ['traget_mean', 'traget_std']
)

In [64]:
# Fit one StandardScaler per column named in sd_list and build the scaled frame.
dfprocessor = datapreprocessor(data,sd_list)
dfprocessor.training_scale()

In [66]:
# scaled_df keeps the column order of `data`; the sd_list columns are standardized.
standardized_data = dfprocessor.scaled_df

**Save the standardized data and scaler**

In [67]:
# Persist the standardized frame as Parquet (path is relative to the working directory).
standardized_data.write_parquet('standardized_data.parquet')

In [70]:
import pickle

def save_scaler_pickle(scaler, filename):
    """Save the scaler to a file using pickle.

    The object is serialized to bytes before ``filename`` is opened. Opening a
    file in ``'wb'`` mode truncates it, so serializing first means that an
    unpicklable object cannot wipe out a previously saved file.

    Args:
        scaler: Object to serialize (for example a dict of fitted scalers).
        filename: Destination path; overwritten on success.

    Returns:
        None. Failures are not raised; an error message is printed instead.
    """
    try:
        payload = pickle.dumps(scaler)
        with open(filename, 'wb') as f:
            f.write(payload)
        print(f"Scaler saved successfully to {filename}")
    except Exception as e:
        print(f"Error saving scaler: {e}")


In [71]:
# Persist the dict of fitted per-column scalers next to the standardized data.
save_scaler_pickle(dfprocessor.scaler_dict,'scaler_dict.pkl')

Scaler saved successfully to scaler_dict.pkl
