In [2]:
import pandas as pd
import numpy as np
import os

In [4]:
import pandas as pd
import numpy as np
import os

def input_data(path_data):
    if path_data.endswith('.csv'):
        data = pd.read_csv(path_data)
    elif path_data.endswith('.xlsx'):
        data = pd.read_excel(path_data)
    return data

def compute_features(df):

    # Tạo lag
    df['lag_1'] = df['Close']
    df['lag_5'] = df['Close'].shift(4)
    df['lag_10'] = df['Close'].shift(9)

    df['r1'] = np.log(df['Close']/df['Close'].shift(1))
    df['r2'] = np.log(df['Close'].shift(1)/df['Close'].shift(2))
    df['r3'] = np.log(df['Close'].shift(2)/df['Close'].shift(3))
    df['r4'] = np.log(df['Close'].shift(3)/df['Close'].shift(4))
    
    df['r5'] = np.log(df['High'] / df['Open'])
    df['r6'] = np.log(df['High'] / df['Open'].shift(1))
    df['r7'] = np.log(df['High'] / df['Open'].shift(2))
    df['r8'] = np.log(df['High'] / df['Open'].shift(3))

    df['r9'] = np.log(df['High'].shift(1) / df['Open'].shift(1))
    df['r10'] = np.log(df['High'].shift(2) / df['Open'].shift(2))
    df['r11'] = np.log(df['High'].shift(3) / df['Open'].shift(3))

    df['r12'] = np.log(df['Low'] / df['Open'])
    df['r13'] = np.log(df['Low'].shift(1) / df['Open'].shift(1))
    df['r14'] = np.log(df['Low'].shift(2) / df['Open'].shift(2))
    df['r15'] = np.log(df['Low'].shift(3) / df['Open'].shift(3))
    df['RSI'] = compute_rsi(df['Close'], period=14)
    df['Momentum'] = df['Close'] - df['Close'].shift(10)
    df['TrueRange'] = compute_true_range(df)
    df['ATR'] = df['TrueRange'].rolling(window=14).mean()
    df['ParabolicSAR'] = compute_parabolic_sar(df, max_acc=0)


    # Xử lý dữ liệu vô hạn
    df.replace([np.inf, -np.inf], 0, inplace=True)
    # Tạo cột dự đoán    
    df['Close'] = df['Close'].shift(-1)
        # Loại bỏ cột không cần thiết
    df.drop(columns=['High', 'Low', 'Open', 'Volume'], errors='ignore', inplace=True)

    # Điền giá trị thiếu
    df.fillna(method='bfill', inplace=True)
    return df

def compute_rsi(series, period=14):
    delta = series.diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=period).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=period).mean()
    rs = gain / loss
    return 100 - (100 / (1 + rs))

def compute_true_range(df):
    tr1 = df['High'] - df['Low']
    tr2 = abs(df['High'] - df['Close'].shift(1))
    tr3 = abs(df['Low'] - df['Close'].shift(1))
    return pd.concat([tr1, tr2, tr3], axis=1).max(axis=1)

def compute_parabolic_sar(df, acc=0.02, max_acc=0.2):
    sar = df['Close'].copy()
    return sar  # Placeholder, implement SAR logic if needed

folder_path = "../Data_FTSE_01012015_30122024/"

for file in os.listdir(folder_path):
    file_path = os.path.join(folder_path, file)
    df = input_data(file_path)
    df = compute_features(df)
    print(df)
    # break
    df.to_csv(rf"G:\FintechResearch\portfolio-formation-with-ltsm\Data_FTSE_model\{file}", index=False)


  df.fillna(method='bfill', inplace=True)


            Date        Close        lag_1        lag_5       lag_10  \
0     2015-01-02  1003.065369   988.638916   988.638916   988.638916   
1     2015-01-05   941.116516  1003.065369   988.638916   988.638916   
2     2015-01-06   941.116516   941.116516   988.638916   988.638916   
3     2015-01-07   959.785828   941.116516   988.638916   988.638916   
4     2015-01-08   966.574951   959.785828   988.638916   988.638916   
...          ...          ...          ...          ...          ...   
2520  2024-12-20  4941.818359  4993.440430  5096.684570  6226.413086   
2521  2024-12-23  4912.036621  4941.818359  5049.033203  5352.809082   
2522  2024-12-24  4906.080078  4912.036621  5128.452148  5261.478027   
2523  2024-12-27  4895.160156  4906.080078  4953.730957  5209.855957   
2524  2024-12-30          NaN  4895.160156  4993.440430  5158.233887   

            r1        r2        r3        r4        r5  ...       r11  \
0     0.014487  0.014487  0.014487  0.014487  0.006478  ...  0

  df.fillna(method='bfill', inplace=True)


            Date       Close       lag_1       lag_5      lag_10        r1  \
0     2015-01-02  177.574844  183.968109  183.968109  183.968109 -0.035370   
1     2015-01-05  174.283600  177.574844  183.968109  183.968109 -0.035370   
2     2015-01-06  174.472733  174.283600  183.968109  183.968109 -0.018708   
3     2015-01-07  179.352844  174.472733  183.968109  183.968109  0.001085   
4     2015-01-08  174.775406  179.352844  183.968109  183.968109  0.027587   
...          ...         ...         ...         ...         ...       ...   
2520  2024-12-20  261.600006  260.250000  270.299988  267.549988  0.000000   
2521  2024-12-23  263.549988  261.600006  264.549988  264.200012  0.005174   
2522  2024-12-24  264.899994  263.549988  267.049988  266.649994  0.007426   
2523  2024-12-27  264.750000  264.899994  260.250000  270.700012  0.005109   
2524  2024-12-30         NaN  264.750000  260.250000  269.399994 -0.000566   

            r2        r3        r4        r5  ...       r11    

  df.fillna(method='bfill', inplace=True)


            Date        Close        lag_1        lag_5       lag_10  \
0     2015-01-02  1791.766602  1812.503662  1812.503662  1812.503662   
1     2015-01-05  1795.966431  1791.766602  1812.503662  1812.503662   
2     2015-01-06  1790.191406  1795.966431  1812.503662  1812.503662   
3     2015-01-07  1844.002075  1790.191406  1812.503662  1812.503662   
4     2015-01-08  1822.740479  1844.002075  1812.503662  1812.503662   
...          ...          ...          ...          ...          ...   
2520  2024-12-20  2875.000000  2879.000000  2919.541260  2923.461426   
2521  2024-12-23  2880.000000  2875.000000  2885.239746  2909.740723   
2522  2024-12-24  2893.000000  2880.000000  2891.119873  2939.141846   
2523  2024-12-27  2875.000000  2893.000000  2894.000000  2910.720703   
2524  2024-12-30          NaN  2875.000000  2879.000000  2932.281738   

            r1        r2        r3        r4        r5  ...       r11  \
0    -0.011507 -0.011507 -0.011507 -0.011507  0.004845  ...  0

  df.fillna(method='bfill', inplace=True)


            Date       Close       lag_1       lag_5      lag_10        r1  \
0     2015-01-02  215.804077  227.294754  227.294754  227.294754 -0.051877   
1     2015-01-05  216.551620  215.804077  227.294754  227.294754 -0.051877   
2     2015-01-06  218.046844  216.551620  227.294754  227.294754  0.003458   
3     2015-01-07  224.276642  218.046844  227.294754  227.294754  0.006881   
4     2015-01-08  220.760254  224.276642  227.294754  227.294754  0.028170   
...          ...         ...         ...         ...         ...       ...   
2520  2024-12-20  380.700012  379.049988  385.100006  393.850006  0.000000   
2521  2024-12-23  381.250000  380.700012  380.100006  395.500000  0.004344   
2522  2024-12-24  385.450012  381.250000  383.399994  394.049988  0.001444   
2523  2024-12-27  386.750000  385.450012  379.049988  397.049988  0.010956   
2524  2024-12-30         NaN  386.750000  379.049988  395.950012  0.003367   

            r2        r3        r4        r5  ...       r11    

  df.fillna(method='bfill', inplace=True)


            Date        Close        lag_1        lag_5       lag_10  \
0     2015-01-02  1747.779541  1626.337524  1626.337524  1626.337524   
1     2015-01-05  1762.549561  1747.779541  1626.337524  1626.337524   
2     2015-01-06  1782.243164  1762.549561  1626.337524  1626.337524   
3     2015-01-07  1784.704712  1782.243164  1626.337524  1626.337524   
4     2015-01-08  1873.324341  1784.704712  1626.337524  1626.337524   
...          ...          ...          ...          ...          ...   
2520  2024-12-20  1988.000000  1953.000000  1938.000000  1958.000000   
2521  2024-12-23  1989.000000  1988.000000  1915.000000  1956.000000   
2522  2024-12-24  1982.000000  1989.000000  1928.000000  1964.000000   
2523  2024-12-27  1984.000000  1982.000000  1935.000000  1954.000000   
2524  2024-12-30          NaN  1984.000000  1953.000000  1934.000000   

            r1        r2        r3        r4        r5  ...       r11  \
0     0.072016  0.072016  0.072016  0.072016  0.023685  ...  0

  df.fillna(method='bfill', inplace=True)


            Date        Close        lag_1        lag_5       lag_10  \
0     2015-01-02   608.478699   597.621155   597.621155   597.621155   
1     2015-01-05   599.883179   608.478699   597.621155   597.621155   
2     2015-01-06   604.407227   599.883179   597.621155   597.621155   
3     2015-01-07   617.979187   604.407227   597.621155   597.621155   
4     2015-01-08   625.217651   617.979187   597.621155   597.621155   
...          ...          ...          ...          ...          ...   
2520  2024-12-20  2720.000000  2725.000000  2762.121826  2779.067383   
2521  2024-12-23  2730.000000  2720.000000  2774.083252  2768.102539   
2522  2024-12-24  2722.000000  2730.000000  2800.000000  2785.048096   
2523  2024-12-27  2688.000000  2722.000000  2733.000000  2747.169922   
2524  2024-12-30          NaN  2688.000000  2725.000000  2724.243408   

            r1        r2        r3        r4        r5  ...       r11  \
0     0.018005  0.018005  0.018005  0.018005  0.000000  ...  0

  df.fillna(method='bfill', inplace=True)


            Date        Close        lag_1        lag_5       lag_10  \
0     2015-01-02  2645.868896  2645.868896  2645.868896  2645.868896   
1     2015-01-05  2633.365967  2645.868896  2645.868896  2645.868896   
2     2015-01-06  2631.022461  2633.365967  2645.868896  2645.868896   
3     2015-01-07  2700.567627  2631.022461  2645.868896  2645.868896   
4     2015-01-08  2739.638428  2700.567627  2645.868896  2645.868896   
...          ...          ...          ...          ...          ...   
2520  2024-12-20  1334.000000  1336.000000  1397.000000  1408.000000   
2521  2024-12-23  1342.000000  1334.000000  1378.000000  1390.000000   
2522  2024-12-24  1335.000000  1342.000000  1351.000000  1391.000000   
2523  2024-12-27  1330.000000  1335.000000  1335.000000  1371.000000   
2524  2024-12-30          NaN  1330.000000  1336.000000  1352.000000   

            r1        r2        r3        r4        r5  ...       r11  \
0     0.000000  0.000000  0.000000  0.000000  0.000000  ...  0

  df.fillna(method='bfill', inplace=True)


            Date       Close       lag_1       lag_5      lag_10        r1  \
0     2015-01-02  124.010918  125.748894  125.748894  125.748894 -0.013917   
1     2015-01-05  121.966209  124.010918  125.748894  125.748894 -0.013917   
2     2015-01-06  123.244148  121.966209  125.748894  125.748894 -0.016626   
3     2015-01-07  125.135498  123.244148  125.748894  125.748894  0.010423   
4     2015-01-08  124.777687  125.135498  125.748894  125.748894  0.015230   
...          ...         ...         ...         ...         ...       ...   
2520  2024-12-20  224.100006  223.899994  228.800003  236.000000  0.001341   
2521  2024-12-23  225.399994  224.100006  226.600006  232.100006  0.000893   
2522  2024-12-24  226.800003  225.399994  225.800003  233.199997  0.005784   
2523  2024-12-27  227.100006  226.800003  223.600006  232.000000  0.006192   
2524  2024-12-30         NaN  227.100006  223.899994  232.199997  0.001322   

            r2        r3        r4        r5  ...       r11    

  df.fillna(method='bfill', inplace=True)


            Date        Close        lag_1        lag_5       lag_10  \
0     2015-01-02   824.373962   835.744507   835.744507   835.744507   
1     2015-01-05   813.003357   824.373962   835.744507   835.744507   
2     2015-01-06   810.160522   813.003357   835.744507   835.744507   
3     2015-01-07   826.505859   810.160522   835.744507   835.744507   
4     2015-01-08   836.455200   826.505859   835.744507   835.744507   
...          ...          ...          ...          ...          ...   
2520  2024-12-20  1272.500000  1269.000000  1274.500000  1248.000000   
2521  2024-12-23  1278.500000  1272.500000  1276.000000  1237.500000   
2522  2024-12-24  1290.000000  1278.500000  1278.500000  1279.000000   
2523  2024-12-27  1281.500000  1290.000000  1265.500000  1270.500000   
2524  2024-12-30          NaN  1281.500000  1269.000000  1282.000000   

            r1        r2        r3        r4        r5  ...       r11  \
0    -0.013699 -0.013699 -0.013699 -0.013699  0.000000  ...  0

  df.fillna(method='bfill', inplace=True)


            Date        Close        lag_1        lag_5       lag_10  \
0     2015-01-02   856.094727   864.806274   864.806274   864.806274   
1     2015-01-05   844.215576   856.094727   864.806274   864.806274   
2     2015-01-06   852.927002   844.215576   864.806274   864.806274   
3     2015-01-07   865.598267   852.927002   864.806274   864.806274   
4     2015-01-08   863.222168   865.598267   864.806274   864.806274   
...          ...          ...          ...          ...          ...   
2520  2024-12-20  3608.000000  3629.000000  3709.000000  3677.000000   
2521  2024-12-23  3617.000000  3608.000000  3707.000000  3692.000000   
2522  2024-12-24  3631.000000  3617.000000  3730.000000  3713.000000   
2523  2024-12-27  3604.000000  3631.000000  3654.000000  3740.000000   
2524  2024-12-30          NaN  3604.000000  3629.000000  3727.000000   

            r1        r2        r3        r4        r5  ...       r11  \
0    -0.010124 -0.010124 -0.010124 -0.010124  0.000905  ...  0

  df.fillna(method='bfill', inplace=True)


            Date        Close        lag_1       lag_5      lag_10        r1  \
0     2015-01-02  3917.394531  3906.051270  3906.05127  3906.05127  0.002900   
1     2015-01-05  3864.457031  3917.394531  3906.05127  3906.05127  0.002900   
2     2015-01-06  3909.831299  3864.457031  3906.05127  3906.05127 -0.013606   
3     2015-01-07  3977.895020  3909.831299  3906.05127  3906.05127  0.011673   
4     2015-01-08  3966.551270  3977.895020  3906.05127  3906.05127  0.017259   
...          ...          ...          ...         ...         ...       ...   
2520  2024-12-20  4817.000000  4808.000000  4856.00000  4738.00000 -0.002493   
2521  2024-12-23  4816.000000  4817.000000  4843.00000  4703.00000  0.001870   
2522  2024-12-24  4828.000000  4816.000000  4823.00000  4835.00000 -0.000208   
2523  2024-12-27  4807.000000  4828.000000  4820.00000  4837.00000  0.002489   
2524  2024-12-30          NaN  4807.000000  4808.00000  4857.00000 -0.004359   

            r2        r3        r4     

  df.fillna(method='bfill', inplace=True)


            Date       Close       lag_1       lag_5      lag_10        r1  \
0     2015-01-02  303.272675  308.453918  308.453918  308.453918 -0.016940   
1     2015-01-05  295.097961  303.272675  308.453918  308.453918 -0.016940   
2     2015-01-06  297.515808  295.097961  308.453918  308.453918 -0.027325   
3     2015-01-07  306.266296  297.515808  308.453918  308.453918  0.008160   
4     2015-01-08  303.157562  306.266296  308.453918  308.453918  0.028988   
...          ...         ...         ...         ...         ...       ...   
2520  2024-12-20  307.600006  312.200012  316.600006  316.799988  0.001924   
2521  2024-12-23  310.600006  307.600006  312.399994  314.000000 -0.014844   
2522  2024-12-24  313.399994  310.600006  315.200012  315.600006  0.009706   
2523  2024-12-27  316.600006  313.399994  311.600006  318.799988  0.008974   
2524  2024-12-30         NaN  316.600006  312.200012  318.000000  0.010159   

            r2        r3        r4        r5  ...       r11    

  df.fillna(method='bfill', inplace=True)


            Date        Close        lag_1        lag_5       lag_10  \
0     2015-01-02   366.047272   366.047272   366.047272   366.047272   
1     2015-01-05   359.787384   366.047272   366.047272   366.047272   
2     2015-01-06   359.943848   359.787384   366.047272   366.047272   
3     2015-01-07   364.325806   359.943848   366.047272   366.047272   
4     2015-01-08   361.978333   364.325806   366.047272   366.047272   
...          ...          ...          ...          ...          ...   
2520  2024-12-20  1275.685913  1276.675537  1285.582642  1289.541260   
2521  2024-12-23  1279.149780  1275.685913  1291.520630  1286.572266   
2522  2024-12-24  1277.665283  1279.149780  1309.334717  1276.180786   
2523  2024-12-27  1271.727173  1277.665283  1292.015503  1293.005127   
2524  2024-12-30          NaN  1271.727173  1276.675537  1293.994873   

            r1        r2        r3        r4        r5  ...       r11  \
0     0.000000  0.000000  0.000000  0.000000  0.007697  ...  0

  df.fillna(method='bfill', inplace=True)


            Date        Close        lag_1        lag_5       lag_10  \
0     2015-01-02  1246.865967  1299.618164  1299.618164  1299.618164   
1     2015-01-05  1246.566406  1246.865967  1299.618164  1299.618164   
2     2015-01-06  1257.655884  1246.566406  1299.618164  1299.618164   
3     2015-01-07  1284.331787  1257.655884  1299.618164  1299.618164   
4     2015-01-08  1262.750854  1284.331787  1299.618164  1299.618164   
...          ...          ...          ...          ...          ...   
2520  2024-12-20  2409.000000  2394.500000  2458.500000  2536.500000   
2521  2024-12-23  2429.000000  2409.000000  2413.500000  2522.500000   
2522  2024-12-24  2440.500000  2429.000000  2419.000000  2492.500000   
2523  2024-12-27  2442.000000  2440.500000  2405.500000  2505.000000   
2524  2024-12-30          NaN  2442.000000  2394.500000  2498.500000   

            r1        r2        r3        r4        r5  ...       r11  \
0    -0.041437 -0.041437 -0.041437 -0.041437  0.001379  ...  0

  df.fillna(method='bfill', inplace=True)


            Date        Close        lag_1        lag_5       lag_10  \
0     2015-01-02   870.877441   882.972839   882.972839   882.972839   
1     2015-01-05   879.674316   870.877441   882.972839   882.972839   
2     2015-01-06   879.674316   879.674316   882.972839   882.972839   
3     2015-01-07   879.674316   879.674316   882.972839   882.972839   
4     2015-01-08   876.924988   879.674316   882.972839   882.972839   
...          ...          ...          ...          ...          ...   
2520  2024-12-20  1583.293335  1584.773560  1610.923340  1676.544312   
2521  2024-12-23  1584.280151  1583.293335  1595.628174  1653.848389   
2522  2024-12-24  1585.760376  1584.280151  1591.187622  1638.059814   
2523  2024-12-27  1590.200806  1585.760376  1576.879272  1650.394653   
2524  2024-12-30          NaN  1590.200806  1584.773560  1646.447510   

            r1        r2        r3        r4        r5  ...       r11  \
0    -0.013793 -0.013793 -0.013793 -0.013793  0.000000  ...  0

  df.fillna(method='bfill', inplace=True)


            Date       Close       lag_1        lag_5      lag_10        r1  \
0     2015-01-02  699.211853  710.284790   710.284790  710.284790 -0.015712   
1     2015-01-05  699.138062  699.211853   710.284790  710.284790 -0.015712   
2     2015-01-06  709.989502  699.138062   710.284790  710.284790 -0.000106   
3     2015-01-07  716.559326  709.989502   710.284790  710.284790  0.015402   
4     2015-01-08  697.661743  716.559326   710.284790  710.284790  0.009211   
...          ...         ...         ...          ...         ...       ...   
2520  2024-12-20  980.799988  972.000000   991.400024  989.200012 -0.011863   
2521  2024-12-23  990.799988  980.799988   990.000000  987.799988  0.009013   
2522  2024-12-24  982.599976  990.799988  1000.500000  989.000000  0.010144   
2523  2024-12-27  984.799988  982.599976   983.599976  991.599976 -0.008311   
2524  2024-12-30         NaN  984.799988   972.000000  996.799988  0.002236   

            r2        r3        r4        r5  ...  

  df.fillna(method='bfill', inplace=True)


            Date       Close       lag_1      lag_5     lag_10        r1  \
0     2015-01-02  542.674133  557.136230  557.13623  557.13623 -0.026301   
1     2015-01-05  528.900757  542.674133  557.13623  557.13623 -0.026301   
2     2015-01-06  528.556396  528.900757  557.13623  557.13623 -0.025708   
3     2015-01-07  547.839233  528.556396  557.13623  557.13623 -0.000651   
4     2015-01-08  539.575256  547.839233  557.13623  557.13623  0.035832   
...          ...         ...         ...        ...        ...       ...   
2520  2024-12-20  861.000000  863.000000  905.00000  850.00000 -0.018939   
2521  2024-12-23  865.000000  861.000000  891.50000  864.00000 -0.002320   
2522  2024-12-24  865.500000  865.000000  900.50000  878.50000  0.004635   
2523  2024-12-27  849.000000  865.500000  879.50000  896.00000  0.000578   
2524  2024-12-30         NaN  849.000000  863.00000  914.00000 -0.019248   

            r2        r3        r4        r5  ...       r11       r12  \
0    -0.026301

  df.fillna(method='bfill', inplace=True)


            Date       Close       lag_1       lag_5      lag_10        r1  \
0     2015-01-02  148.077835  153.296478  153.296478  153.296478 -0.034636   
1     2015-01-05  145.794754  148.077835  153.296478  153.296478 -0.034636   
2     2015-01-06  148.404037  145.794754  153.296478  153.296478 -0.015538   
3     2015-01-07  170.623825  148.404037  153.296478  153.296478  0.017739   
4     2015-01-08  166.424469  170.623825  153.296478  153.296478  0.139523   
...          ...         ...         ...         ...         ...       ...   
2520  2024-12-20  366.000000  366.399994  373.100006  364.799988 -0.005715   
2521  2024-12-23  368.500000  366.000000  374.000000  368.600006 -0.001092   
2522  2024-12-24  366.899994  368.500000  372.899994  370.200012  0.006807   
2523  2024-12-27  365.600006  366.899994  368.500000  370.600006 -0.004351   
2524  2024-12-30         NaN  365.600006  366.399994  372.000000 -0.003549   

            r2        r3        r4        r5  ...       r11    

  df.fillna(method='bfill', inplace=True)


            Date       Close       lag_1       lag_5      lag_10        r1  \
0     2015-01-02   14.006000   14.620667   14.620667   14.620667 -0.042950   
1     2015-01-05   14.085333   14.006000   14.620667   14.620667 -0.042950   
2     2015-01-06   14.063333   14.085333   14.620667   14.620667  0.005648   
3     2015-01-07   14.041333   14.063333   14.620667   14.620667 -0.001563   
4     2015-01-08   13.777333   14.041333   14.620667   14.620667 -0.001566   
...          ...         ...         ...         ...         ...       ...   
2510  2024-12-23  462.279999  430.600006  479.859985  400.989990  0.022404   
2511  2024-12-24  454.130005  462.279999  440.130005  424.769989  0.070991   
2512  2024-12-26  431.660004  454.130005  436.170013  418.100006 -0.017787   
2513  2024-12-27  417.410004  431.660004  421.059998  436.230011 -0.050745   
2514  2024-12-30         NaN  417.410004  430.600006  463.019989 -0.033569   

            r2        r3        r4        r5  ...       r11    

  df.fillna(method='bfill', inplace=True)


In [None]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
import pandas as pd

def select_features(X, y, n_features_to_select=20, feature_range=(-1, 1)):
    """
    Hàm này thực hiện chọn đặc trưng sử dụng RFE với Logistic Regression và chuẩn hóa dữ liệu.

    Parameters:
    - X (pd.DataFrame): DataFrame chứa các đặc trưng đầu vào.
    - y (pd.Series): Series chứa biến mục tiêu (lợi nhuận tại thời điểm t+1).
    - n_features_to_select (int): Số lượng đặc trưng cần chọn (mặc định là 20).
    - feature_range (tuple): Phạm vi giá trị để chuẩn hóa dữ liệu (mặc định là (-1, 1)).

    Returns:
    - selected_features (list): Danh sách các đặc trưng được chọn.
    - X_selected (pd.DataFrame): DataFrame chứa các đặc trưng được chọn.
    - X_scaled (pd.DataFrame): DataFrame chứa các đặc trưng được chọn và đã chuẩn hóa.
    """

    # 1. Chuẩn hóa dữ liệu
    scaler = MinMaxScaler(feature_range=feature_range)
    X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

    # 2. Chọn đặc trưng sử dụng RFE với Logistic Regression
    model = LogisticRegression()
    rfe = RFE(model, n_features_to_select=n_features_to_select)
    rfe.fit(X_scaled, y)

    # 3. Lấy các đặc trưng được chọn
    selected_features = X.columns[rfe.support_].tolist()
    X_selected = X[selected_features]
    X_scaled_selected = X_scaled[selected_features]

    return selected_features, X_selected, X_scaled_selected

# Ví dụ sử dụng hàm
if __name__ == "__main__":
    # Giả sử X là DataFrame chứa các đặc trưng và y là Series chứa biến mục tiêu
    # Ví dụ:
    # X = pd.DataFrame({
    #     'Open': [...],
    #     'Close': [...],
    #     'High': [...],
    #     'Low': [...],
    #     'Volume': [...],
    #     'RSI': [...],
    #     'MoM': [...],
    #     'TR': [...],
    #     'ATR': [...],
    #     'Parabolic_SAR': [...],
    #     # Thêm các đặc trưng khác...
    # })
    # y = pd.Series([...])  # Lợi nhuận tại thời điểm t+1

    # Gọi hàm để chọn đặc trưng
    selected_features, X_selected, X_scaled_selected = select_features(X, y)

    # In kết quả
    print("Selected Features:", selected_features)
    print("\nSelected Features DataFrame:")
    print(X_selected.head())
    print("\nScaled Selected Features DataFrame:")
    print(X_scaled_selected.head())

In [1]:
import pandas as pd
import numpy as np

# Tạo dữ liệu mẫu cho 10 ngày
data = {
    'Date': pd.date_range(start='2023-10-01', periods=10, freq='D'),  # Ngày từ 01/10/2023 đến 10/10/2023
    'Open': np.random.uniform(100, 110, 10).round(2),  # Giá mở cửa ngẫu nhiên từ 100 đến 110
    'High': np.random.uniform(110, 120, 10).round(2),  # Giá cao nhất ngẫu nhiên từ 110 đến 120
    'Low': np.random.uniform(90, 100, 10).round(2),    # Giá thấp nhất ngẫu nhiên từ 90 đến 100
    'Close': np.random.uniform(100, 110, 10).round(2), # Giá đóng cửa ngẫu nhiên từ 100 đến 110
    'Volume': np.random.randint(1000, 5000, 10)        # Khối lượng giao dịch ngẫu nhiên từ 1000 đến 5000
}

# Tạo DataFrame từ dữ liệu
df = pd.DataFrame(data)

# In ra dữ liệu
print("Sample Data:")
print(df)

Sample Data:
        Date    Open    High    Low   Close  Volume
0 2023-10-01  109.23  111.93  97.95  103.76    2889
1 2023-10-02  101.08  110.47  93.99  101.79    1634
2 2023-10-03  103.55  114.45  95.73  102.53    4330
3 2023-10-04  106.09  111.48  95.19  102.39    1480
4 2023-10-05  102.51  119.07  93.99  104.14    1633
5 2023-10-06  103.78  111.78  90.77  101.99    4903
6 2023-10-07  101.47  113.92  99.24  101.54    1720
7 2023-10-08  100.34  115.32  94.16  105.92    3497
8 2023-10-09  107.43  112.57  96.84  108.44    1982
9 2023-10-10  100.49  117.79  98.18  104.11    3155


In [5]:
# Thiết lập các tham số
train_size = 5  # Kích thước tập huấn luyện (5 ngày)
test_size = 2   # Kích thước tập kiểm tra (2 ngày)
step_size = 2   # Bước trượt (2 ngày)

# Tạo các study periods
study_periods = []
for i in range(0, len(df) - train_size - test_size + 1, step_size):
    train_start = i
    train_end = i + train_size
    test_start = train_end
    test_end = test_start + test_size
    
    train_set = df.iloc[train_start:train_end]
    test_set = df.iloc[test_start:test_end]
    
    study_periods.append((train_set, test_set))
# In ra số lượng study periods
print(f"Total study periods: {len(study_periods)}")
for i in range(len(study_periods)):
    # Ví dụ: In ra thông tin của study period đầu tiên
    train_set, test_set = study_periods[i]
    print("\nTraining set (first study period):")
    print(train_set[['Date', 'Close']])
    print("\nTesting set (first study period):")
    print(test_set[['Date', 'Close']])

Total study periods: 2

Training set (first study period):
        Date   Close
0 2023-10-01  103.76
1 2023-10-02  101.79
2 2023-10-03  102.53
3 2023-10-04  102.39
4 2023-10-05  104.14

Testing set (first study period):
        Date   Close
5 2023-10-06  101.99
6 2023-10-07  101.54

Training set (first study period):
        Date   Close
2 2023-10-03  102.53
3 2023-10-04  102.39
4 2023-10-05  104.14
5 2023-10-06  101.99
6 2023-10-07  101.54

Testing set (first study period):
        Date   Close
7 2023-10-08  105.92
8 2023-10-09  108.44
