In [1]:
# preprocess.py
import pandas as pd
import numpy as np

def extract_X(csv_file):
    X_df = pd.read_csv(csv_file, index_col=[0], header=[0, 1, 2])
    X_arr = np.array(X_df)
    X_shape = X_arr.shape
    X_arr = np.reshape(X_arr, (X_shape[0], -1, 3, 24))
    X_arr = X_arr[:, :, 1, :]
    return X_arr

def extract_X_clean(csv_file):
    X_df = pd.read_csv(csv_file, index_col=[0], header=[0, 1, 2])
    X_arr = np.array(X_df)
    X_shape = X_arr.shape
    X_arr = np.reshape(X_arr, (X_shape[0], -1, 3, 24))
    def check_mask(sample):
        mask = sample[:, 0, :] # (104, 24)
        mask = np.sum(mask, axis=-1) # (104)
        mask = np.array(list(map(lambda item: max(0, 1-item), mask)))
        all_fake = np.average(mask, axis=-1)
        return all_fake < 0.7
    X_arr = np.array(list(filter(check_mask, X_arr)))
    X_arr = X_arr[:, :, 1, :]
    return X_arr

def extract_y(csv_file):
    y_df = pd.read_csv(csv_file, index_col=[0], header=[0])
    y_arr = np.array(y_df)[:, 0]
    return y_arr

def time_reduction(X_arr, factor):
    n, a, t = X_arr.shape
    reduced_arr = np.reshape(X_arr, (n, a, -1, factor))
    reduced_arr = np.average(reduced_arr, axis = -1)
    return reduced_arr

def attribute_reduction(X_arr, n_components):
    from sklearn.decomposition import PCA
    n, a, t = X_arr.shape
    processed_arr = np.transpose(X_arr, (0, 2, 1))
    processed_arr = np.reshape(processed_arr, (-1, a))
    pca = PCA(n_components = n_components)
    new_arr = pca.fit_transform(processed_arr)
    new_arr = np.reshape(new_arr, (n, t, n_components))
    new_arr = np.transpose(new_arr, (0, 2, 1))
    new_arr = np.reshape(new_arr, (n, n_components, t))
    exp_var_ratio = sum(pca.explained_variance_ratio_)
    return new_arr, exp_var_ratio


In [2]:
X_train = extract_X("X_train.csv")
X_valid = extract_X("X_valid.csv")
X_train.shape, X_valid.shape

((16760, 104, 24), (2394, 104, 24))

In [3]:
X_train_clean = extract_X_clean("X_train.csv")
X_valid_clean = extract_X_clean("X_valid.csv")
X_train_clean.shape, X_valid_clean.shape

((16270, 104, 24), (2322, 104, 24))

In [4]:
Y_train = extract_y("Y_train.csv")
Y_valid = extract_y("Y_valid.csv")
Y_train.shape, Y_valid.shape

((16760,), (2394,))

In [41]:
X_train_attred, explained_var = attribute_reduction(X_train, n_components=50)
X_train_attred.shape, explained_var

((16760, 50, 24), 0.932844191622672)

In [47]:
X_train_attred_2d = X_train_attred.reshape((16760,-1))
X_train_attred_2d.shape

(16760, 1200)

In [48]:
X_train_timered = time_reduction(X_train, factor=24)
X_train_timered.shape

(16760, 104, 1)

In [49]:
X_train_timered_2d = X_train_timered.reshape((16760,-1))
X_train_timered_2d.shape

(16760, 104)

In [43]:
X_train_attred_timered = time_reduction(X_train_attred, factor=24)
X_train_attred_timered.shape

(16760, 50, 1)

In [45]:
X_train_attred_timered_2d = X_train_attred_timered.reshape((16760,-1))
X_train_attred_timered_2d.shape

(16760, 50)

In [51]:
X_train_timered_attred, explained_var = attribute_reduction(X_train_timered, n_components=50)
X_train_timered_attred.shape, explained_var

((16760, 50, 1), 0.9353544209126656)

In [52]:
X_train_timered_attred_2d = X_train_timered_attred.reshape((16760,-1))
X_train_timered_attred_2d.shape

(16760, 50)

In [53]:
X_train_timered_attred_2d #(16760, 50)

array([[-3.7167648 ,  1.51434426, -1.27844754, ...,  0.16391728,
         0.02418093,  0.16704015],
       [-5.06459685,  4.09033165,  1.21490575, ..., -0.13790436,
         0.39084452,  0.15356615],
       [ 3.81611134,  1.43622642, -0.21673833, ..., -0.11156567,
        -0.65812409,  0.2171959 ],
       ...,
       [ 0.1995888 , -1.69666611,  0.52136827, ..., -0.76009977,
        -0.05997844,  0.07980293],
       [ 0.87826728, -1.16592274, -0.63602197, ..., -0.06330783,
        -0.40680029,  0.19852433],
       [ 1.1008677 , -2.45473191,  2.89602819, ...,  0.47785689,
        -0.3898703 , -0.19430892]])

In [54]:
X_train_attred_timered_2d #(16760, 50)

array([[-3.34700763,  2.3045836 , -1.45101966, ...,  0.2811827 ,
        -0.19297459, -0.27443401],
       [-4.12580999,  4.57656669,  1.37092252, ...,  0.8868439 ,
        -0.42285247,  0.07366775],
       [ 4.13594537,  0.89170379,  0.01958342, ..., -0.61409808,
         0.07490726,  0.62663711],
       ...,
       [-0.14912002, -1.77340975,  0.7146281 , ...,  0.03880143,
         0.11131045, -0.36076324],
       [ 1.31102798, -0.80681703, -1.75649397, ..., -0.05046646,
         0.16655009, -0.5446653 ],
       [ 0.67164911, -2.98895675,  2.41128945, ...,  0.35644128,
        -0.56316309, -0.33986278]])

In [55]:
X_train_attred_2d #(16760, 1200)

array([[-4.098482  , -4.98655838, -4.52018059, ...,  0.2731897 ,
        -0.11043026, -0.48275764],
       [-5.04038331, -4.38769645, -4.17356526, ...,  0.13391565,
         0.14907133,  0.1565493 ],
       [ 5.33360001,  5.84861166,  5.49821287, ...,  0.65132909,
         0.67960077,  0.66847897],
       ...,
       [-0.56383084, -0.62747357, -0.72725797, ..., -0.58780428,
        -0.31260371, -0.29285985],
       [ 1.23891794,  1.14432611,  0.7492249 , ..., -0.84072832,
        -0.85482644, -0.87691826],
       [ 0.50652772,  0.42775387,  0.61347979, ...,  0.28939582,
         0.27502291,  0.28345536]])

In [56]:
X_train_timered_2d #(16760, 104)

array([[-0.25445971, -1.9798553 ,  0.        , ...,  1.0260044 ,
         0.82364104,  0.16006746],
       [-0.25659861, -0.25180589,  0.        , ...,  0.        ,
        -0.18158883,  0.        ],
       [-0.26943203,  0.        ,  0.        , ...,  0.        ,
        -0.20568015,  0.        ],
       ...,
       [-0.26087642,  1.04423117,  0.        , ..., -0.4429212 ,
        -0.7465692 ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.78140173,
        -0.58570134, -0.30376832],
       [-0.2437652 ,  0.        ,  0.        , ..., -0.60815335,
        -0.44911908,  0.        ]])

In [9]:
!pip install git+https://github.com/scikit-learn-contrib/py-earth@v0.2dev
from pyearth import Earth

Collecting git+https://github.com/scikit-learn-contrib/py-earth@v0.2dev
  Cloning https://github.com/scikit-learn-contrib/py-earth (to revision v0.2dev) to /tmp/.jk101/pip-req-build-2urxgavz
Building wheels for collected packages: sklearn-contrib-py-earth
  Building wheel for sklearn-contrib-py-earth (setup.py) ... [?25ldone
[?25h  Created wheel for sklearn-contrib-py-earth: filename=sklearn_contrib_py_earth-0.1.0+16.g400f84d-cp38-cp38-linux_x86_64.whl size=4495091 sha256=bd20d37eb30393dc50f1bfa52f47bd28baa2cf2c2277bda3a392524d8febc4eb
  Stored in directory: /tmp/.jk101/pip-ephem-wheel-cache-6tbf9lkv/wheels/8a/33/e4/b577aeaf3435222ffd76339bd728390afc103237af5cc8064e
Successfully built sklearn-contrib-py-earth
Installing collected packages: sklearn-contrib-py-earth
Successfully installed sklearn-contrib-py-earth-0.1.0+16.g400f84d


In [57]:
model_1 = Earth()
model_1.fit(X_train_attred_timered_2d, Y_train)

model_2 = Earth()
model_2.fit(X_train_timered_attred_2d, Y_train)

Earth()

In [79]:
print(model_1.summary())

Earth Model
-----------------------------------------
Basis Function      Pruned  Coefficient  
-----------------------------------------
(Intercept)         No      3.82601      
h(x25-0.0937084)    No      -0.250341    
h(0.0937084-x25)    No      0.25963      
h(x17-0.110517)     Yes     None         
h(0.110517-x17)     No      2.03311      
h(x38-0.0541234)    No      0.349453     
h(0.0541234-x38)    No      0.307441     
h(x13+2.59091)      No      0.108762     
h(-2.59091-x13)     Yes     None         
h(x15-1.34439)      No      -0.225901    
h(1.34439-x15)      No      -0.263648    
h(x26+0.0864369)    No      -0.204302    
h(-0.0864369-x26)   No      0.223622     
h(x41+0.247598)     No      0.322729     
h(-0.247598-x41)    Yes     None         
h(x8-0.181088)      No      -0.0855003   
h(0.181088-x8)      No      0.157341     
h(x46+0.00227089)   No      -0.193472    
h(-0.00227089-x46)  No      0.429014     
h(x24+1.95214)      No      -0.170665    
h(-1.95214-x24)     No

In [80]:
print(model_2.summary())

Earth Model
----------------------------------------
Basis Function     Pruned  Coefficient  
----------------------------------------
(Intercept)        No      6.63729      
h(x21-0.118297)    No      -0.306748    
h(0.118297-x21)    No      0.60596      
h(x18+0.645615)    No      0.3632       
h(-0.645615-x18)   Yes     None         
h(x0+8.04866)      No      -0.105312    
h(-8.04866-x0)     Yes     None         
h(x48+0.0346998)   No      1.28119      
h(-0.0346998-x48)  Yes     None         
h(x46+0.0484209)   Yes     None         
h(-0.0484209-x46)  No      1.35357      
h(x12-2.10592)     Yes     None         
h(2.10592-x12)     No      -0.194487    
h(x5+1.27713)      No      0.118603     
h(-1.27713-x5)     Yes     None         
h(x7-3.85833)      No      -0.883161    
h(3.85833-x7)      No      -0.13502     
h(x1-2.23553)      Yes     None         
h(2.23553-x1)      No      -0.0825737   
h(x46-0.411713)    Yes     None         
h(0.411713-x46)    No      -1.19616     
h(x3

In [58]:
X_valid_attred, explained_var = attribute_reduction(X_valid, n_components=50)
X_valid_attred.shape, explained_var

((2394, 50, 24), 0.9353223164123017)

In [59]:
X_valid_attred_timered = time_reduction(X_valid_attred, factor=24)
X_valid_attred_timered.shape

(2394, 50, 1)

In [63]:
X_valid_attred_timered_2d = X_valid_attred_timered.reshape((2394,-1))
X_valid_attred_timered_2d.shape

(2394, 50)

In [64]:
X_valid_timered = time_reduction(X_valid, factor=24)
X_valid_timered.shape

(2394, 104, 1)

In [65]:
X_valid_timered_attred, explained_var = attribute_reduction(X_valid_timered, n_components=50)
X_valid_timered_attred.shape, explained_var

((2394, 50, 1), 0.9428063203092165)

In [66]:
X_valid_timered_attred_2d = X_valid_timered_attred.reshape((2394,-1))
X_valid_timered_attred_2d.shape

(2394, 50)

In [72]:
Y_pred_valid_1 = model_1.predict(X_valid_attred_timered_2d)
Y_pred_valid_2 = model_2.predict(X_valid_timered_attred_2d)

In [83]:
from sklearn.metrics import r2_score
model_1_r2 = r2_score(Y_valid, Y_pred_valid_1)
model_2_r2 = r2_score(Y_valid, Y_pred_valid_2)
model_1_r2, model_2_r2

(-0.42151839494688415, -0.19997696342137927)

In [75]:
Y_valid

array([1.67847222, 5.90171296, 3.54657407, ..., 9.14388889, 6.12417824,
       3.81853009])

In [74]:
Y_pred_valid_1

array([4.2481763 , 4.02596773, 3.70813788, ..., 4.23503787, 5.52226533,
       3.33064435])

In [76]:
Y_pred_valid_2

array([4.12816706, 4.71352202, 2.9411419 , ..., 2.8190631 , 2.7054267 ,
       4.61445224])

In [85]:
model_3 = Earth()
model_3.fit(X_train_attred_2d, Y_train)

model_4 = Earth()
model_4.fit(X_train_timered_2d, Y_train)

Earth()

In [86]:
print(model_3.summary())

Earth Model
------------------------------------------
Basis Function       Pruned  Coefficient  
------------------------------------------
(Intercept)          No      6.01862      
h(x623-0.174826)     No      -0.188049    
h(0.174826-x623)     No      0.269266     
h(x454-3.03418)      No      1.10725      
h(3.03418-x454)      No      -0.169514    
h(x215-0.457798)     No      -0.0781365   
h(0.457798-x215)     No      0.246628     
h(x1007+0.1148)      No      0.367032     
h(-0.1148-x1007)     No      -0.256112    
h(x1109+0.0676292)   Yes     None         
h(-0.0676292-x1109)  No      0.515741     
h(x569-0.406555)     Yes     None         
h(0.406555-x569)     No      0.193721     
h(x380-1.07977)      No      -0.218557    
h(1.07977-x380)      No      -0.233673    
h(x431-0.0404379)    Yes     None         
h(0.0404379-x431)    No      1.14603      
h(x599+1.55043)      No      -0.149663    
h(-1.55043-x599)     No      -1.22772     
h(x958+0.026217)     No      0.326897     

In [87]:
print(model_4.summary())

Earth Model
-----------------------------------------
Basis Function      Pruned  Coefficient  
-----------------------------------------
(Intercept)         No      7.62751      
h(x36-0.260185)     No      1.92768      
h(0.260185-x36)     No      -0.864811    
h(x72+0.0202196)    No      1.90652      
h(-0.0202196-x72)   No      0.469664     
h(x11-0.105422)     No      0.0671171    
h(0.105422-x11)     No      -0.336531    
h(x63-0.00219819)   Yes     None         
h(0.00219819-x63)   No      0.320706     
x80                 No      0.345292     
h(x72-0.271872)     No      -1.99825     
h(0.271872-x72)     Yes     None         
h(x87+0.127387)     No      0.210114     
h(-0.127387-x87)    No      0.156416     
h(x36+1.44115)      No      -1.62483     
h(-1.44115-x36)     Yes     None         
h(x94-0.0650407)    No      0.569798     
h(0.0650407-x94)    Yes     None         
h(x65-2.17499)      No      -6.75285     
h(2.17499-x65)      No      -0.281892    
h(x94+0.607522)     No

In [88]:
X_valid_attred, explained_var = attribute_reduction(X_valid, n_components=50)
X_valid_attred.shape, explained_var

((2394, 50, 24), 0.9353063062047959)

In [89]:
X_valid_attred_2d = X_valid_attred.reshape((2394,-1))
X_valid_attred_2d.shape

(2394, 1200)

In [90]:
X_valid_timered = time_reduction(X_valid, factor=24)
X_valid_timered.shape

(2394, 104, 1)

In [91]:
X_valid_timered_2d = X_valid_timered.reshape((2394,-1))
X_valid_timered_2d.shape

(2394, 104)

In [92]:
Y_pred_valid_3 = model_3.predict(X_valid_attred_2d)
Y_pred_valid_4 = model_4.predict(X_valid_timered_2d)

In [93]:
from sklearn.metrics import r2_score
model_3_r2 = r2_score(Y_valid, Y_pred_valid_3)
model_4_r2 = r2_score(Y_valid, Y_pred_valid_4)
model_3_r2, model_4_r2

(-0.16864823826425712, 0.1543887423371778)

In [94]:
Y_pred_valid_3

array([4.61770305, 4.11566832, 3.76346109, ..., 3.93101235, 3.67747942,
       4.02765402])

In [96]:
Y_pred_valid_4

array([3.36504249, 4.1161113 , 3.64608449, ..., 4.74264511, 3.03255754,
       3.02354274])

In [97]:
X_train_2d = X_train.reshape((16760,-1))
X_train_2d.shape

(16760, 2496)

In [98]:
full_model = Earth()
full_model.fit(X_train_2d, Y_train)

Earth()

In [99]:
print(full_model.summary())

Earth Model
------------------------------------------
Basis Function       Pruned  Coefficient  
------------------------------------------
(Intercept)          No      3.78486      
x887                 No      -0.533202    
h(x1512+0.0345025)   No      0.0752644    
h(-0.0345025-x1512)  No      0.48439      
h(x287-5.56036)      Yes     None         
h(5.56036-x287)      No      -0.516526    
h(x312-0.130466)     Yes     None         
h(0.130466-x312)     No      5.67042      
h(x1775-5.78469)     Yes     None         
h(5.78469-x1775)     No      -0.216875    
h(x2256-0.442305)    No      0.140769     
h(0.442305-x2256)    No      0.330827     
x1943                No      0.283736     
h(x2230-2.94535)     No      1.81885      
h(2.94535-x2230)     No      -0.118918    
x1567                No      0.20818      
h(x264-6.209)        Yes     None         
h(6.209-x264)        No      0.334525     
x1103                No      -0.236371    
h(x1127-6.90605)     No      1.8568       

In [100]:
X_valid_2d = X_valid.reshape((2394,-1))
X_valid_2d.shape

(2394, 2496)

In [102]:
Y_pred_full = full_model.predict(X_valid_2d)

In [103]:
from sklearn.metrics import r2_score
full_model_r2 = r2_score(Y_valid, Y_pred_full)
full_model_r2

0.13712012727545042

In [104]:
Y_pred_full

array([3.05595024, 4.94277173, 3.62239832, ..., 4.58804563, 3.24632155,
       3.30430503])