In [1]:
# external imports
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import PolynomialFeatures
from tqdm.notebook import tqdm


# internal imports
import sys
sys.path.insert(0,'../')
from data_utils import add_deltas_and_time, scale_data, column_combinations

# Polynomial Regression Testing

In [2]:
# Load the raw table; it holds the predictors together with the 'LH' target column.
data_850 = pd.read_csv('../data_table_NA850_nonfiltered_updated.csv')

# Feature matrix: everything except the target, passed through the project
# helpers in order (add_deltas_and_time first, then scale_data).
X = scale_data(
    add_deltas_and_time(
        data_850.drop(columns=['LH'])
    )
)

# Regression target, taken untouched from the raw table.
y = data_850['LH']

# Preview the first rows of the prepared features.
X.head()

Unnamed: 0_level_0,AirTemp,QV,Omega,SeaLevPress,UWinds,VWinds,Lat,Lon,dAirTemp,dQV,dOmega,dSeaLevPress,dUWinds,dVWinds,dLat,dLon
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2000-01-01 00:00:00,-5.804057,-0.756024,0.70529,2.356635,-0.700062,-1.490048,1.131845,-0.118104,0.040359,0.003936,0.001865,-0.030874,0.021191,0.000609,0.04125,1.109128
2000-01-01 03:00:00,-5.780483,-0.713789,0.040104,2.218343,-0.580373,-0.932716,1.071432,-0.196542,0.108318,0.215155,-0.827905,-0.238713,0.408655,1.362894,-1.049213,-1.191825
2000-01-01 06:00:00,-5.799631,-0.742042,0.479295,1.993975,-0.403095,-0.620279,1.088863,-0.25868,-0.014841,-0.137358,0.549724,-0.368077,0.595083,0.764298,0.355878,-0.713664
2000-01-01 09:00:00,-5.643576,-0.797098,0.837784,2.075025,-0.338748,-0.606872,1.034351,-0.307337,0.49024,-0.271397,0.449055,0.090936,0.229495,0.033379,-0.942694,-0.318199
2000-01-01 12:00:00,-5.482052,-0.742448,0.514822,2.610151,-0.44177,-0.411291,1.059879,-0.378347,0.506004,0.277238,-0.401007,0.773369,-0.312314,0.478666,0.502027,-0.973919


In [3]:
# Sweep polynomial degree x feature subset, fitting one Ridge model per pair
# and recording train/test MSE and r^2 for each.
poly_reg = Ridge(random_state=1)  # Ridge -> L2 regularization
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

# initialize metric list; each entry is
# [degree, comb, train MSE, test MSE, train r^2, test r^2]
log_csv = []

# test over degrees 1-4
degrees = [1, 2, 3, 4]
for degree in degrees:
    # test over all combinations of a maximum length
    for comb in tqdm(column_combinations(X.columns, max_len=6), desc=f"Degree {degree}"):
        # polynomial transform train and test inputs:
        # fit the transformer on the training split only, then reuse it on
        # the held-out split so nothing is ever fitted on test data
        poly = PolynomialFeatures(degree)
        X_train_poly = poly.fit_transform(X_train[comb])
        X_test_poly = poly.transform(X_test[comb])

        # fit and predict train and test
        poly_reg.fit(X_train_poly, y_train)
        y_pred_train = poly_reg.predict(X_train_poly)
        y_pred_test = poly_reg.predict(X_test_poly)

        # log training and testing metrics to list
        log_csv.append(
            [degree, comb,
            mean_squared_error(y_train, y_pred_train),  # train MSE
            mean_squared_error(y_test, y_pred_test),  # test MSE
            r2_score(y_train, y_pred_train),  # train r^2
            r2_score(y_test, y_pred_test)]  # test r^2
        )

Degree 1:   0%|          | 0/14892 [00:00<?, ?it/s]

Degree 2:   0%|          | 0/14892 [00:00<?, ?it/s]

Degree 3:   0%|          | 0/14892 [00:00<?, ?it/s]

Degree 4:   0%|          | 0/14892 [00:00<?, ?it/s]

  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, 

In [4]:
def _last_metric(row):
    """Return the final entry of a log row, which is exported below as "test_r^2"."""
    return row[-1]


# Rank a copy of the log so the highest testing r^2 comes first;
# the original log_csv list is left in its insertion order.
log_csv_sorted = list(log_csv)
log_csv_sorted.sort(key=_last_metric, reverse=True)

# Column labels, in the same order as the entries of each log row.
result_columns = [
    "degree",
    "feature_combination",
    "train_MSE",
    "test_MSE",
    "train_r^2",
    "test_r^2",
]

# Convert to a dataframe and write it out as a pipe-delimited file without the index.
log_df = pd.DataFrame(log_csv_sorted, columns=result_columns)
log_df.to_csv("polyreg_results.csv", sep="|", index=False)