In [1]:
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import xarray as xr
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, explained_variance_score, r2_score
from sklearn.ensemble import RandomForestRegressor
from math import sqrt
from flaml import AutoML
import xgboost as xgb
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline

# Notebook display configuration.
# Use xarray's HTML repr when Dataset/DataArray objects are displayed.
xr.set_options(display_style='html')
# Render matplotlib figures inline in the cell output.
%matplotlib inline
# Request the 'retina' figure format from the inline backend.
%config InlineBackend.figure_format = 'retina' 

In [2]:
# Path to the NetCDF file holding the combined dataset (relative to the
# notebook's working directory).
file_path = 'combined_dataset.nc'

# load_dataset() reads the whole file into memory and closes the underlying
# file handle straight away. open_dataset() would keep the file open for the
# lifetime of the kernel; since the next step converts everything to a
# DataFrame anyway, nothing is gained from lazy loading here.
combined_dataset = xr.load_dataset(file_path)

Cannot find the ecCodes library


In [3]:
# Flatten the Dataset into a table, then move the index levels produced by
# to_dataframe() into ordinary columns.
df = combined_dataset.to_dataframe()
df = df.reset_index()

In [4]:
# Keep only rows in which every column has a value (drop a row if any field is missing).
clean_df = df.dropna(axis='index', how='any')

In [5]:
# Continue the analysis on the NaN-free frame.
# NOTE: this only rebinds the name -- `df` and `clean_df` now refer to the
# same DataFrame object, so columns assigned to `df` in later cells also
# show up on `clean_df`.
df = clean_df

In [6]:
# Normalise the time column: stringify each value, then parse the strings
# into pandas datetimes so the `.dt` accessor can be used afterwards.
# NOTE(review): the string round-trip presumably exists because the raw
# values are not native datetimes -- confirm against the source file.
df['time'] = pd.to_datetime(df['time'].astype(str))

In [7]:
# Derive calendar features from the timestamp; each becomes a column named
# after the datetime attribute it is read from (added in this order).
time_accessor = df['time'].dt
for calendar_part in ('year', 'month', 'dayofyear'):
    df[calendar_part] = getattr(time_accessor, calendar_part)

In [8]:
# Wind speed as the Euclidean magnitude of the two wind components.
u_component = df['UBOT']
v_component = df['VBOT']
df['WindSpeed'] = np.sqrt(u_component**2 + v_component**2)

In [9]:
# Water-vapor content feature: QBOT * P / (R * TREFHT).
# NOTE(review): P looks like standard sea-level pressure in Pa and R like the
# specific gas constant of dry air in J/(kg*K) -- confirm the intended units.
# A single constant pressure is applied to every row.
P = 101325
R = 287.05
vapor_numerator = df['QBOT'] * P
vapor_denominator = R * df['TREFHT']
df['WaterVapor'] = vapor_numerator / vapor_denominator

In [10]:
# Heat-flux style feature: rho * Cp * TREFHT * WindSpeed.
# NOTE(review): rho and Cp look like sea-level air density (kg/m^3) and the
# specific heat of air (J/(kg*K)) -- confirm the intended units.
rho = 1.225
Cp = 1005
rho_times_cp = rho * Cp
df['HeatFlux'] = rho_times_cp * df['TREFHT'] * df['WindSpeed']

In [11]:
# Chronological split on the calendar year:
#   train -> years before 2040
#   valid -> 2040 up to (not including) 2050
#   test  -> 2050 and later
year_col = df['year']
is_train = year_col < 2040
is_valid = (year_col >= 2040) & (year_col < 2050)
is_test = year_col >= 2050

train = df.loc[is_train]
valid = df.loc[is_valid]
test = df.loc[is_test]

In [12]:
# Build the feature matrix (X_*) and target vector (y_*) for each split.
# The target itself plus the 'source' and 'time' columns are excluded from
# the features; every other column is kept.
target_col = 'TREFMXAV_U'
non_feature_cols = [target_col, 'source', 'time']

X_train, y_train = train.drop(columns=non_feature_cols), train[target_col]
X_valid, y_valid = valid.drop(columns=non_feature_cols), valid[target_col]
X_test, y_test = test.drop(columns=non_feature_cols), test[target_col]

In [13]:
# Linear baseline: standardise the features, then fit ordinary least squares.
scaler = StandardScaler()
linear_regressor = LinearRegression()
pipeline = make_pipeline(scaler, linear_regressor)

In [14]:
# Fit the scaler + linear model on the training split only.
pipeline.fit(X_train, y_train)

# Score on the held-out test split.
y_pred_lin = pipeline.predict(X_test)

# RMSE = square root of the mean squared error.
mse_lin = mean_squared_error(y_test, y_pred_lin)
rmse_lin = np.sqrt(mse_lin)
print("Linear Regression RMSE:", rmse_lin)

Linear Regression RMSE: 1.2882475166898701


In [None]:
# Gradient-boosted baseline, trained directly on the feature columns
# (no scaling step) with a fixed seed.
xgb_params = dict(
    objective='reg:squarederror',
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
)
xgb_model = xgb.XGBRegressor(**xgb_params)
xgb_model.fit(X_train, y_train)

# Score on the held-out test split.
y_pred_xgb = xgb_model.predict(X_test)

# RMSE = square root of the mean squared error.
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
rmse_xgb = np.sqrt(mse_xgb)
print("XGBoost RMSE:", rmse_xgb)

In [None]:
# Print both test-set RMSE values one after the other for comparison.
print("Linear Regression RMSE vs XGBoost RMSE")
for rmse_label, rmse_value in (
    ("Linear Regression RMSE:", rmse_lin),
    ("XGBoost RMSE:", rmse_xgb),
):
    print(rmse_label, rmse_value)