In [45]:
# Importing essential libraries.
from data_imports import *
from data_clean import data

In [46]:
# Importing machine-learning libraries.
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [47]:
# Build the cleaned dataset by calling data() from the data_clean module.
# The cells below index the result by column label and call dropna() on it.
clean_df = data()

In [48]:
# Split the cleaned data on whether an O3 reading is present:
# rows with a value go to df_not_missing, rows without one go to df_missing.
df_not_missing = clean_df.dropna(subset=['O3_ppbV'])
df_missing = clean_df[clean_df['O3_ppbV'].isna()]

In [49]:
# The O3 column is the target; every remaining column is used as a feature.
y = df_not_missing['O3_ppbV']
X = df_not_missing.drop(['O3_ppbV'], axis='columns')

In [50]:
# Hold out 20% of the rows for testing; random_state is fixed so the split
# is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [51]:
# Fit a histogram-based gradient boosting regressor on the training split,
# with random_state fixed at 1.
# Note: despite its name, `clf` holds a regressor, not a classifier.
clf = HistGradientBoostingRegressor(random_state = 1)
clf.fit(X_train, y_train)

In [52]:
# Predict the target for the held-out test features.
y_pred = clf.predict(X_test)

In [53]:
# Score the test-set predictions: R2, MAE, MSE, and RMSE (square root of MSE).
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5

In [54]:
# Report the four metrics, one per line.
print(
    f'MSE: {mse}',
    f'RMSE: {rmse}',
    f'MAE: {mae}',
    f'R2: {r2}',
    sep='\n',
)

MSE: 12.02466727469259
RMSE: 3.46766020173439
MAE: 2.6549757855609784
R2: 0.7795779725585423


In [55]:
def HGBR(input_data, var):
    """Fit a HistGradientBoostingRegressor for `var` and print hold-out metrics.

    Rows where `var` is null are dropped. The remaining rows are split 80/20
    into train and test subsets (random_state=1), a regressor with
    random_state=1 is fitted on the training part, and MSE, RMSE, MAE and R2
    computed on the test part are printed.

    Parameters
    ----------
    input_data : expected to be a pandas DataFrame (the function calls
        ``dropna`` and ``drop`` on it and indexes it by column label).
    var : label of the target column. Every other column is used as a feature.

    Returns
    -------
    None. The metrics are only printed.
    """
    # Only rows with an observed target can be used for supervised training.
    observed = input_data.dropna(subset = [var])

    # Features are all columns except the target.
    features = observed.drop(columns = [var])
    target = observed[var]

    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size = 0.2, random_state = 1
    )

    model = HistGradientBoostingRegressor(random_state = 1)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    rmse = mse ** 0.5  # root of the mean squared error
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f'MSE: {mse}')
    print(f'RMSE: {rmse}')
    print(f'MAE: {mae}')
    print(f'R2: {r2}')

In [57]:
# Run the same workflow through the HGBR helper for the O3 target.
# NOTE(review): the metrics printed by this call differ from those printed by
# the step-by-step cells, although the target column, split and seed look the
# same, and the execution count skips 56. Confirm clean_df was not changed in
# between (Restart Kernel -> Run All).
HGBR(clean_df, 'O3_ppbV')

MSE: 43.405867019384075
RMSE: 6.5883129114655805
MAE: 4.542157812887695
R2: 0.8084851603912986
