In [1]:
import pandas as pd
import numpy as np
from netCDF4 import Dataset, num2date
import nctoolkit as nc
import matplotlib.pyplot as plt


nctoolkit is using Climate Data Operators version 2.3.0


In [2]:
from plot_functions import data_manipulation
#data_path  = '/Users/charliecameron/CodingHub/Uni/Data_driven_processes/Data_modelling/data_sets/data_aus.nc'
data_path = '/Users/charliecameron/CodingHub/Uni/Data_driven_processes/Data_modelling/rainfall_software/DATA/data_alice_springs.nc'
#Now we can create an instance using the data_manipulation class
data_class = data_manipulation(data_path)


In [3]:
print(data_class.metadata['lon'])

print(data_class.metadata['lat'])

data = data_class.data

[133.  133.1 133.2 133.3 133.4 133.5 133.6 133.7 133.8 133.9 134. ]
[-23.  -23.1 -23.2 -23.3 -23.4 -23.5 -23.6 -23.7 -23.8 -23.9 -24. ]


In [7]:
rainfall_data = data_class.data.variables['tp'][:]


print(rainfall_data.shape)


(876, 11, 11)


In [9]:
data_class.data.variables['latitude'][:]

masked_array(data=[-23. , -23.1, -23.2, -23.3, -23.4, -23.5, -23.6, -23.7,
                   -23.8, -23.9, -24. ],
             mask=False,
       fill_value=1e+20,
            dtype=float32)

In [16]:
data_class.metadata['dates']

masked_array(data=[cftime.DatetimeGregorian(1950, 1, 1, 0, 0, 0, 0, has_year_zero=False),
                   cftime.DatetimeGregorian(1950, 2, 1, 0, 0, 0, 0, has_year_zero=False),
                   cftime.DatetimeGregorian(1950, 3, 1, 0, 0, 0, 0, has_year_zero=False),
                   cftime.DatetimeGregorian(1950, 4, 1, 0, 0, 0, 0, has_year_zero=False),
                   cftime.DatetimeGregorian(1950, 5, 1, 0, 0, 0, 0, has_year_zero=False),
                   cftime.DatetimeGregorian(1950, 6, 1, 0, 0, 0, 0, has_year_zero=False),
                   cftime.DatetimeGregorian(1950, 7, 1, 0, 0, 0, 0, has_year_zero=False),
                   cftime.DatetimeGregorian(1950, 8, 1, 0, 0, 0, 0, has_year_zero=False),
                   cftime.DatetimeGregorian(1950, 9, 1, 0, 0, 0, 0, has_year_zero=False),
                   cftime.DatetimeGregorian(1950, 10, 1, 0, 0, 0, 0, has_year_zero=False),
                   cftime.DatetimeGregorian(1950, 11, 1, 0, 0, 0, 0, has_year_zero=False),
        

In [82]:
big_df_list = []
from datetime import datetime

big_df = pd.DataFrame(columns=['tp', 'date', 'lat', 'lon'])  # Ensure big_df is initialized

for i in range(11):
    for j in range(11):             
        temp_df = pd.DataFrame({
            'tp': rainfall_data[:,i,j],
            'date': data_class.metadata['dates'],
            'lat': data_class.data.variables['latitude'][i],
            'lon': data_class.data.variables['longitude'][j]
        })
        big_df_list.append(temp_df)  # Append temp_df to big_df

big_df = pd.concat(big_df_list,ignore_index=True)

big_df.head()
big_df['month'] = pd.to_datetime(big_df['date'].astype('str')).dt.month
big_df['year'] = pd.to_datetime(big_df['date'].astype('str')).dt.year

In [132]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

x_cols = ['year','month','lon','lat']
y_cols = ['tp']

X_data = big_df[x_cols]
Y_data = big_df[y_cols]


x_train, x_test, y_train, y_test = train_test_split(X_data,Y_data,test_size=0.2,random_state=0,shuffle = True)


In [133]:
model_1 = RandomForestRegressor(n_estimators=100,random_state=0)
model_1.fit(x_train,y_train)

A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


In [141]:
from sklearn.metrics import mean_squared_error
y_pred = model_1.predict(x_test)

# Calculate the mean absolute error
mse = mean_squared_error(y_test, y_pred)

print(f"Mean Absolute Error: {mse}")

print(mse/np.mean(y_test))
print(np.mean(y_test))

Mean Absolute Error: 2.6898173864028544e-09
3.4612220947504e-06
0.0007771293816951166


In [142]:
fi = model_1.feature_importances_
#The impurity-based feature importances
print(fi)
big_df.head()

[0.69397417 0.25938019 0.01619574 0.03044989]


Unnamed: 0,tp,date,lat,lon,month,year,Cyclic
0,0.0001758273,1950-01-01 00:00:00,-23.0,133.0,1,1950,0.8660254
1,0.001707346,1950-02-01 00:00:00,-23.0,133.0,2,1950,0.5
2,0.000378211,1950-03-01 00:00:00,-23.0,133.0,3,1950,6.123234000000001e-17
3,9.355544e-07,1950-04-01 00:00:00,-23.0,133.0,4,1950,-0.5
4,0.0008622825,1950-05-01 00:00:00,-23.0,133.0,5,1950,-0.8660254


In [143]:
df_cyclic = big_df

df_cyclic['Cyclic'] = df_cyclic['month'].apply(lambda x: np.cos(2*np.pi*x/12))

In [148]:
x_cols = ['Cyclic','year','lon','lat']
y_cols = ['tp']

X_data = big_df[x_cols]
Y_data = big_df[y_cols]


x_train, x_test, y_train, y_test = train_test_split(X_data,Y_data,test_size=0.2,random_state=0,shuffle = True)

In [149]:
model_2 = RandomForestRegressor(n_estimators=100,random_state=0)
model_2.fit(x_train,y_train)

A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


In [150]:
f = model_2.feature_importances_
print(f)
#The impurity-based feature importances

[0.21857229 0.48387114 0.14662048 0.15093608]


In [151]:
y_pred = model_2.predict(x_test)

# Calculate the mean absolute error
mse = mean_squared_error(y_test, y_pred)

print(f"Mean Absolute Error: {mse}")

print(mse/np.mean(y_test))
print(np.mean(y_test))

Mean Absolute Error: 1.6425604905516207e-06
0.0021136255162155612
0.0007771293816951166
