Start with exploratory data analysis (EDA) of the dataset.

In [None]:
import netCDF4

file_path = 'Intern Dataset/Data/1-Abha-Aseer-Crude.nc'
# Name of the data variable inspected in detail below.
variable_name = '1-Abha-Aseer-Crude'
# Index along the variable's first axis of the slice printed as a sample.
sample_index = 366

with netCDF4.Dataset(file_path, 'r') as dataset:
    # List every variable stored in the file.
    variables = dataset.variables.keys()
    print("Variables in the NetCDF file:")
    for var in variables:
        print(var)
    if variable_name in dataset.variables:
        var = dataset.variables[variable_name]
        print(f"\nDetails of '{variable_name}':")
        print(f"  Dimensions: {var.dimensions}")
        print(f"  Shape: {var.shape}")
        print(f"  Data type: {var.dtype}")
        # The label previously claimed "first few elements" although a single
        # slice at a fixed index is printed. Label it accurately, and guard the
        # index so a shorter variable does not abort the cell with an IndexError.
        if var.shape and var.shape[0] > sample_index:
            print(f"  Sample data (slice at index {sample_index}): {var[sample_index]}")
        else:
            print(f"  Sample data: index {sample_index} is out of range for shape {var.shape}")
  

In [None]:
import netCDF4

# Show the metadata of the `time` variable of the sample file.
file_path = 'Intern Dataset/Data/1-Abha-Aseer-Crude.nc'

with netCDF4.Dataset(file_path, mode = 'r') as dataset:
    variables = dataset.variables
    time_var = variables['time']
    # One call, newline-separated: same output as two consecutive prints.
    print("Time variable:", time_var, sep = "\n")

In [None]:
import os

# Count the NetCDF files (names ending in '.nc') directly inside the data directory.
directory = 'Intern Dataset/Data'
files = os.listdir(directory)
netcdf_files = list(filter(lambda name: name.endswith('.nc'), files))
total_files = len(netcdf_files)

print(f"Total number of NetCDF files: {total_files}")


In [None]:
import os
import shutil
import random

# NOTE(review): earlier cells read from 'Intern Dataset/Data'; confirm that the
# nested 'Data/Data' folder is the intended source here.
directory = 'Intern Dataset/Data/Data'
train_dir = 'Intern Dataset/train'
valid_dir = 'Intern Dataset/valid'
test_dir = 'Intern Dataset/test'

# Fixed seed so that re-running the split assigns the same files to each subset.
split_seed = 42

os.makedirs(train_dir, exist_ok = True)
os.makedirs(valid_dir, exist_ok = True)
os.makedirs(test_dir, exist_ok = True)

files = os.listdir(directory)
# os.listdir returns names in arbitrary order, so sort first; otherwise even a
# seeded shuffle would not be reproducible across machines or file systems.
netcdf_files = sorted(f for f in files if f.endswith('.nc'))

# A dedicated generator keeps the global `random` state untouched.
random.Random(split_seed).shuffle(netcdf_files)

# 65% train, 15% validation, remainder (about 20%) test.
total_files = len(netcdf_files)
train_size = int(total_files * 0.65)
valid_size = int(total_files * 0.15)

train_files = netcdf_files[:train_size]
valid_files = netcdf_files[train_size:train_size + valid_size]
test_files = netcdf_files[train_size + valid_size:]

def move_files(file_list, target_directory):
    """Move every file named in `file_list` into `target_directory`.

    The source location is the module-level `directory`; each file keeps its
    name in the destination folder.
    """
    for name in file_list:
        source = os.path.join(directory, name)
        destination = os.path.join(target_directory, name)
        shutil.move(source, destination)

# Move each subset into its folder, then report how many files went where.
for split_files, split_dir in (
    (train_files, train_dir),
    (valid_files, valid_dir),
    (test_files, test_dir),
):
    move_files(split_files, split_dir)

print(
    f"Moved {len(train_files)} files to train.",
    f"Moved {len(valid_files)} files to valid.",
    f"Moved {len(test_files)} files to test.",
    sep = "\n",
)


In [None]:
import numpy as np
from netCDF4 import Dataset as NetCDFDataset
import os

class CustomInput:
    """Day-indexed reader that yields normalised model inputs from one NetCDF file.

    The file is opened read-only and the variable whose name equals the file's
    base name (without a trailing ``.nc``) is used as the data source. For a day
    ``idx`` the first axis of that variable is read at ``idx``, ``idx + 366``,
    ``idx + 732`` and ``idx + 1098`` to form four channels, and at the fixed
    index ``1464`` for the emission field. Going by the local names these are
    presumably XCO2, NO2, u-wind, v-wind and emissions - TODO confirm against
    the dataset description.
    """

    # Number of day indices served per file (length of each channel segment).
    NUM_DAYS = 366
    # Number of stacked input channels.
    NUM_CHANNELS = 4
    # Position of the emission field on the first axis (NUM_CHANNELS * NUM_DAYS).
    EMISSION_INDEX = 1464

    def __init__(self, file_path):
        """Open `file_path` and bind the variable named after the file."""
        self.ncfile = NetCDFDataset(file_path, 'r')
        # Strip only a trailing '.nc'. str.replace would also remove '.nc'
        # occurring in the middle of a name (e.g. 'site.ncep.nc').
        base_name = os.path.basename(file_path)
        if base_name.endswith('.nc'):
            base_name = base_name[:-len('.nc')]
        self.variable_name = base_name
        self.data = self.ncfile.variables[self.variable_name]

    def __len__(self):
        """Return the number of day indices (always 366)."""
        return self.NUM_DAYS

    def __getitem__(self, idx):
        """Return ``(inputs, output)`` for day `idx`.

        ``inputs`` stacks the four channel slices on a new last axis and
        min-max scales every channel to [0, 1] over the two leading axes.
        ``output`` is the mean of the emission slice as ``np.float32``; it is
        the same for every day of a file.

        Raises:
            IndexError: if `idx` is outside ``[0, 366)``. The block used to fall
                through and return None, which only surfaced later as an
                unpacking error in the caller.
        """
        if not 0 <= idx < self.NUM_DAYS:
            raise IndexError(f"day index {idx} is out of range [0, {self.NUM_DAYS})")

        # Channel k of day idx lives at idx + k * NUM_DAYS.
        channels = [self.data[idx + k * self.NUM_DAYS] for k in range(self.NUM_CHANNELS)]
        emiss_arr = self.data[self.EMISSION_INDEX]
        mean_emiss = np.mean(emiss_arr)
        # Same float32 conversion as before, without materialising 366 copies.
        output = np.full((1,), mean_emiss, dtype = np.float32)[0]

        inputs = np.stack(channels, axis = -1)
        min_val = inputs.min(axis = (0, 1), keepdims = True)
        max_val = inputs.max(axis = (0, 1), keepdims = True)
        range_val = max_val - min_val
        # Constant channels would divide by zero; map them to all zeros instead.
        range_val[range_val == 0] = 1
        inputs = (inputs - min_val) / range_val

        return inputs, output

    def __del__(self):
        """Close the file and drop the references held by this object."""
        # __del__ also runs when __init__ failed half-way (e.g. missing file),
        # in which case the attributes do not exist; never raise from here.
        ncfile = self.__dict__.pop('ncfile', None)
        self.__dict__.pop('data', None)
        if ncfile is not None:
            ncfile.close()
    
def load_datasets(file_paths):
    """Collect the per-day input arrays of every file into one array.

    A sample is kept only when its array has shape (64, 64, 4) and the
    accompanying output value is non-zero.

    NOTE(review): `load_outputs` keeps every sample, so any sample filtered out
    here would leave inputs and outputs with different lengths - confirm.
    """
    kept = []
    for path in file_paths:
        source = CustomInput(path)
        for day in range(len(source)):
            sample, target = source[day]
            if sample is None:
                continue
            if sample.shape != (64, 64, 4):
                continue
            if not (target != 0):
                continue
            kept.append(sample)
        # Drop the last reference so the reader releases its file handle.
        del source
    return np.array(kept)

# Build the input array for one split at a time and save it as a .npy file.
# Only the validation split is active; the train/test variants are kept as
# commented lines and are enabled by hand when those splits are processed.
train_set = 'Intern Dataset/train'
valid_set = 'Intern Dataset/valid'
test_set = 'Intern Dataset/test'

# Full paths of the NetCDF files of each split.
#train_files = [os.path.join(train_set, f) for f in os.listdir(train_set) if f.endswith('.nc')]
valid_files = [os.path.join(valid_set, f) for f in os.listdir(valid_set) if f.endswith('.nc')]
#test_files = [os.path.join(test_set, f) for f in os.listdir(test_set) if f.endswith('.nc')]

# Stack all kept per-day samples of the split into one array.
#train_inputs = load_datasets(train_files)
valid_inputs = load_datasets(valid_files)
#test_inputs = load_datasets(test_files)

# The trailing comments record the shapes seen in an earlier run.
#print(f"Shape of train_inputs: {train_inputs.shape}") # (15738, 64, 64, 4)
print(f"Shape of valid_inputs: {valid_inputs.shape}") # (3660, 64, 64, 4)
#print(f"Shape of test_inputs: {test_inputs.shape}") # (4392, 64, 64, 4)

# Written to the current working directory; later cells read these files back.
#np.save('train_inputs.npy', train_inputs)
np.save('valid_inputs.npy', valid_inputs)
#np.save('test_inputs.npy', test_inputs)
#np.save('test_inputs.npy', test_inputs)


In [None]:
import numpy as np
from netCDF4 import Dataset as NetCDFDataset
import os

class CustomOutput:
    """Day-indexed reader that yields the target value of one NetCDF file.

    The file is opened read-only and the variable named after the file (base
    name with '.nc' removed) is used. The target is the mean of the slice at
    index 1464 of that variable's first axis, so every day of a file yields the
    same value.
    """

    # Number of day indices served per file.
    NUM_DAYS = 366
    # Position of the emission field on the first axis of the variable.
    EMISSION_INDEX = 1464

    def __init__(self, file_path):
        """Open `file_path` and bind the variable named after the file."""
        self.ncfile = NetCDFDataset(file_path, 'r')
        self.variable_name = os.path.basename(file_path).replace('.nc', '')
        self.data = self.ncfile.variables[self.variable_name]

    def __len__(self):
        """Return the number of day indices (always 366)."""
        return self.NUM_DAYS

    def __getitem__(self, idx):
        """Return the mean of the emission slice for day `idx`.

        Raises:
            IndexError: if `idx` is outside ``[0, 366)``; the block used to
                return None silently in that case.
        """
        if not 0 <= idx < self.NUM_DAYS:
            raise IndexError(f"day index {idx} is out of range [0, {self.NUM_DAYS})")
        emiss_arr = self.data[self.EMISSION_INDEX]
        # The leftover debug print of this value (366 identical lines per file)
        # was removed; callers receive it as the return value.
        return np.mean(emiss_arr)

    def __del__(self):
        """Close the file and drop the references held by this object."""
        # __del__ also runs when __init__ failed half-way, in which case the
        # attributes do not exist; never raise from here.
        ncfile = self.__dict__.pop('ncfile', None)
        self.__dict__.pop('data', None)
        if ncfile is not None:
            ncfile.close()


def load_outputs(file_paths):
    """Collect the per-day target value of every file into a 1-D array.

    Every day index of every file contributes one entry; nothing is filtered.
    """
    collected = []
    for path in file_paths:
        source = CustomOutput(path)
        collected.extend(source[day] for day in range(len(source)))
        # Drop the last reference so the reader releases its file handle.
        del source
    return np.array(collected)

# Build the target array for one split at a time. Only the test split is
# active; the train/validation variants are kept as commented lines and are
# enabled by hand when those splits are processed.
#train_output_set = 'Intern Dataset/train'
#valid_output_set = 'Intern Dataset/valid'
test_output_set = 'Intern Dataset/test'

# Full paths of the NetCDF files of each split.
#train_output_files = [os.path.join(train_output_set, f) for f in os.listdir(train_output_set) if f.endswith('.nc')]
#valid_output_files = [os.path.join(valid_output_set, f) for f in os.listdir(valid_output_set) if f.endswith('.nc')]
test_output_files = [os.path.join(test_output_set, f) for f in os.listdir(test_output_set) if f.endswith('.nc')]

#train_outputs = load_outputs(train_output_files)
#valid_outputs = load_outputs(valid_output_files)
test_outputs = load_outputs(test_output_files)

#del train_output_files, valid_output_files, test_output_files

#print(f"Shape of train_outputs: {train_outputs.shape}")
#print(f"Shape of valid_outputs: {valid_outputs.shape}")
print(f"Shape of test_outputs: {test_outputs.shape}")

# NOTE(review): every np.save below is commented out, so as written this cell
# only keeps `test_outputs` in memory and writes nothing to disk.
#np.save('train_outputs.npy', train_outputs)
#np.save('valid_outputs.npy', valid_outputs)
#np.save('test_outputs.npy', test_outputs)

#del train_outputs, valid_outputs, test_outputs


In [None]:
import numpy as np

# Load the target arrays of the three splits from the working directory.
train_outputs, valid_outputs, test_outputs = (
    np.load(path)
    for path in ('train_comb_outputs.npy', 'valid_comb_outputs.npy', 'test_comb_outputs.npy')
)

def compute_statistics(data):
    """Summarise `data` over all of its elements.

    Returns a dict with the keys 'total_min', 'total_max', 'total_range'
    (max minus min), 'total_mean' and 'total_median', in that order.
    """
    lowest = np.min(data)
    highest = np.max(data)
    return dict(
        total_min = lowest,
        total_max = highest,
        total_range = highest - lowest,
        total_mean = np.mean(data),
        total_median = np.median(data),
    )

train_stats = compute_statistics(train_outputs)
valid_stats = compute_statistics(valid_outputs)
test_stats = compute_statistics(test_outputs)

# Print one "key: value" line per statistic under a heading for each split.
for heading, split_stats in (
    ("Train Outputs Statistics:", train_stats),
    ("\nValidation Outputs Statistics:", valid_stats),
    ("\nTest Outputs Statistics:", test_stats),
):
    print(heading)
    for key, value in split_stats.items():
        print(f"{key}: {value}")
'''
Saudi:
Train Outputs Statistics:
total_min: 0.016926101
total_max: 42.4713798
total_range: 42.454453699
total_mean: 2.8974418592431697
total_median: 1.3505546285

Validation Outputs Statistics:
total_min: 0.017671046
total_max: 28.06594092
total_range: 28.048269874
total_mean: 3.356000043450273
total_median: 2.8229551075

Test Outputs Statistics:
total_min: 0.065269343
total_max: 22.80542092
total_range: 22.740151577
total_mean: 2.0433333689961293
total_median: 1.117680595

Combined:
Train Outputs Statistics:
total_min: 0.016926101
total_max: 42.4713798
total_range: 42.454453699
total_mean: 9.230586244294928
total_median: 7.8671523105

Validation Outputs Statistics:
total_min: 0.017671046
total_max: 30.11400032043457
total_range: 30.09632927443457
total_mean: 8.660432125355227
total_median: 5.810635911

Test Outputs Statistics:
total_min: 0.065269343
total_max: 30.11400032043457
total_range: 30.04873097743457
total_mean: 7.220526309503236
total_median: 2.0419865755
'''


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import os

# Load the target arrays of the three splits from the working directory.
train_outputs, valid_outputs, test_outputs = (
    np.load(path)
    for path in ('train_comb_outputs.npy', 'valid_comb_outputs.npy', 'test_comb_outputs.npy')
)

# Histogram edges: 0.5-wide bins on [0, 5), 2-wide bins on [5, 23), then one
# open-ended bin [23, inf). The fine range stops before 5 because the coarse
# range already starts at 5; listing 5.0 twice created a zero-width bin.
bins = np.concatenate([np.arange(0, 5, 0.5), np.arange(5, 25, 2), [np.inf]])
# Folder that receives the saved figures; created on first use.
output_dir = 'Figures/Intern'
os.makedirs(name = output_dir, exist_ok = True)

def plot_histogram(data, filename, title, bins):
    """Draw a histogram of `data` with the given `bins`, save it and show it.

    The figure is 12x6 inches with a grid, the x axis labelled 'Values' and
    the y axis labelled 'Frequency'; it is written to `filename` before being
    displayed.
    """
    fig, ax = plt.subplots(figsize = (12, 6))
    ax.hist(data, bins = bins, edgecolor = 'black', alpha = 0.7)
    ax.set_title(title)
    ax.set_xlabel('Values')
    ax.set_ylabel('Frequency')
    ax.grid(True)
    fig.savefig(filename)
    plt.show()

train_filename = os.path.join(output_dir, 'train_emiss_dist_comb.png')
valid_filename = os.path.join(output_dir, 'valid_emiss_dist_comb.png')
test_filename = os.path.join(output_dir, 'test_emiss_dist_comb.png')

# One histogram per split, each saved under its own file name.
for split_values, split_filename, split_title in (
    (train_outputs, train_filename, 'Histogram of Train Outputs'),
    (valid_outputs, valid_filename, 'Histogram of Valid Outputs'),
    (test_outputs, test_filename, 'Histogram of Test Outputs'),
):
    plot_histogram(split_values.flatten(), split_filename, split_title, bins)


In [None]:
import numpy as np

# Load the target arrays of the three splits from the working directory.
train_outputs, valid_outputs, test_outputs = (
    np.load(path)
    for path in ('train_comb_outputs.npy', 'valid_comb_outputs.npy', 'test_comb_outputs.npy')
)

# Histogram edges: 0.5-wide bins on [0, 5), 2-wide bins on [5, 23), then one
# open-ended bin [23, inf). The fine range stops before 5 because the coarse
# range already starts at 5; listing 5.0 twice created an always-empty
# "5.0 to 5.0" bin in the report.
bins = np.concatenate([np.arange(0, 5, 0.5), np.arange(5, 25, 2), [np.inf]])

def calculate_percentage(data, bins):
    """Histogram `data` over `bins` and express each bin as a percentage.

    Prints the number of values that fell inside the bins, then returns
    ``(counts, percentages)`` where percentages are relative to that number.
    """
    counts = np.histogram(data, bins = bins)[0]
    total_count = counts.sum()
    print(total_count)
    return counts, counts / total_count * 100

# Bin counts and percentages for each split, computed in train/valid/test order.
(
    (train_counts, train_percentages),
    (valid_counts, valid_percentages),
    (test_counts, test_percentages),
) = (
    calculate_percentage(split_values.flatten(), bins)
    for split_values in (train_outputs, valid_outputs, test_outputs)
)

def stats(name, counts, percentages, bins):
    """Print the histogram of one split as text.

    For bin ``i`` the edges ``bins[i]`` and ``bins[i+1]`` are shown twice (one
    decimal, then unformatted), followed by the count and the percentage
    rounded to two decimals.
    """
    print(f'\n{name} Data Histogram:')
    for i, count in enumerate(counts):
        lower, upper = bins[i], bins[i+1]
        print(f'Bin range {lower:.1f} to {upper:.1f} (or {lower} to {upper}):')
        print(f'  Count: {count}')
        print(f'  Percentage: {percentages[i]:.2f}%')

# Text report for each split, in train/valid/test order.
for split_name, split_counts, split_percentages in (
    ('Train', train_counts, train_percentages),
    ('Valid', valid_counts, valid_percentages),
    ('Test', test_counts, test_percentages),
):
    stats(split_name, split_counts, split_percentages, bins)

'''
KSA:
Train Data Histogram:
Bin range 0.0 to 0.5 (or 0.0 to 0.5):
  Count: 4419
  Percentage: 28.08%
Bin range 0.5 to 1.0 (or 0.5 to 1.0):
  Count: 2082
  Percentage: 13.23%
Bin range 1.0 to 1.5 (or 1.0 to 1.5):
  Count: 1845
  Percentage: 11.72%
Bin range 1.5 to 2.0 (or 1.5 to 2.0):
  Count: 1146
  Percentage: 7.28%
Bin range 2.0 to 2.5 (or 2.0 to 2.5):
  Count: 784
  Percentage: 4.98%
Bin range 2.5 to 3.0 (or 2.5 to 3.0):
  Count: 634
  Percentage: 4.03%
Bin range 3.0 to 3.5 (or 3.0 to 3.5):
  Count: 583
  Percentage: 3.70%
Bin range 3.5 to 4.0 (or 3.5 to 4.0):
  Count: 549
  Percentage: 3.49%
Bin range 4.0 to 4.5 (or 4.0 to 4.5):
  Count: 431
  Percentage: 2.74%
Bin range 4.5 to 5.0 (or 4.5 to 5.0):
  Count: 391
  Percentage: 2.48%
Bin range 5.0 to 5.0 (or 5.0 to 5.0):
  Count: 0
  Percentage: 0.00%
Bin range 5.0 to 7.0 (or 5.0 to 7.0):
  Count: 920
  Percentage: 5.85%
Bin range 7.0 to 9.0 (or 7.0 to 9.0):
  Count: 707
  Percentage: 4.49%
Bin range 9.0 to 11.0 (or 9.0 to 11.0):
  Count: 507
  Percentage: 3.22%
Bin range 11.0 to 13.0 (or 11.0 to 13.0):
  Count: 295
  Percentage: 1.87%
Bin range 13.0 to 15.0 (or 13.0 to 15.0):
  Count: 172
  Percentage: 1.09%
Bin range 15.0 to 17.0 (or 15.0 to 17.0):
  Count: 78
  Percentage: 0.50%
Bin range 17.0 to 19.0 (or 17.0 to 19.0):
  Count: 54
  Percentage: 0.34%
Bin range 19.0 to 21.0 (or 19.0 to 21.0):
  Count: 36
  Percentage: 0.23%
Bin range 21.0 to 23.0 (or 21.0 to 23.0):
  Count: 25
  Percentage: 0.16%
Bin range 23.0 to inf (or 23.0 to inf):
  Count: 80
  Percentage: 0.51%

Valid Data Histogram:
Bin range 0.0 to 0.5 (or 0.0 to 0.5):
  Count: 626
  Percentage: 17.10%
Bin range 0.5 to 1.0 (or 0.5 to 1.0):
  Count: 311
  Percentage: 8.50%
Bin range 1.0 to 1.5 (or 1.0 to 1.5):
  Count: 278
  Percentage: 7.60%
Bin range 1.5 to 2.0 (or 1.5 to 2.0):
  Count: 308
  Percentage: 8.42%
Bin range 2.0 to 2.5 (or 2.0 to 2.5):
  Count: 195
  Percentage: 5.33%
Bin range 2.5 to 3.0 (or 2.5 to 3.0):
  Count: 167
  Percentage: 4.56%
Bin range 3.0 to 3.5 (or 3.0 to 3.5):
  Count: 209
  Percentage: 5.71%
Bin range 3.5 to 4.0 (or 3.5 to 4.0):
  Count: 185
  Percentage: 5.05%
Bin range 4.0 to 4.5 (or 4.0 to 4.5):
  Count: 213
  Percentage: 5.82%
Bin range 4.5 to 5.0 (or 4.5 to 5.0):
  Count: 193
  Percentage: 5.27%
Bin range 5.0 to 5.0 (or 5.0 to 5.0):
  Count: 0
  Percentage: 0.00%
Bin range 5.0 to 7.0 (or 5.0 to 7.0):
  Count: 567
  Percentage: 15.49%
Bin range 7.0 to 9.0 (or 7.0 to 9.0):
  Count: 267
  Percentage: 7.30%
Bin range 9.0 to 11.0 (or 9.0 to 11.0):
  Count: 91
  Percentage: 2.49%
Bin range 11.0 to 13.0 (or 11.0 to 13.0):
  Count: 32
  Percentage: 0.87%
Bin range 13.0 to 15.0 (or 13.0 to 15.0):
  Count: 13
  Percentage: 0.36%
Bin range 15.0 to 17.0 (or 15.0 to 17.0):
  Count: 3
  Percentage: 0.08%
Bin range 17.0 to 19.0 (or 17.0 to 19.0):
  Count: 1
  Percentage: 0.03%
Bin range 19.0 to 21.0 (or 19.0 to 21.0):
  Count: 0
  Percentage: 0.00%
Bin range 21.0 to 23.0 (or 21.0 to 23.0):
  Count: 0
  Percentage: 0.00%
Bin range 23.0 to inf (or 23.0 to inf):
  Count: 1
  Percentage: 0.03%

Test Data Histogram:
Bin range 0.0 to 0.5 (or 0.0 to 0.5):
  Count: 562
  Percentage: 12.80%
Bin range 0.5 to 1.0 (or 0.5 to 1.0):
  Count: 1387
  Percentage: 31.58%
Bin range 1.0 to 1.5 (or 1.0 to 1.5):
  Count: 828
  Percentage: 18.85%
Bin range 1.5 to 2.0 (or 1.5 to 2.0):
  Count: 541
  Percentage: 12.32%
Bin range 2.0 to 2.5 (or 2.0 to 2.5):
  Count: 291
  Percentage: 6.63%
Bin range 2.5 to 3.0 (or 2.5 to 3.0):
  Count: 122
  Percentage: 2.78%
Bin range 3.0 to 3.5 (or 3.0 to 3.5):
  Count: 103
  Percentage: 2.35%
Bin range 3.5 to 4.0 (or 3.5 to 4.0):
  Count: 62
  Percentage: 1.41%
Bin range 4.0 to 4.5 (or 4.0 to 4.5):
  Count: 51
  Percentage: 1.16%
Bin range 4.5 to 5.0 (or 4.5 to 5.0):
  Count: 39
  Percentage: 0.89%
Bin range 5.0 to 5.0 (or 5.0 to 5.0):
  Count: 0
  Percentage: 0.00%
Bin range 5.0 to 7.0 (or 5.0 to 7.0):
  Count: 96
  Percentage: 2.19%
Bin range 7.0 to 9.0 (or 7.0 to 9.0):
  Count: 109
  Percentage: 2.48%
Bin range 9.0 to 11.0 (or 9.0 to 11.0):
  Count: 84
  Percentage: 1.91%
Bin range 11.0 to 13.0 (or 11.0 to 13.0):
  Count: 62
  Percentage: 1.41%
Bin range 13.0 to 15.0 (or 13.0 to 15.0):
  Count: 34
  Percentage: 0.77%
Bin range 15.0 to 17.0 (or 15.0 to 17.0):
  Count: 14
  Percentage: 0.32%
Bin range 17.0 to 19.0 (or 17.0 to 19.0):
  Count: 4
  Percentage: 0.09%
Bin range 19.0 to 21.0 (or 19.0 to 21.0):
  Count: 2
  Percentage: 0.05%
Bin range 21.0 to 23.0 (or 21.0 to 23.0):
  Count: 1
  Percentage: 0.02%
Bin range 23.0 to inf (or 23.0 to inf):
  Count: 0
  Percentage: 0.00%

Combined:
Train Data Histogram:
Bin range 0.0 to 0.5 (or 0.0 to 0.5):
  Count: 4419
  Percentage: 15.61%
Bin range 0.5 to 1.0 (or 0.5 to 1.0):
  Count: 2082
  Percentage: 7.35%
Bin range 1.0 to 1.5 (or 1.0 to 1.5):
  Count: 1845
  Percentage: 6.52%
Bin range 1.5 to 2.0 (or 1.5 to 2.0):
  Count: 1146
  Percentage: 4.05%
Bin range 2.0 to 2.5 (or 2.0 to 2.5):
  Count: 784
  Percentage: 2.77%
Bin range 2.5 to 3.0 (or 2.5 to 3.0):
  Count: 634
  Percentage: 2.24%
Bin range 3.0 to 3.5 (or 3.0 to 3.5):
  Count: 583
  Percentage: 2.06%
Bin range 3.5 to 4.0 (or 3.5 to 4.0):
  Count: 549
  Percentage: 1.94%
Bin range 4.0 to 4.5 (or 4.0 to 4.5):
  Count: 431
  Percentage: 1.52%
Bin range 4.5 to 5.0 (or 4.5 to 5.0):
  Count: 391
  Percentage: 1.38%
Bin range 5.0 to 5.0 (or 5.0 to 5.0):
  Count: 0
  Percentage: 0.00%
Bin range 5.0 to 7.0 (or 5.0 to 7.0):
  Count: 920
  Percentage: 3.25%
Bin range 7.0 to 9.0 (or 7.0 to 9.0):
  Count: 870
  Percentage: 3.07%
Bin range 9.0 to 11.0 (or 9.0 to 11.0):
  Count: 1248
  Percentage: 4.41%
Bin range 11.0 to 13.0 (or 11.0 to 13.0):
  Count: 1820
  Percentage: 6.43%
Bin range 13.0 to 15.0 (or 13.0 to 15.0):
  Count: 2068
  Percentage: 7.30%
Bin range 15.0 to 17.0 (or 15.0 to 17.0):
  Count: 2230
  Percentage: 7.88%
Bin range 17.0 to 19.0 (or 17.0 to 19.0):
  Count: 1991
  Percentage: 7.03%
Bin range 19.0 to 21.0 (or 19.0 to 21.0):
  Count: 1732
  Percentage: 6.12%
Bin range 21.0 to 23.0 (or 21.0 to 23.0):
  Count: 1126
  Percentage: 3.98%
Bin range 23.0 to inf (or 23.0 to inf):
  Count: 1447
  Percentage: 5.11%

Valid Data Histogram:
Bin range 0.0 to 0.5 (or 0.0 to 0.5):
  Count: 626
  Percentage: 10.50%
Bin range 0.5 to 1.0 (or 0.5 to 1.0):
  Count: 311
  Percentage: 5.21%
Bin range 1.0 to 1.5 (or 1.0 to 1.5):
  Count: 278
  Percentage: 4.66%
Bin range 1.5 to 2.0 (or 1.5 to 2.0):
  Count: 308
  Percentage: 5.16%
Bin range 2.0 to 2.5 (or 2.0 to 2.5):
  Count: 195
  Percentage: 3.27%
Bin range 2.5 to 3.0 (or 2.5 to 3.0):
  Count: 167
  Percentage: 2.80%
Bin range 3.0 to 3.5 (or 3.0 to 3.5):
  Count: 209
  Percentage: 3.50%
Bin range 3.5 to 4.0 (or 3.5 to 4.0):
  Count: 185
  Percentage: 3.10%
Bin range 4.0 to 4.5 (or 4.0 to 4.5):
  Count: 213
  Percentage: 3.57%
Bin range 4.5 to 5.0 (or 4.5 to 5.0):
  Count: 193
  Percentage: 3.24%
Bin range 5.0 to 5.0 (or 5.0 to 5.0):
  Count: 0
  Percentage: 0.00%
Bin range 5.0 to 7.0 (or 5.0 to 7.0):
  Count: 567
  Percentage: 9.51%
Bin range 7.0 to 9.0 (or 7.0 to 9.0):
  Count: 291
  Percentage: 4.88%
Bin range 9.0 to 11.0 (or 9.0 to 11.0):
  Count: 232
  Percentage: 3.89%
Bin range 11.0 to 13.0 (or 11.0 to 13.0):
  Count: 323
  Percentage: 5.42%
Bin range 13.0 to 15.0 (or 13.0 to 15.0):
  Count: 357
  Percentage: 5.99%
Bin range 15.0 to 17.0 (or 15.0 to 17.0):
  Count: 394
  Percentage: 6.61%
Bin range 17.0 to 19.0 (or 17.0 to 19.0):
  Count: 373
  Percentage: 6.25%
Bin range 19.0 to 21.0 (or 19.0 to 21.0):
  Count: 310
  Percentage: 5.20%
Bin range 21.0 to 23.0 (or 21.0 to 23.0):
  Count: 198
  Percentage: 3.32%
Bin range 23.0 to inf (or 23.0 to inf):
  Count: 234
  Percentage: 3.92%

Test Data Histogram:
Bin range 0.0 to 0.5 (or 0.0 to 0.5):
  Count: 562
  Percentage: 8.39%
Bin range 0.5 to 1.0 (or 0.5 to 1.0):
  Count: 1387
  Percentage: 20.71%
Bin range 1.0 to 1.5 (or 1.0 to 1.5):
  Count: 828
  Percentage: 12.37%
Bin range 1.5 to 2.0 (or 1.5 to 2.0):
  Count: 541
  Percentage: 8.08%
Bin range 2.0 to 2.5 (or 2.0 to 2.5):
  Count: 291
  Percentage: 4.35%
Bin range 2.5 to 3.0 (or 2.5 to 3.0):
  Count: 122
  Percentage: 1.82%
Bin range 3.0 to 3.5 (or 3.0 to 3.5):
  Count: 103
  Percentage: 1.54%
Bin range 3.5 to 4.0 (or 3.5 to 4.0):
  Count: 62
  Percentage: 0.93%
Bin range 4.0 to 4.5 (or 4.0 to 4.5):
  Count: 51
  Percentage: 0.76%
Bin range 4.5 to 5.0 (or 4.5 to 5.0):
  Count: 39
  Percentage: 0.58%
Bin range 5.0 to 5.0 (or 5.0 to 5.0):
  Count: 0
  Percentage: 0.00%
Bin range 5.0 to 7.0 (or 5.0 to 7.0):
  Count: 96
  Percentage: 1.43%
Bin range 7.0 to 9.0 (or 7.0 to 9.0):
  Count: 142
  Percentage: 2.12%
Bin range 9.0 to 11.0 (or 9.0 to 11.0):
  Count: 240
  Percentage: 3.58%
Bin range 11.0 to 13.0 (or 11.0 to 13.0):
  Count: 329
  Percentage: 4.91%
Bin range 13.0 to 15.0 (or 13.0 to 15.0):
  Count: 403
  Percentage: 6.02%
Bin range 15.0 to 17.0 (or 15.0 to 17.0):
  Count: 397
  Percentage: 5.93%
Bin range 17.0 to 19.0 (or 17.0 to 19.0):
  Count: 353
  Percentage: 5.27%
Bin range 19.0 to 21.0 (or 19.0 to 21.0):
  Count: 297
  Percentage: 4.44%
Bin range 21.0 to 23.0 (or 21.0 to 23.0):
  Count: 203
  Percentage: 3.03%
Bin range 23.0 to inf (or 23.0 to inf):
  Count: 250
  Percentage: 3.73%
'''


In [None]:
import numpy as np

# Load the flat per-day input array of one split. Only the validation split is
# active; the train/test variants are commented lines toggled by hand.
#train_inputs = np.load('train_inputs.npy')
val_inputs = np.load('valid_inputs.npy')
#test_inputs = np.load('test_inputs.npy')

#print('Original train_inputs shape:', train_inputs.shape)
print('Original val_inputs shape:', val_inputs.shape)
#print('Original test_inputs shape:', test_inputs.shape)

# Number of consecutive samples that belong to one site.
num_days = 366
#site_count = train_inputs.shape[0] // num_days

#print('Number of sites:', site_count)

def reshape_data(data, num_days, sample_shape = (64, 64, 4)):
    """Group a flat stack of daily samples into a (site, day, ...) array.

    Args:
        data: array whose first axis holds ``num_sites * num_days`` samples.
        num_days: number of consecutive samples belonging to one site.
        sample_shape: shape of a single sample; defaults to the (64, 64, 4)
            layout that was previously hard-coded.

    Returns:
        `data` reshaped to ``(num_sites, num_days, *sample_shape)``, where
        ``num_sites = data.shape[0] // num_days``.

    Raises:
        ValueError: if `data` cannot be reshaped that way, for example because
            the number of samples is not a multiple of `num_days`.
    """
    num_sites = data.shape[0] // num_days
    target_shape = (num_sites, num_days, *sample_shape)
    try:
        return data.reshape(target_shape)
    except ValueError as exc:
        # Re-raise with both shapes spelled out; numpy's message alone does not
        # say which site/day layout was being attempted.
        raise ValueError(
            f"cannot arrange data of shape {data.shape} as {target_shape} "
            f"({num_days} days per site)"
        ) from exc

# Reshape the loaded split to (sites, days, 64, 64, 4) and write it back.
# Only the validation split is active; the other splits are toggled by hand.
#train_inputs_reshaped = reshape_data(train_inputs, num_days)
val_inputs_reshaped = reshape_data(val_inputs, num_days)
#test_inputs_reshaped = reshape_data(test_inputs, num_days)

#print('Reshaped train_inputs shape:', train_inputs_reshaped.shape)
print('Reshaped val_inputs shape:', val_inputs_reshaped.shape)
#print('Reshaped test_inputs shape:', test_inputs_reshaped.shape)

# NOTE(review): the result is saved under the same file name it was loaded
# from, so this cell is not re-runnable - a second run would load the already
# reshaped array.
#np.save('train_inputs.npy', train_inputs_reshaped)
np.save('valid_inputs.npy', val_inputs_reshaped)
#np.save('test_inputs.npy', test_inputs_reshaped)

In [None]:
import numpy as np

# Load the input arrays of all three splits and report their shapes.
train_inputs, val_inputs, test_inputs = (
    np.load(path)
    for path in ('train_inputs.npy', 'valid_inputs.npy', 'test_inputs.npy')
)

for shape_label, loaded in (
    ('Train inputs shape:', train_inputs),
    ('Validation inputs shape:', val_inputs),
    ('Test inputs shape:', test_inputs),
):
    print(shape_label, loaded.shape)

def compute_mean_per_site(data):
    """Average `data` over its second axis (axis 1) and return the result."""
    return np.mean(data, axis = 1)

# Collapse axis 1 of every split to its mean.
train_mean, val_mean, test_mean = map(
    compute_mean_per_site, (train_inputs, val_inputs, test_inputs)
)

for shape_label, averaged in (
    ('Mean train inputs shape:', train_mean),
    ('Mean validation inputs shape:', val_mean),
    ('Mean test inputs shape:', test_mean),
):
    print(shape_label, averaged.shape)

# NOTE(review): each result overwrites the file its input was loaded from, so
# running this cell a second time would average an already-averaged array over
# a different axis.
for target_path, averaged in (
    ('train_inputs.npy', train_mean),
    ('valid_inputs.npy', val_mean),
    ('test_inputs.npy', test_mean),
):
    np.save(target_path, averaged)


In [None]:
import numpy as np

# Load the per-day target array of one split. Only the validation split is
# active; the train/test variants are commented lines toggled by hand.
#train_outputs = np.load('train_outputs.npy')
valid_outputs = np.load('valid_outputs.npy')
#test_outputs = np.load('test_outputs.npy')

#print('Train outputs shape:', train_outputs.shape)
print('Validation outputs shape:', valid_outputs.shape)
#print('Test outputs shape:', test_outputs.shape)

def reshape_to_sites(data, num_days):
    """Arrange `data` as one row per site with `num_days` columns.

    The number of rows is ``data.shape[0] // num_days``.
    """
    site_count = data.shape[0] // num_days
    return data.reshape(site_count, num_days)

def aggregate_to_sites(data):
    """Return the mean of `data` along axis 1 (one value per row for 2-D input)."""
    return np.mean(data, axis = 1)

# Number of consecutive samples that belong to one site.
num_days = 366

# Reshape the per-day targets to (sites, days) and average each site's row.
# Only the validation split is active; the other splits are toggled by hand.
#train_outputs_sites = reshape_to_sites(train_outputs, num_days)
valid_outputs_sites = reshape_to_sites(valid_outputs, num_days)
#test_outputs_sites = reshape_to_sites(test_outputs, num_days)

#print('Reshaped train outputs shape:', train_outputs_sites.shape)
print('Reshaped validation outputs shape:', valid_outputs_sites.shape)
#print('Reshaped test outputs shape:', test_outputs_sites.shape)

#train_outputs_aggregated = aggregate_to_sites(train_outputs_sites)
valid_outputs_aggregated = aggregate_to_sites(valid_outputs_sites)
#test_outputs_aggregated = aggregate_to_sites(test_outputs_sites)

#print('Aggregated train outputs shape:', train_outputs_aggregated.shape)
print('Aggregated validation outputs shape:', valid_outputs_aggregated.shape)
#print('Aggregated test outputs shape:', test_outputs_aggregated.shape)

# NOTE(review): the aggregated array replaces the per-day file it was loaded
# from, so this cell cannot be re-run on its own output.
#np.save('train_outputs.npy', train_outputs_aggregated)
np.save('valid_outputs.npy', valid_outputs_aggregated)
#np.save('test_outputs.npy', test_outputs_aggregated)



In [None]:
import numpy as np

# Sanity check: reload one split's input file and print its shape. Only the
# validation split is active; the other splits are toggled by hand.
#train_inputs = np.load('train_inputs.npy')
valid_inputs = np.load('valid_inputs.npy')
#test_inputs = np.load('test_inputs.npy')

#print('Train inputs shape:', train_inputs.shape)
print('Validation inputs shape:', valid_inputs.shape)
#print('Test inputs shape:', test_inputs.shape)



In [None]:
import numpy as np
from tensorflow.keras.models import load_model
from tensorflow.keras.layers import Input, Conv2D, Conv2DTranspose, MaxPool2D, UpSampling2D, Concatenate, Dropout, BatchNormalization, Flatten, Dense
from tensorflow.keras.models import Model
import gc

def unetreg(input_shape, dropout_rate = 0.2):
    """Build a U-Net-shaped convolutional network with a single regression output.

    The encoder has four double-convolution stages (64, 128, 256, 512 filters),
    each followed by batch normalisation and dropout, with 2x2 max pooling
    after the first three. The decoder mirrors the first three stages: a
    transposed convolution, 2x upsampling and concatenation with the matching
    encoder feature map. The final one-channel feature map is flattened and
    passed to a single linear unit.

    Args:
        input_shape: shape of one input sample, e.g. ``(64, 64, 4)`` as used by
            the call below this function.
        dropout_rate: rate shared by every Dropout layer.

    Returns:
        An uncompiled Keras ``Model`` mapping an input of `input_shape` to one
        scalar per sample.

    Note: layers are created in a fixed order; weights saved from this
    architecture are loaded into it elsewhere in the notebook, so the order
    and layer set must not change.
    """
    inputs = Input(input_shape)
    
    # Encoder stage 1: 64 filters, then downsample by 2.
    c1 = Conv2D(64, 3, activation='relu', padding='same', kernel_initializer='he_uniform')(inputs)
    c1 = BatchNormalization()(c1)
    c1 = Conv2D(64, 3, activation='relu', padding='same', kernel_initializer='he_uniform')(c1)
    c1 = BatchNormalization()(c1)
    d1 = Dropout(dropout_rate)(c1)
    p1 = MaxPool2D(pool_size=(2, 2))(d1)
    
    # Encoder stage 2: 128 filters, then downsample by 2.
    c2 = Conv2D(128, 3, activation='relu', padding='same', kernel_initializer='he_uniform')(p1)
    c2 = BatchNormalization()(c2)
    c2 = Conv2D(128, 3, activation='relu', padding='same', kernel_initializer='he_uniform')(c2)
    c2 = BatchNormalization()(c2)
    d2 = Dropout(dropout_rate)(c2)
    p2 = MaxPool2D(pool_size=(2, 2))(d2)
    
    # Encoder stage 3: 256 filters, then downsample by 2.
    c3 = Conv2D(256, 3, activation='relu', padding='same', kernel_initializer='he_uniform')(p2)
    c3 = BatchNormalization()(c3)
    c3 = Conv2D(256, 3, activation='relu', padding='same', kernel_initializer='he_uniform')(c3)
    c3 = BatchNormalization()(c3)
    d3 = Dropout(dropout_rate)(c3)
    p3 = MaxPool2D(pool_size=(2, 2))(d3)

    # Bottleneck: 512 filters, no pooling.
    c4 = Conv2D(512, 3, activation='relu', padding='same', kernel_initializer='he_uniform')(p3)
    c4 = BatchNormalization()(c4)
    c4 = Conv2D(512, 3, activation='relu', padding='same', kernel_initializer='he_uniform')(c4)
    c4 = BatchNormalization()(c4)
    d4 = Dropout(dropout_rate)(c4)
    
    # Decoder stage 1: upsample and merge with the pre-dropout stage-3 features (c3).
    u5 = Conv2DTranspose(256, 3, activation='relu', padding='same', kernel_initializer='he_uniform')(d4)
    u5 = UpSampling2D((2, 2))(u5)
    concat5 = Concatenate()([u5, c3])
    c5 = Conv2D(256, 3, activation='relu', padding='same', kernel_initializer='he_uniform')(concat5)
    c5 = BatchNormalization()(c5)
    c5 = Conv2D(256, 3, activation='relu', padding='same', kernel_initializer='he_uniform')(c5)
    c5 = BatchNormalization()(c5)
    d5 = Dropout(dropout_rate)(c5)
    
    # Decoder stage 2: upsample and merge with the pre-dropout stage-2 features (c2).
    u6 = Conv2DTranspose(128, 3, activation='relu', padding='same', kernel_initializer='he_uniform')(d5)
    u6 = UpSampling2D((2, 2))(u6)
    concat6 = Concatenate()([u6, c2])
    c6 = Conv2D(128, 3, activation='relu', padding='same', kernel_initializer='he_uniform')(concat6)
    c6 = BatchNormalization()(c6)
    c6 = Conv2D(128, 3, activation='relu', padding='same', kernel_initializer='he_uniform')(c6)
    c6 = BatchNormalization()(c6)
    d6 = Dropout(dropout_rate)(c6)
    
    # Decoder stage 3: upsample, merge with stage-1 features (c1), reduce to one channel.
    u7 = Conv2DTranspose(64, 3, activation='relu', padding='same', kernel_initializer='he_uniform')(d6)
    u7 = UpSampling2D((2, 2))(u7)
    concat7 = Concatenate()([u7, c1])
    c7 = Conv2D(64, 3, activation='relu', padding='same', kernel_initializer='he_uniform')(concat7)
    c7 = BatchNormalization()(c7)
    c7 = Conv2D(1, 3, activation='relu', padding='same', kernel_initializer='glorot_uniform')(c7)
    c7 = BatchNormalization()(c7)
    
    # Regression head: flatten the one-channel map and map it to one linear unit.
    flat = Flatten()(c7)
    output = Dense(1, activation='linear')(flat)
    
    model = Model(inputs=inputs, outputs=output)
    
    return model

# Rebuild the network for 64x64 inputs with 4 channels and restore its weights.
model = unetreg(input_shape = (64, 64, 4))
model.load_weights('saved_models/modelweights_new_lip.h5')


# average +- 5
# NOTE(review): the variables are named train_* but hold the test split files.
train_inputs, train_outputs = np.load('test_inputs.npy'), np.load('test_outputs.npy')

def predict_emissions(inputs, model, batch_size = 8):
    """Run `model` on every day of every site and collect one value per day.

    Args:
        inputs: array indexed as ``inputs[site, day, ...]``; each
            ``inputs[site]`` is passed to the model as a batch of samples.
        model: object with a Keras-style ``predict(x, batch_size=..., verbose=...)``
            method returning one value per sample.
        batch_size: number of days the model processes per internal batch.

    Returns:
        Float array of shape ``(num_sites, days)`` with the predictions.
    """
    num_sites = inputs.shape[0] # (num_sites, 366, 64, 64, 4) input shape
    days = inputs.shape[1]
    predictions = np.zeros((num_sites, days)) # num_sites, 366

    # Nothing to predict; avoid handing the model an empty batch.
    if days == 0:
        return predictions

    for site_idx in range(num_sites):
        # One predict call per site: the model batches internally, which avoids
        # the per-call overhead and the progress bar of calling predict once
        # for every `batch_size` days.
        site_preds = model.predict(inputs[site_idx], batch_size = batch_size, verbose = 0)
        predictions[site_idx] = np.asarray(site_preds).reshape(-1)
        # Release per-site temporaries before moving on to the next site.
        gc.collect()

    return predictions

def adjust_predictions(predictions, annual_emissions):
    """Rescale each row of `predictions` so that its mean equals the matching
    entry of `annual_emissions`.

    Rows whose mean is exactly zero are left unchanged. The input array is not
    modified; a rescaled copy is returned.
    """
    rescaled = np.copy(predictions)
    for row_idx, row in enumerate(predictions):
        row_mean = np.mean(row)
        if row_mean == 0:
            # No scale factor can move a zero mean; keep the row as it is.
            continue
        rescaled[row_idx] *= annual_emissions[row_idx] / row_mean
    return rescaled

# Predict per-day values for the loaded inputs, then rescale each site's
# predictions so that their mean matches the loaded per-site outputs.
train_predictions = predict_emissions(inputs = train_inputs, model = model)
train_adjusted_predictions = adjust_predictions(
    predictions = train_predictions,
    annual_emissions = train_outputs,
)
# NOTE(review): the result is written to 'valid_outputs.npy' although the
# arrays above were loaded from the test_* files - confirm the target name
# before running, since this overwrites the existing file.
np.save('valid_outputs.npy', train_adjusted_predictions)

print("Done") # (total_test,)


In [None]:
import numpy as np
# Scratch cell: load two of the *_box arrays and print their shapes. The
# commented lines are earlier reshape/save experiments kept for reference.
# NOTE(review): the variable names do not match the files they hold
# (`train_inputs` <- valid_inputs_box, `train_outputs` <- train_inputs_box).
train_inputs = np.load('valid_inputs_box.npy')
train_outputs = np.load('train_inputs_box.npy')
#train = np.load('test_outputs_box.npy')

#tr = train_inputs.reshape(-1)
#va = train_outputs.reshape(-1)
#te = train.reshape(-1)

print("Shape : ", train_inputs.shape)
print("Shape : ", train_outputs.shape)

#train_inputs = train_inputs.reshape(-1, 64, 64, 4)
#train_outputs = train_outputs.reshape(-1)

#print("Shape : ", train_inputs.shape)
#print("Shape : ", train_outputs.shape)

#np.save('valid_outputs_box.npy', tr)
#np.save('train_outputs_box.npy', va)
#np.save('test_outputs_box.npy', te)

In [None]:
import numpy as np
from netCDF4 import Dataset

class CustomDataset:
    """Sample-indexed reader over a NetCDF file with 'xco2_noisy', 'u', 'v'
    and 'emiss' variables.

    Each item is ``(inputs, outputs)``: the input channels stacked on a new
    last axis and min-max scaled per channel, and a one-element float32 array
    holding the selected emission value.
    """

    # Weights applied to the entries of one 'emiss' record; only the second
    # entry contributes. Assumes three entries per record - TODO confirm.
    EMISSION_WEIGHTS = (0.0, 1.0, 0.0)

    def __init__(self, file_path, variable = None):
        """Open `file_path` read-only.

        Args:
            file_path: path of the NetCDF file.
            variable: optional name of an extra variable appended as a fourth
                input channel.
        """
        self.ncfile = Dataset(file_path, 'r')
        self.variable = variable

    def __len__(self):
        """Return the number of samples (length of the first axis of 'xco2_noisy')."""
        return self.ncfile.variables['xco2_noisy'].shape[0]

    def _read(self, name, idx):
        """Read record `idx` of variable `name` as a float32 numpy array."""
        return np.array(self.ncfile.variables[name][idx]).astype('float32')

    def __getitem__(self, idx):
        """Return ``(inputs, outputs)`` for sample `idx`."""
        channel_names = ['xco2_noisy', 'u', 'v']
        if self.variable:
            channel_names.append(self.variable)
        inputs = np.stack([self._read(name, idx) for name in channel_names], axis = -1)
        emiss_arr = self._read('emiss', idx)

        # Per-channel min-max scaling over the two leading axes.
        min_val = inputs.min(axis = (0, 1), keepdims = True)
        max_val = inputs.max(axis = (0, 1), keepdims = True)
        # Constant channels would divide by zero; widen their range to 1 so
        # they map to all zeros.
        max_val[max_val == min_val] = min_val[max_val == min_val] + 1
        inputs = (inputs - min_val) / (max_val - min_val)

        weights = np.array(self.EMISSION_WEIGHTS)
        weighted = np.round(np.sum(weights * emiss_arr), 3)
        outputs = np.array([weighted], dtype = np.float32)

        return inputs, outputs

    def __del__(self):
        """Close the underlying file if it was opened."""
        # __del__ also runs when __init__ raised before `ncfile` was assigned
        # (e.g. missing file); guard so that no AttributeError is raised here.
        ncfile = self.__dict__.pop('ncfile', None)
        if ncfile is not None:
            ncfile.close()

def save_dataset_to_npy(dataset, input_file, output_file):
    """Materialise every sample of `dataset` and save inputs and outputs as .npy.

    Inputs are stored as float32 with shape ``(len(dataset), 64, 64, C)`` where
    C is 4 when ``dataset.variable`` is set and 3 otherwise; outputs are stored
    as float32 with shape ``(len(dataset), 1)``.
    """
    sample_count = len(dataset)
    channel_count = 4 if dataset.variable else 3
    inputs_all = np.zeros((sample_count, 64, 64, channel_count), dtype = np.float32)
    outputs_all = np.zeros((sample_count, 1), dtype = np.float32)

    for idx in range(sample_count):
        inputs_all[idx], outputs_all[idx] = dataset[idx]

    for path, array in ((input_file, inputs_all), (output_file, outputs_all)):
        np.save(path, array)

# Source files of the three splits.
train_set, valid_set, test_set = (
    'Datasets/data_paper_inv_pp/lippendorf/train_dataset.nc',
    'Datasets/data_paper_inv_pp/lippendorf/valid_dataset.nc',
    'Datasets/data_paper_inv_pp/lippendorf/test_dataset.nc',
)

# Open every split with 'no2_noisy' as the additional fourth input channel.
train_dataset, valid_dataset, test_dataset = (
    CustomDataset(split_path, variable = 'no2_noisy')
    for split_path in (train_set, valid_set, test_set)
)

# Write the inputs and outputs of each split to their own .npy files.
for split_dataset, inputs_path, outputs_path in (
    (train_dataset, 'train_inputs_lip.npy', 'train_outputs_lip.npy'),
    (valid_dataset, 'valid_inputs_lip.npy', 'valid_outputs_lip.npy'),
    (test_dataset, 'test_inputs_lip.npy', 'test_outputs_lip.npy'),
):
    save_dataset_to_npy(split_dataset, inputs_path, outputs_path)


In [None]:
import numpy as np

# Merge the test-split inputs of the three sources into one array along the
# sample axis. The variables are named after the split they actually hold so
# that no test data is left behind in the kernel under a `train_*` name.
# NOTE(review): only the inputs are merged here; the matching
# 'test_comb_outputs.npy' must be built with the same source order — confirm.
test_inputs = np.load('test_inputs.npy')
test_inputs_lip = np.load('test_inputs_lip.npy')
test_inputs_box = np.load('test_inputs_box.npy')

combined_inputs = np.concatenate([test_inputs, test_inputs_lip, test_inputs_box], axis = 0)

print("Shape of the combined array:", combined_inputs.shape)

np.save('test_comb_inputs.npy', combined_inputs)




CNN regression

In [None]:
import numpy as np
import os
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, LeakyReLU, Dropout, BatchNormalization, MaxPool2D, Flatten, Dense
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau

# Inputs : (number of samples, 64, 64, 4)
# NOTE(review): the dataset export stacks the channels as xco2, u, v, no2;
# confirm that every source in the combined set uses that same order.
# Target : CO2 emission rate (one scalar per sample)
def model_arch(input_shape):
    """Build the plain CNN regressor.

    Eight 3x3 ELU convolutions (3 x 32 filters, then 5 x 64) interleaved with
    dropout, batch normalisation and three 2x2 max-pools, followed by
    Flatten -> Dense(1) -> LeakyReLU.

    Parameters
    ----------
    input_shape : tuple passed as `input_shape` to the first Conv2D layer.

    Returns
    -------
    An uncompiled `Sequential` model with a single scalar output.
    """
    def conv(filters, **extra):
        # Every convolution in the network shares kernel size, activation and stride.
        return Conv2D(filters, (3, 3), activation = "elu", strides = 1, **extra)

    def pool():
        return MaxPool2D(pool_size = (2, 2), padding = "valid", strides = 2)

    return Sequential([
        conv(32, input_shape = input_shape),
        Dropout(0.1),
        conv(32),
        pool(),
        BatchNormalization(),
        conv(32),
        Dropout(0.2),
        conv(64),
        BatchNormalization(),
        conv(64),
        Dropout(0.2),
        conv(64),
        pool(),
        conv(64),
        Dropout(0.2),
        conv(64),
        pool(),
        Flatten(),
        Dense(1),
        LeakyReLU(alpha = 0.3),
    ])

def rmse(y_true, y_pred):
    """Root-mean-square error taken over all elements of the two tensors."""
    squared_residuals = tf.square(y_true - y_pred)
    return tf.sqrt(tf.reduce_mean(squared_residuals))
    
# Combined-dataset splits produced by the export cells.
train_inputs = np.load('train_comb_inputs.npy')
train_outputs = np.load('train_comb_outputs.npy')
val_inputs = np.load('valid_comb_inputs.npy')
val_outputs = np.load('valid_comb_outputs.npy')
test_inputs = np.load('test_comb_inputs.npy')
test_outputs = np.load('test_comb_outputs.npy')

checkpoint_dir = './saved_models/'
os.makedirs(checkpoint_dir, exist_ok = True)

# No save_best_only: the same weights file is rewritten after every epoch.
check = os.path.join(checkpoint_dir, 'cnn_reg_comb.h5')
check_call = ModelCheckpoint(filepath = check, save_weights_only = True, verbose = 1)

# Augmentation is applied to the training stream only; validation data is passed raw.
# NOTE(review): shear_range is given in degrees, so 90 is an extreme setting — confirm it is intended.
data_augmentation = ImageDataGenerator(rotation_range = 180, width_shift_range = 0.0, height_shift_range = 0.0, shear_range = 90, zoom_range = 0.2, horizontal_flip = True, vertical_flip = True, fill_mode = 'nearest')

model = model_arch(input_shape = (64, 64, 4))
# Uncomment to resume from the last checkpoint instead of training from scratch.
# model.load_weights('saved_models/cnn_reg_comb.h5')
optimizer = Adam(learning_rate = 1e-3)
lr_check = ReduceLROnPlateau(monitor = 'val_loss', factor = 0.75, patience = 30, verbose = 1, min_delta = 5e-3, cooldown = 0, min_lr = 5e-6)
model.compile(optimizer = optimizer, loss = tf.keras.losses.Huber())
model.summary()

history = model.fit(data_augmentation.flow(train_inputs, train_outputs, batch_size = 32, shuffle = True), epochs = 200, steps_per_epoch = len(train_outputs) // 32,
                    validation_data = (val_inputs, val_outputs), validation_batch_size = 32,
                    callbacks = [lr_check, check_call])

# Free the training arrays before evaluation.
del train_inputs, train_outputs, val_inputs, val_outputs

test_loss = model.evaluate(test_inputs, test_outputs, batch_size = None)
print(f'Test Loss: {test_loss}')

# One batched forward pass replaces one predict() call per test sample; the
# per-call overhead dominated the run time of the old loop.
predicted_emissions = np.asarray(model.predict(test_inputs, batch_size = 32)).reshape(-1)
true_emissions = np.asarray(test_outputs).reshape(-1)

del test_inputs, test_outputs

csv_file = 'emiss_comb_cnn.csv'
if os.path.exists(csv_file):
    # A table from an earlier run exists: add this run's predictions to it.
    df = pd.read_csv(csv_file)
    df['Pred Emiss 2'] = predicted_emissions
else:
    # First run: there is nothing to extend yet, so create the table
    # (previously this path failed with a NameError on `df`).
    df = pd.DataFrame({'True Emissions': true_emissions, 'Predicted Emissions': predicted_emissions})
df.to_csv(csv_file, index = False)


In [None]:
import pandas as pd

df = pd.read_csv('emiss_comb_cnn.csv')

# Ensemble mean of the four training runs stored in the table.
df['Average Emiss'] = ((df['Predicted Emissions'] + df['Pred Emiss 2'] + df['Pred Emiss 3'] + df['Pred Emiss 1'])/4).round(3)

# Signed error: positive when the ensemble under-predicts the true value.
df['error'] = df['True Emissions'] - df['Average Emiss']

# Series.abs() rather than the builtin abs(): the cell keeps working even if
# the name `abs` has been rebound in the kernel namespace.
df['absolute_error'] = df['error'].abs().round(3)

# Relative error in percent; the tiny epsilon guards against division by zero.
df['relative_error'] = ((df['error'] / (df['True Emissions'] + 1e-15)) * 100).round(3)

df.to_csv('emiss_comb_cnn.csv', index = False)

In [None]:
import pandas as pd

df = pd.read_csv('emiss_comb_cnn.csv')

# Quartiles of the absolute error. The column is already non-negative, so the
# former `.apply(abs)` pass was a no-op that also depended on the builtin
# `abs` not having been rebound in the kernel.
percentiles = df['absolute_error'].describe(percentiles = [0.25, 0.5, 0.75])
percentiles = percentiles.loc[['25%', '50%', '75%']] # 25% : 0.928, 50% : 1.502, 75% : 2.540 (All are in Mt/yr)
print(percentiles)


In [None]:
import pandas as pd

df = pd.read_csv('emiss_comb_cnn.csv')
# Series.abs() rather than the builtin abs(), which may have been rebound in the kernel.
df['abs_rel_error'] = df['relative_error'].abs()

# Quartiles of the magnitude of the relative error (values are already non-negative).
percentiles = df['abs_rel_error'].describe(percentiles = [0.25, 0.5, 0.75])
percentiles = percentiles.loc[['25%', '50%', '75%']] # 25% : 18.208, 50% : 46.256, 75% : 148.138 (in percent)
print(percentiles)


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import os

df = pd.read_csv('emiss_comb_cnn.csv')

# savefig does not create missing folders, so make sure the target exists.
os.makedirs('Figures/Intern', exist_ok = True)

# `fill` replaces the deprecated `shade` keyword of kdeplot.
plt.figure(figsize = (10, 6))
sns.kdeplot(df['absolute_error'], fill = True, color = 'b')
plt.title('Kernel Density Estimate of Absolute Error')
plt.xlabel('Absolute Error')
plt.ylabel('Density')
plt.xlim(0, 25)
plt.savefig('Figures/Intern/absolute_error_plot_comb.png', dpi = 300, bbox_inches = 'tight')
plt.show()

plt.figure(figsize = (10, 6))
sns.kdeplot(df['relative_error'], fill = True, color = 'b')
plt.title('Kernel Density Estimate of Relative Error')
plt.xlabel('Relative Error')
plt.ylabel('Density')
# The view is clipped to +/-180 %; values outside that window are not shown.
plt.xlim(-180, 180)
plt.savefig('Figures/Intern/relative_error_plot_comb.png', dpi = 300, bbox_inches = 'tight')
plt.show()


In [None]:
import pandas as pd

df = pd.read_csv('emiss_comb_cnn.csv')

# Named `abs_err`, not `abs`: binding `abs` at cell level shadows the builtin
# for every cell executed afterwards in the same kernel.
abs_err = df['absolute_error']

# Bins are left-open / right-closed, except the first which includes 0.
in0_2 = (abs_err >= 0) & (abs_err <= 2)
in2_5 = (abs_err > 2) & (abs_err <= 5)
in5_10 = (abs_err > 5) & (abs_err <= 10)
ab10 = abs_err > 10
mean = round(abs_err.mean(), 3)
median = round(abs_err.median(), 3)
std = round(abs_err.std(), 3)

print("in between 0 and 2 : ", in0_2.sum()) # 4652
print("in between 2 and 5 : ", in2_5.sum()) # 1189
print("in between 5 and 10 : ", in5_10.sum()) # 708
print("above 10 : ", ab10.sum()) # 147
print("mean : ", mean) # 2.332
print("median : ", median) # 1.502
print("std : ", std) # 2.47


In [None]:
import pandas as pd

df = pd.read_csv('emiss_comb_cnn.csv')

# Signed relative error in percent, as stored in the table.
rel = df['relative_error']

# 50-point-wide bins, open on the left and closed on the right, with
# open-ended tails below -150 % and above 100 %.
lessneg150 = rel.le(-150)
neg150_100 = rel.gt(-150) & rel.le(-100)
neg100_50 = rel.gt(-100) & rel.le(-50)
neg50_0 = rel.gt(-50) & rel.le(0)
in0_50 = rel.gt(0) & rel.le(50)
in50_100 = rel.gt(50) & rel.le(100)
ab100 = rel.gt(100)
mean, median, std = (round(stat, 3) for stat in (rel.mean(), rel.median(), rel.std()))

print("less than - 150% : ", lessneg150.sum()) # 1655
print("in between -150% and -100% : ", neg150_100.sum()) # 517
print("in between -100% and -50% : ", neg100_50.sum()) # 623
print("in between -50% and 0% : ", neg50_0.sum()) # 1404
print("in between 0% and 50% : ", in0_50.sum()) # 2045
print("in between 50% and 100% : ", in50_100.sum()) # 452
print("above 100% : ", ab100.sum()) # 0
print("mean : ", mean) # -89.679
print("median : ", median) # -28.515
print("std : ", std) # 182.786

In [None]:
import pandas as pd

df = pd.read_csv('emiss_comb_cnn.csv')
# Series.abs() rather than the builtin abs(), which may have been rebound in the kernel.
df['abs_rel_error'] = df['relative_error'].abs()

rel = df['abs_rel_error']

above150 = (rel > 150)
in150_100 = (rel > 100) & (rel <= 150)
in100_50 = (rel > 50) & (rel <= 100)
in50_20 = (rel > 20) & (rel <= 50)
# The lowest bin includes 0 so that an exact prediction is not left uncounted.
in20_0 = (rel >= 0) & (rel <= 20)

mean = round(rel.mean(), 3)
median = round(rel.median(), 3)
std = round(rel.std(), 3)

print("above 150% : ", above150.sum()) # 1655
print("in between 100% and 150% : ", in150_100.sum()) # 517
print("in between 50% and 100% : ", in100_50.sum()) # 1075
print("in between 20% and 50% : ", in50_20.sum()) # 1611
print("in between 0% and 20% : ", in20_0.sum()) # 1838
print("mean : ", mean) # 124.565
print("median : ", median) # 46.256
print("std : ", std) # 204.702

In [None]:
import pandas as pd

df = pd.read_csv('emiss_comb_cnn.csv')

def _describe(series):
    """Return (min, max, mean, std, median, range), each rounded to 3 decimals.

    The range is the difference of the already-rounded max and min.
    """
    lowest = round(series.min(), 3)
    highest = round(series.max(), 3)
    return (lowest, highest, round(series.mean(), 3), round(series.std(), 3),
            round(series.median(), 3), round(highest - lowest, 3))

true = df['True Emissions']
min_true, max_true, mean_true, std_true, median_true, range_true = _describe(true)

pred = df['Average Emiss']
min_pred, max_pred, mean_pred, std_pred, median_pred, range_pred = _describe(pred)

print("True statistics : ")
print()
print("min : ", min_true) # 0.065
print("max : ", max_true) # 30.114
print("mean : ", mean_true) # 7.221
print("std : ", std_true) # 7.931
print("median : ", median_true) # 2.042
print("range : ", range_true) # 30.049
print()
print("Predicted statistics : ")
print()
print("min : ", min_pred) # 1.308
print("max : ", max_pred) # 19.253
print("mean : ", mean_pred) # 6.62
print("std : ", std_pred) # 6.163
print("median : ", median_pred) # 2.199
print("range : ", range_pred) # 17.945

In [None]:
import numpy as np
import os
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, LeakyReLU, Dropout, BatchNormalization, MaxPool2D, Flatten, Dense
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau

def model_arch(input_shape):
    """Build the plain CNN regressor.

    Eight 3x3 ELU convolutions (3 x 32 filters, then 5 x 64) interleaved with
    dropout, batch normalisation and three 2x2 max-pools, followed by
    Flatten -> Dense(1) -> LeakyReLU.

    Parameters
    ----------
    input_shape : tuple passed as `input_shape` to the first Conv2D layer.

    Returns
    -------
    An uncompiled `Sequential` model with a single scalar output.
    """
    def conv(filters, **extra):
        # Every convolution in the network shares kernel size, activation and stride.
        return Conv2D(filters, (3, 3), activation = "elu", strides = 1, **extra)

    def pool():
        return MaxPool2D(pool_size = (2, 2), padding = "valid", strides = 2)

    return Sequential([
        conv(32, input_shape = input_shape),
        Dropout(0.1),
        conv(32),
        pool(),
        BatchNormalization(),
        conv(32),
        Dropout(0.2),
        conv(64),
        BatchNormalization(),
        conv(64),
        Dropout(0.2),
        conv(64),
        pool(),
        conv(64),
        Dropout(0.2),
        conv(64),
        pool(),
        Flatten(),
        Dense(1),
        LeakyReLU(alpha = 0.3),
    ])

def rmse(y_true, y_pred):
    """Root-mean-square error taken over all elements of the two tensors."""
    squared_residuals = tf.square(y_true - y_pred)
    return tf.sqrt(tf.reduce_mean(squared_residuals))
    
# Dataset splits produced by the export cells.
train_inputs = np.load('train_inputs.npy')
train_outputs = np.load('train_outputs.npy')
val_inputs = np.load('valid_inputs.npy')
val_outputs = np.load('valid_outputs.npy')
test_inputs = np.load('test_inputs.npy')
test_outputs = np.load('test_outputs.npy')

checkpoint_dir = './saved_models/'
os.makedirs(checkpoint_dir, exist_ok = True)

# No save_best_only: the same weights file is rewritten after every epoch.
check = os.path.join(checkpoint_dir, 'cnn_reg_saudi.h5')
check_call = ModelCheckpoint(filepath = check, save_weights_only = True, verbose = 1)

# Augmentation is applied to the training stream only; validation data is passed raw.
# NOTE(review): shear_range is given in degrees, so 90 is an extreme setting — confirm it is intended.
data_augmentation = ImageDataGenerator(rotation_range = 180, width_shift_range = 0.0, height_shift_range = 0.0, shear_range = 90, zoom_range = 0.2, horizontal_flip = True, vertical_flip = True, fill_mode = 'nearest')

model = model_arch(input_shape = (64, 64, 4))
# Uncomment to resume from the last checkpoint instead of training from scratch.
#model.load_weights('saved_models/cnn_reg_saudi.h5')
optimizer = Adam(learning_rate = 1e-3)
lr_check = ReduceLROnPlateau(monitor = 'val_loss', factor = 0.75, patience = 30, verbose = 1, min_delta = 5e-3, cooldown = 0, min_lr = 5e-6)
model.compile(optimizer = optimizer, loss = tf.keras.losses.Huber())
model.summary() # Around 186000 params

history = model.fit(data_augmentation.flow(train_inputs, train_outputs, batch_size = 32, shuffle = True), epochs = 150, steps_per_epoch = len(train_inputs) // 32,
                    validation_data = (val_inputs, val_outputs), validation_batch_size = 32,
                    callbacks = [lr_check, check_call])

test_loss = model.evaluate(test_inputs, test_outputs, batch_size = None)
print(f'Test Loss: {test_loss}')

# Free the training arrays; only the test split is needed from here on.
del train_inputs, train_outputs, val_inputs, val_outputs

# One batched forward pass replaces one predict() call per test sample; the
# per-call overhead dominated the run time of the old loop.
predicted_emissions = np.asarray(model.predict(test_inputs, batch_size = 32)).reshape(-1)
true_emissions = np.asarray(test_outputs).reshape(-1)

del test_inputs, test_outputs

csv_file = 'emiss_saudi_cnn.csv'
if os.path.exists(csv_file):
    # A table from an earlier run exists: add this run's predictions to it.
    df = pd.read_csv(csv_file)
    df['Pred Emiss 2'] = predicted_emissions
else:
    # First run: there is nothing to extend yet, so create the table
    # (previously this path failed with a NameError on `df`).
    df = pd.DataFrame({'True Emissions': true_emissions, 'Predicted Emissions': predicted_emissions})
df.to_csv(csv_file, index = False)


In [None]:
import pandas as pd

df = pd.read_csv('emiss_saudi_cnn.csv')

# Ensemble mean of the four training runs stored in the table.
df['Average Emiss'] = ((df['Predicted Emissions'] + df['Pred Emiss 2'] + df['Pred Emiss 1'] + df['Pred Emiss 3'])/4).round(3)

# Signed error: positive when the ensemble under-predicts the true value.
df['error'] = df['True Emissions'] - df['Average Emiss']

# Series.abs() rather than the builtin abs(): the cell keeps working even if
# the name `abs` has been rebound in the kernel namespace.
df['absolute_error'] = df['error'].abs().round(3)

# Relative error in percent; the tiny epsilon guards against division by zero.
df['relative_error'] = ((df['error'] / (df['True Emissions'] + 1e-15)) * 100).round(3)

df.to_csv('emiss_saudi_cnn.csv', index = False)

In [None]:
import pandas as pd

df = pd.read_csv('emiss_saudi_cnn.csv')

# Quartiles of the absolute error. The column is already non-negative, so the
# former `.apply(abs)` pass was a no-op that also depended on the builtin
# `abs` not having been rebound in the kernel.
percentiles = df['absolute_error'].describe(percentiles = [0.25, 0.5, 0.75])
percentiles = percentiles.loc[['25%', '50%', '75%']] # 25% : 0.528, 50% : 0.951, 75% : 1.328 (All are in Mt/yr)
print(percentiles)

In [None]:
import pandas as pd

df = pd.read_csv('emiss_saudi_cnn.csv')
# Series.abs() rather than the builtin abs(), which may have been rebound in the kernel.
df['abs_rel_error'] = df['relative_error'].abs()

# Quartiles of the magnitude of the relative error (values are already non-negative).
percentiles = df['abs_rel_error'].describe(percentiles = [0.25, 0.5, 0.75])
percentiles = percentiles.loc[['25%', '50%', '75%']] # 25% : 33.114, 50% : 78.222, 75% : 154.050 (in percent)
print(percentiles)


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import os

df = pd.read_csv('emiss_saudi_cnn.csv')

# savefig does not create missing folders, so make sure the target exists.
os.makedirs('Figures/Intern', exist_ok = True)

# `fill` replaces the deprecated `shade` keyword of kdeplot.
plt.figure(figsize = (10, 6))
sns.kdeplot(df['absolute_error'], fill = True, color = 'b')
plt.title('Kernel Density Estimate of Absolute Error')
plt.xlabel('Absolute Error')
plt.ylabel('Density')
plt.xlim(0, 25)
plt.savefig('Figures/Intern/absolute_error_plot_saudi.png', dpi = 300, bbox_inches = 'tight')
plt.show()

plt.figure(figsize = (10, 6))
sns.kdeplot(df['relative_error'], fill = True, color = 'b')
plt.title('Kernel Density Estimate of Relative Error')
plt.xlabel('Relative Error')
plt.ylabel('Density')
# The view is clipped to +/-180 %; values outside that window are not shown.
plt.xlim(-180, 180)
plt.savefig('Figures/Intern/relative_error_plot_saudi.png', dpi = 300, bbox_inches = 'tight')
plt.show()

In [None]:
import pandas as pd

df = pd.read_csv('emiss_saudi_cnn.csv')

# Named `abs_err`, not `abs`: binding `abs` at cell level shadows the builtin
# for every cell executed afterwards in the same kernel.
abs_err = df['absolute_error']

# Bins are left-open / right-closed, except the first which includes 0.
in0_2 = (abs_err >= 0) & (abs_err <= 2)
in2_5 = (abs_err > 2) & (abs_err <= 5)
in5_10 = (abs_err > 5) & (abs_err <= 10)
ab10 = abs_err > 10
mean = round(abs_err.mean(), 3)
median = round(abs_err.median(), 3)
std = round(abs_err.std(), 3)

print("in between 0 and 2 : ", in0_2.sum()) # 3861
print("in between 2 and 5 : ", in2_5.sum()) # 212
print("in between 5 and 10 : ", in5_10.sum()) # 232
print("above 10 : ", ab10.sum()) # 87
print("mean : ", mean) # 1.514
print("median : ", median) # 0.951
print("std : ", std) # 2.252

In [None]:
import pandas as pd

df = pd.read_csv('emiss_saudi_cnn.csv')

# Signed relative error in percent, as stored in the table.
rel = df['relative_error']

# 50-point-wide bins, open on the left and closed on the right, with
# open-ended tails below -150 % and above 100 %.
lessneg150 = rel.le(-150)
neg150_100 = rel.gt(-150) & rel.le(-100)
neg100_50 = rel.gt(-100) & rel.le(-50)
neg50_0 = rel.gt(-50) & rel.le(0)
in0_50 = rel.gt(0) & rel.le(50)
in50_100 = rel.gt(50) & rel.le(100)
ab100 = rel.gt(100)
mean, median, std = (round(stat, 3) for stat in (rel.mean(), rel.median(), rel.std()))

print("less than - 150% : ", lessneg150.sum()) # 1137
print("in between -150% and -100% : ", neg150_100.sum()) # 521
print("in between -100% and -50% : ", neg100_50.sum()) # 656
print("in between -50% and 0% : ", neg50_0.sum()) # 774
print("in between 0% and 50% : ", in0_50.sum()) # 748
print("in between 50% and 100% : ", in50_100.sum()) # 556
print("above 100% : ", ab100.sum()) # 0
print("mean : ", mean) # -108.167
print("median : ", median) # -58.006
print("std : ", std) # 199.331

In [None]:
import pandas as pd

df = pd.read_csv('emiss_saudi_cnn.csv')
# Series.abs() rather than the builtin abs(), which may have been rebound in the kernel.
df['abs_rel_error'] = df['relative_error'].abs()

rel = df['abs_rel_error']

above150 = (rel > 150)
in150_100 = (rel > 100) & (rel <= 150)
in100_50 = (rel > 50) & (rel <= 100)
in50_20 = (rel > 20) & (rel <= 50)
# The lowest bin includes 0 so that an exact prediction is not left uncounted.
in20_0 = (rel >= 0) & (rel <= 20)

mean = round(rel.mean(), 3)
median = round(rel.median(), 3)
std = round(rel.std(), 3)

print("above 150% : ", above150.sum()) # 1137
print("in between 100% and 150% : ", in150_100.sum()) # 521
print("in between 50% and 100% : ", in100_50.sum()) # 1212
print("in between 20% and 50% : ", in50_20.sum()) # 798
print("in between 0% and 20% : ", in20_0.sum()) # 724
print("mean : ", mean) # 134.196
print("median : ", median) # 78.222
print("std : ", std) # 182.819

In [None]:
import pandas as pd

df = pd.read_csv('emiss_saudi_cnn.csv')

def _describe(series):
    """Return (min, max, mean, std, median, range), each rounded to 3 decimals.

    The range is the difference of the already-rounded max and min.
    """
    lowest = round(series.min(), 3)
    highest = round(series.max(), 3)
    return (lowest, highest, round(series.mean(), 3), round(series.std(), 3),
            round(series.median(), 3), round(highest - lowest, 3))

true = df['True Emissions']
min_true, max_true, mean_true, std_true, median_true, range_true = _describe(true)

pred = df['Average Emiss']
min_pred, max_pred, mean_pred, std_pred, median_pred, range_pred = _describe(pred)

print("True statistics : ")
print()
print("min : ", min_true) # 0.065
print("max : ", max_true) # 22.805
print("mean : ", mean_true) # 2.043
print("std : ", std_true) # 2.7
print("median : ", median_true) # 1.118
print("range : ", range_true) # 22.74
print()
print("Predicted statistics : ")
print()
print("min : ", min_pred) # 0.368
print("max : ", max_pred) # 11.53
print("mean : ", mean_pred) # 2.524
print("std : ", std_pred) # 1.752
print("median : ", median_pred) # 1.89
print("range : ", range_pred) # 11.162

U-Net Regression

In [None]:
import numpy as np
import os
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Conv2D, LeakyReLU, Dropout, BatchNormalization, MaxPool2D, Flatten, Dense, Conv2DTranspose, UpSampling2D, Concatenate
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau

def unetreg(input_shape, dropout_rate = 0.2):
    """Build a U-Net-style encoder/decoder that regresses one scalar per sample.

    Three encoder stages (64, 128, 256 filters) with 2x2 max-pooling, a
    512-filter bottleneck, and three decoder stages that upsample and
    concatenate the matching encoder activations (taken before dropout).
    The final single-channel map is flattened and fed to a linear Dense(1).

    Parameters
    ----------
    input_shape : shape passed to `tf.keras.Input`; the shape comments below
        assume a (64, 64, C) input.
    dropout_rate : rate used by every Dropout layer in the network.

    Returns
    -------
    An uncompiled `Model` mapping the input image to a single value.
    """
    inputs = tf.keras.Input(input_shape)
    
    c1 = Conv2D(64, 3, activation = 'relu', padding = 'same', kernel_initializer = 'he_uniform')(inputs) # Shape : (64, 64, 64)
    c1 = BatchNormalization()(c1)
    c1 = Conv2D(64, 3, activation = 'relu', padding = 'same', kernel_initializer = 'he_uniform')(c1) # Shape : (64, 64, 64)
    c1 = BatchNormalization()(c1)
    d1 = Dropout(dropout_rate)(c1) 
    p1 = MaxPool2D(pool_size = (2, 2))(d1) # Shape : (32, 32, 64) 
    
    c2 = Conv2D(128, 3, activation = 'relu', padding = 'same', kernel_initializer = 'he_uniform')(p1) # Shape : (32, 32, 128)
    c2 = BatchNormalization()(c2)
    c2 = Conv2D(128, 3, activation = 'relu', padding = 'same', kernel_initializer = 'he_uniform')(c2) # Shape : (32, 32, 128)
    c2 = BatchNormalization()(c2)
    d2 = Dropout(dropout_rate)(c2)
    p2 = MaxPool2D(pool_size = (2, 2))(d2) # Shape : (16, 16, 128)
    
    c3 = Conv2D(256, 3, activation = 'relu', padding = 'same', kernel_initializer = 'he_uniform')(p2) # Shape : (16, 16, 256)
    c3 = BatchNormalization()(c3)
    c3 = Conv2D(256, 3, activation = 'relu', padding = 'same', kernel_initializer = 'he_uniform')(c3) # Shape : (16, 16, 256)
    c3 = BatchNormalization()(c3)
    d3 = Dropout(dropout_rate)(c3)
    p3 = MaxPool2D(pool_size = (2, 2))(d3) # Shape : (8, 8, 256)

    # Bottleneck
    c4 = Conv2D(512, 3, activation = 'relu', padding = 'same', kernel_initializer = 'he_uniform')(p3) # Shape : (8, 8, 512)
    c4 = BatchNormalization()(c4)
    c4 = Conv2D(512, 3, activation = 'relu', padding = 'same', kernel_initializer = 'he_uniform')(c4) # Shape : (8, 8, 512)
    c4 = BatchNormalization()(c4)
    d4 = Dropout(dropout_rate)(c4)
    
    # Decoder (upsampling path): the stride-1 transposed convolution only changes
    # the channel count; the spatial doubling is done by UpSampling2D.
    u5 = Conv2DTranspose(256, 3, activation = 'relu', padding = 'same', kernel_initializer = 'he_uniform')(d4) # Shape : (8, 8, 256)
    u5 = UpSampling2D((2, 2))(u5) # Shape : (16, 16, 256)
    concat5 = Concatenate()([u5, c3]) # Shape : (16, 16, 512) = 256 upsampled + 256 skip channels
    c5 = Conv2D(256, 3, activation = 'relu', padding = 'same', kernel_initializer = 'he_uniform')(concat5) # Shape : (16, 16, 256)
    c5 = BatchNormalization()(c5)
    c5 = Conv2D(256, 3, activation = 'relu', padding = 'same', kernel_initializer = 'he_uniform')(c5) # Shape : (16, 16, 256)
    c5 = BatchNormalization()(c5)
    d5 = Dropout(dropout_rate)(c5)
    
    u6 = Conv2DTranspose(128, 3, activation = 'relu', padding = 'same', kernel_initializer = 'he_uniform')(d5) # Shape : (16, 16, 128)
    u6 = UpSampling2D((2, 2))(u6) # Shape : (32, 32, 128)
    concat6 = Concatenate()([u6, c2]) # Shape : (32, 32, 256) = 128 upsampled + 128 skip channels
    c6 = Conv2D(128, 3, activation = 'relu', padding = 'same', kernel_initializer = 'he_uniform')(concat6) # Shape : (32, 32, 128)
    c6 = BatchNormalization()(c6)
    c6 = Conv2D(128, 3, activation = 'relu', padding = 'same', kernel_initializer = 'he_uniform')(c6) # Shape : (32, 32, 128)
    c6 = BatchNormalization()(c6)
    d6 = Dropout(dropout_rate)(c6)
    
    u7 = Conv2DTranspose(64, 3, activation = 'relu', padding = 'same', kernel_initializer = 'he_uniform')(d6) # Shape : (32, 32, 64)
    u7 = UpSampling2D((2, 2))(u7) # Shape : (64, 64, 64)
    concat7 = Concatenate()([u7, c1])  # Shape : (64, 64, 128) = 64 upsampled + 64 skip channels
    c7 = Conv2D(64, 3, activation = 'relu', padding = 'same', kernel_initializer = 'he_uniform')(concat7) # Shape : (64, 64, 64)
    c7 = BatchNormalization()(c7)
    c7 = Conv2D(1, 3, activation = 'relu', padding = 'same', kernel_initializer = 'glorot_uniform')(c7) # Shape : (64, 64, 1)
    c7 = BatchNormalization()(c7)
    
    # Output layer: collapse the single-channel map into one regression value
    flat = Flatten()(c7) # Shape : (4096,) = 64*64*1
    output = Dense(1, activation = 'linear')(flat) # Shape : (1,)
    
    model = Model(inputs = inputs, outputs = output)
    return model
    
def rmse(y_true, y_pred):
    """Root-mean-square error taken over all elements of the two tensors."""
    squared_residuals = tf.square(y_true - y_pred)
    return tf.sqrt(tf.reduce_mean(squared_residuals))
    
# Combined-dataset splits produced by the export cells.
train_inputs = np.load('train_comb_inputs.npy')
train_outputs = np.load('train_comb_outputs.npy')
val_inputs = np.load('valid_comb_inputs.npy')
val_outputs = np.load('valid_comb_outputs.npy')
test_inputs = np.load('test_comb_inputs.npy')
test_outputs = np.load('test_comb_outputs.npy')

checkpoint_dir = './saved_models/'
os.makedirs(checkpoint_dir, exist_ok = True)

# No save_best_only: the same weights file is rewritten after every epoch.
check = os.path.join(checkpoint_dir, 'unet_reg_comb.h5')
check_call = ModelCheckpoint(filepath = check, save_weights_only = True, verbose = 1)

# Augmentation is applied to the training stream only; validation data is passed raw.
# NOTE(review): shear_range is given in degrees, so 90 is an extreme setting — confirm it is intended.
data_augmentation = ImageDataGenerator(rotation_range = 180, width_shift_range = 0.0, height_shift_range = 0.0, shear_range = 90, zoom_range = 0.2, horizontal_flip = True, vertical_flip = True, fill_mode = 'nearest')

model = unetreg(input_shape = (64, 64, 4))
# Uncomment to resume from the last checkpoint instead of training from scratch.
#model.load_weights('saved_models/unet_reg_comb.h5')
optimizer = Adam(learning_rate = 1e-3)
lr_check = ReduceLROnPlateau(monitor = 'val_loss', factor = 0.75, patience = 20, verbose = 1, min_delta = 5e-3, cooldown = 0, min_lr = 5e-6)
model.compile(optimizer = optimizer, loss = tf.keras.losses.Huber())
model.summary() # Around 8 million params

history = model.fit(data_augmentation.flow(train_inputs, train_outputs, batch_size = 32, shuffle = True), epochs = 150, steps_per_epoch = len(train_inputs) // 32,
                    validation_data = (val_inputs, val_outputs), validation_batch_size = 32,
                    callbacks = [lr_check, check_call])

test_loss = model.evaluate(test_inputs, test_outputs, batch_size = None)
print(f'Test Loss: {test_loss}')

# Free the training arrays; only the test split is needed from here on.
del train_inputs, train_outputs, val_inputs, val_outputs

# One batched forward pass replaces one predict() call per test sample; the
# per-call overhead dominated the run time of the old loop.
predicted_emissions = np.asarray(model.predict(test_inputs, batch_size = 32)).reshape(-1)
true_emissions = np.asarray(test_outputs).reshape(-1)

del test_inputs, test_outputs

# First run of the ensemble: start a fresh results table. Any existing file is
# replaced, exactly as before; the former read of the old file was dead code
# because its result was overwritten on the next line.
# NOTE(review): for later runs, load the file and add a 'Pred Emiss N' column
# instead of recreating the table, otherwise earlier runs are lost.
csv_file = 'emiss_comb_unet.csv'
df = pd.DataFrame({'True Emissions': true_emissions, 'Predicted Emissions': predicted_emissions})
df.to_csv(csv_file, index = False)

In [None]:
import pandas as pd

df = pd.read_csv('emiss_comb_unet.csv')

# Ensemble mean of the four training runs stored in the table.
df['Average Emiss'] = ((df['Pred Emiss 2'] + df['Pred Emiss 1'] + df['Predicted Emissions'] + df['Pred Emiss 3'] )/4).round(3)

# Signed error: positive when the ensemble under-predicts the true value.
df['error'] = df['True Emissions'] - df['Average Emiss']

# Series.abs() rather than the builtin abs(): the cell keeps working even if
# the name `abs` has been rebound in the kernel namespace.
df['absolute_error'] = df['error'].abs().round(3)

# Relative error in percent; the tiny epsilon guards against division by zero.
df['relative_error'] = ((df['error'] / (df['True Emissions'] + 1e-15)) * 100).round(3)

df.to_csv('emiss_comb_unet.csv', index = False)

In [None]:
import pandas as pd

df = pd.read_csv('emiss_comb_unet.csv')

# Quartiles of the absolute error. The column is already non-negative, so the
# former `.apply(abs)` pass was a no-op that also depended on the builtin
# `abs` not having been rebound in the kernel.
percentiles = df['absolute_error'].describe(percentiles = [0.25, 0.5, 0.75])
percentiles = percentiles.loc[['25%', '50%', '75%']] # 25% : 0.484, 50% : 1.069, 75% : 2.786 (All are in Mt/yr)
print(percentiles)

In [None]:
import pandas as pd

df = pd.read_csv('emiss_comb_unet.csv')
# Series.abs() rather than the builtin abs(), which may have been rebound in the kernel.
df['abs_rel_error'] = df['relative_error'].abs()

# Quartiles of the magnitude of the relative error (values are already non-negative).
percentiles = df['abs_rel_error'].describe(percentiles = [0.25, 0.5, 0.75])
percentiles = percentiles.loc[['25%', '50%', '75%']] # 25% : 14.921, 50% : 37.68, 75% : 81.439 (in percent)
print(percentiles)


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import os

df = pd.read_csv('emiss_comb_unet.csv')

# savefig does not create missing folders, so make sure the target exists.
os.makedirs('Figures/Intern', exist_ok = True)

# `fill` replaces the deprecated `shade` keyword of kdeplot.
plt.figure(figsize = (10, 6))
sns.kdeplot(df['absolute_error'], fill = True, color = 'b')
plt.title('Kernel Density Estimate of Absolute Error')
plt.xlabel('Absolute Error')
plt.ylabel('Density')
plt.xlim(0, 25)
plt.savefig('Figures/Intern/absolute_error_plot_unet_comb.png', dpi = 300, bbox_inches = 'tight')
plt.show()

plt.figure(figsize = (10, 6))
sns.kdeplot(df['relative_error'], fill = True, color = 'b')
plt.title('Kernel Density Estimate of Relative Error')
plt.xlabel('Relative Error')
plt.ylabel('Density')
# The view is clipped to +/-180 %; values outside that window are not shown.
plt.xlim(-180, 180)
plt.savefig('Figures/Intern/relative_error_plot_unet_comb.png', dpi = 300, bbox_inches = 'tight')
plt.show()

In [None]:
import pandas as pd

df = pd.read_csv('emiss_comb_unet.csv')

# Named `abs_err`, not `abs`: binding `abs` at cell level shadows the builtin
# for every cell executed afterwards in the same kernel.
abs_err = df['absolute_error']

# Bins are left-open / right-closed, except the first which includes 0.
in0_2 = (abs_err >= 0) & (abs_err <= 2)
in2_5 = (abs_err > 2) & (abs_err <= 5)
in5_10 = (abs_err > 5) & (abs_err <= 10)
ab10 = abs_err > 10
mean = round(abs_err.mean(), 3)
median = round(abs_err.median(), 3)
std = round(abs_err.std(), 3)

print("in between 0 and 2 : ", in0_2.sum()) # 4546
print("in between 2 and 5 : ", in2_5.sum()) # 1310
print("in between 5 and 10 : ", in5_10.sum()) # 748
print("above 10 : ", ab10.sum()) # 92
print("mean : ", mean) # 2.076
print("median : ", median) # 1.069
print("std : ", std) # 2.426

In [None]:
import pandas as pd

df = pd.read_csv('emiss_comb_unet.csv')

# Signed relative error in percent, as stored in the table.
rel = df['relative_error']

# 50-point-wide bins, open on the left and closed on the right, with
# open-ended tails below -150 % and above 100 %.
lessneg150 = rel.le(-150)
neg150_100 = rel.gt(-150) & rel.le(-100)
neg100_50 = rel.gt(-100) & rel.le(-50)
neg50_0 = rel.gt(-50) & rel.le(0)
in0_50 = rel.gt(0) & rel.le(50)
in50_100 = rel.gt(50) & rel.le(100)
ab100 = rel.gt(100)
mean, median, std = (round(stat, 3) for stat in (rel.mean(), rel.median(), rel.std()))

print("less than - 150% : ", lessneg150.sum()) # 937
print("in between -150% and -100% : ", neg150_100.sum()) # 406
print("in between -100% and -50% : ", neg100_50.sum()) # 854
print("in between -50% and 0% : ", neg50_0.sum()) # 2275
print("in between 0% and 50% : ", in0_50.sum()) # 1666
print("in between 50% and 100% : ", in50_100.sum()) # 558
print("above 100% : ", ab100.sum()) # 0
print("mean : ", mean) # -63.52
print("median : ", median) # -18.568
print("std : ", std) # 172.123

In [None]:
import pandas as pd

df = pd.read_csv('emiss_comb_unet.csv')
# Series.abs() rather than the builtin abs(), which may have been rebound in the kernel.
df['abs_rel_error'] = df['relative_error'].abs()

rel = df['abs_rel_error']

above150 = (rel > 150)
in150_100 = (rel > 100) & (rel <= 150)
in100_50 = (rel > 50) & (rel <= 100)
in50_20 = (rel > 20) & (rel <= 50)
# The lowest bin includes 0 so that an exact prediction is not left uncounted.
in20_0 = (rel >= 0) & (rel <= 20)

mean = round(rel.mean(), 3)
median = round(rel.median(), 3)
std = round(rel.std(), 3)

print("above 150% : ", above150.sum()) # 937
print("in between 100% and 150% : ", in150_100.sum()) # 406
print("in between 50% and 100% : ", in100_50.sum()) # 1412
print("in between 20% and 50% : ", in50_20.sum()) # 1755
print("in between 0% and 20% : ", in20_0.sum()) # 2186
print("mean : ", mean) # 84.385
print("median : ", median) # 37.68
print("std : ", std) # 162.911

In [None]:
import pandas as pd

df = pd.read_csv('emiss_comb_unet.csv')

def _describe(series):
    """Return (min, max, mean, std, median, range), each rounded to 3 decimals.

    The range is the difference of the already-rounded max and min.
    """
    lowest = round(series.min(), 3)
    highest = round(series.max(), 3)
    return (lowest, highest, round(series.mean(), 3), round(series.std(), 3),
            round(series.median(), 3), round(highest - lowest, 3))

true = df['True Emissions']
min_true, max_true, mean_true, std_true, median_true, range_true = _describe(true)

pred = df['Average Emiss']
min_pred, max_pred, mean_pred, std_pred, median_pred, range_pred = _describe(pred)

print("True statistics : ")
print()
print("min : ", min_true) # 0.065
print("max : ", max_true) # 30.114
print("mean : ", mean_true) # 7.221
print("std : ", std_true) # 7.931
print("median : ", median_true) # 2.042
print("range : ", range_true) # 30.049
print()
print("Predicted statistics : ")
print()
print("min : ", min_pred) # 0.265
print("max : ", max_pred) # 26.135
print("mean : ", mean_pred) # 7.689
print("std : ", std_pred) # 8.304
print("median : ", median_pred) # 2.07
print("range : ", range_pred) # 25.87

In [None]:
import numpy as np
import os
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Conv2D, LeakyReLU, Dropout, BatchNormalization, MaxPool2D, Flatten, Dense, Conv2DTranspose, UpSampling2D, Concatenate
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau

def unetreg(input_shape, dropout_rate = 0.2):
    """Build a U-Net-style encoder/decoder that regresses a single scalar.

    Three down-sampling stages (64, 128, 256 filters) feed a 512-filter
    bottleneck. Three up-sampling stages mirror them; each is concatenated
    with the encoder features of the same resolution taken *before* dropout.
    The final one-channel map is flattened and fed to a linear Dense(1).

    Args:
        input_shape: shape of one sample without the batch axis. The shape
            comments below assume a 64x64 spatial size (the value used by the
            caller in this cell). Height and width need to be divisible by 8
            so the up-sampled tensors line up with the skip connections.
        dropout_rate: rate shared by every Dropout layer.

    Returns:
        An uncompiled Keras `Model` with one output unit per sample.
    """
    def conv_bn(tensor, filters, initializer = 'he_uniform'):
        # 3x3 'same' ReLU convolution followed by batch normalisation.
        tensor = Conv2D(filters, 3, activation = 'relu', padding = 'same', kernel_initializer = initializer)(tensor)
        return BatchNormalization()(tensor)

    def double_conv(tensor, filters):
        # Two conv+BN layers, then dropout. Both tensors are returned because
        # the skip connections consume the features before dropout.
        features = conv_bn(conv_bn(tensor, filters), filters)
        return features, Dropout(dropout_rate)(features)

    def upsample_and_merge(tensor, skip, filters):
        # Transposed conv (changes channels only), 2x up-sampling, then
        # channel-wise concatenation with the matching encoder features.
        tensor = Conv2DTranspose(filters, 3, activation = 'relu', padding = 'same', kernel_initializer = 'he_uniform')(tensor)
        tensor = UpSampling2D((2, 2))(tensor)
        return Concatenate()([tensor, skip])

    inputs = tf.keras.Input(input_shape)

    # Encoder (down-sampling path)
    c1, d1 = double_conv(inputs, 64)              # Shape : (64, 64, 64)
    p1 = MaxPool2D(pool_size = (2, 2))(d1)        # Shape : (32, 32, 64)
    c2, d2 = double_conv(p1, 128)                 # Shape : (32, 32, 128)
    p2 = MaxPool2D(pool_size = (2, 2))(d2)        # Shape : (16, 16, 128)
    c3, d3 = double_conv(p2, 256)                 # Shape : (16, 16, 256)
    p3 = MaxPool2D(pool_size = (2, 2))(d3)        # Shape : (8, 8, 256)

    # Bottleneck
    _, d4 = double_conv(p3, 512)                  # Shape : (8, 8, 512)

    # Decoder (up-sampling path); concatenation doubles the channel count.
    concat5 = upsample_and_merge(d4, c3, 256)     # Shape : (16, 16, 512)
    _, d5 = double_conv(concat5, 256)             # Shape : (16, 16, 256)
    concat6 = upsample_and_merge(d5, c2, 128)     # Shape : (32, 32, 256)
    _, d6 = double_conv(concat6, 128)             # Shape : (32, 32, 128)
    concat7 = upsample_and_merge(d6, c1, 64)      # Shape : (64, 64, 128)
    c7 = conv_bn(concat7, 64)                     # Shape : (64, 64, 64)
    c7 = conv_bn(c7, 1, initializer = 'glorot_uniform')  # Shape : (64, 64, 1)

    # Output layer
    flat = Flatten()(c7)                          # Shape : 64*64*1
    output = Dense(1, activation = 'linear')(flat)  # Shape : 1

    return Model(inputs = inputs, outputs = output)
    
def rmse(y_true, y_pred):
    """Root-mean-square error taken over every element of `y_true - y_pred`."""
    squared_error = tf.square(y_true - y_pred)
    return tf.sqrt(tf.reduce_mean(squared_error))
    
# --- Load the pre-split arrays ------------------------------------------------
train_inputs = np.load('train_inputs.npy')
train_outputs = np.load('train_outputs.npy')
val_inputs = np.load('valid_inputs.npy')
val_outputs = np.load('valid_outputs.npy')
test_inputs = np.load('test_inputs.npy')
test_outputs = np.load('test_outputs.npy')

# --- Checkpointing: weights are written after every epoch ---------------------
checkpoint_dir = './saved_models/' 
os.makedirs(checkpoint_dir, exist_ok = True)

check = os.path.join(checkpoint_dir, 'unet_reg_saudi.h5')
check_call = ModelCheckpoint(filepath = check, save_weights_only = True, verbose = 1)

# Geometric augmentation only (rotation, shear, zoom, flips); no translation.
data_augmentation = ImageDataGenerator(rotation_range = 180, width_shift_range = 0.0, height_shift_range = 0.0, shear_range = 90, zoom_range = 0.2, horizontal_flip = True, vertical_flip = True, fill_mode = 'nearest')

# --- Model, optimiser and learning-rate schedule ------------------------------
model = unetreg(input_shape = (64, 64, 4))
# model.load_weights('saved_models/unet_reg_saudi.h5')
optimizer = Adam(learning_rate = 1e-3)
# Multiply the learning rate by 0.75 after 30 epochs without a val_loss
# improvement of at least 5e-3, never going below 5e-6.
lr_check = ReduceLROnPlateau(monitor = 'val_loss', factor = 0.75, patience = 30, verbose = 1, min_delta = 5e-3, cooldown = 0, min_lr = 5e-6)
model.compile(optimizer = optimizer, loss = 'mse')
# tf.keras.losses.Huber()
# NOTE(review): the original note said "around 5 million params"; summing the
# layers of unetreg gives roughly 8.5 million - confirm against this output.
model.summary()

# --- Training -----------------------------------------------------------------
history = model.fit(data_augmentation.flow(train_inputs, train_outputs, batch_size = 32, shuffle = True), epochs = 150, steps_per_epoch = len(train_inputs) // 32,
                    validation_data = (val_inputs, val_outputs), validation_batch_size = 32,
                    callbacks = [lr_check, check_call])

test_loss = model.evaluate(test_inputs, test_outputs, batch_size = None)
print(f'Test Loss: {test_loss}')

del train_inputs, train_outputs, val_inputs, val_outputs

# --- Test-set predictions -----------------------------------------------------
# One batched predict() call instead of one call per sample: the per-sample
# loop paid Keras' per-call overhead (and printed a progress bar) for every
# test sample while producing the same per-sample predictions.
predicted_emissions = model.predict(test_inputs).reshape(-1)
true_emissions = np.asarray(test_outputs).reshape(-1)

del test_inputs, test_outputs

# Manual switch for additional runs: load the existing CSV and add a new
# prediction column instead of recreating the file.
#csv_file = 'emiss_saudi_unet.csv'
#if os.path.exists(csv_file):
#    df = pd.read_csv(csv_file)

df = pd.DataFrame({'True Emissions': true_emissions, 'Predicted Emissions': predicted_emissions})
#df['Pred Emiss 2'] = predicted_emissions
df.to_csv('emiss_saudi_unet.csv', index = False)

In [None]:
import pandas as pd

# Average three prediction runs stored in the CSV and derive per-sample error
# columns, then write the enriched table back to the same file.
df = pd.read_csv('emiss_saudi_unet.csv')

# ('Pred Emiss 3' is not included in this average.)
run_total = df['Predicted Emissions'] + df['Pred Emiss 2'] + df['Pred Emiss 1']
df['Average Emiss'] = run_total.div(3).round(3)

# Signed error: positive when the averaged prediction is below the true value.
df['error'] = df['True Emissions'].sub(df['Average Emiss'])

df['absolute_error'] = df['error'].abs().round(3)

# Error as a percentage of the true value; the 1e-15 term keeps an exactly
# zero true value from producing a division by zero.
df['relative_error'] = df['error'].div(df['True Emissions'] + 1e-15).mul(100).round(3)

df.to_csv('emiss_saudi_unet.csv', index = False)

In [None]:
import pandas as pd

df = pd.read_csv('emiss_saudi_unet.csv')

# Quartiles of the absolute error, picked out of describe() by row label.
quartile_rows = ['25%', '50%', '75%']
summary = df['absolute_error'].describe(percentiles = [0.25, 0.5, 0.75])
percentiles = summary.loc[quartile_rows].abs() # 25% : 0.264, 50% : 0.568, 75% : 1.005 (All are in Mt/yr)
print(percentiles)

In [None]:
import pandas as pd

df = pd.read_csv('emiss_saudi_unet.csv')
# Magnitude of the signed percentage error.
df['abs_rel_error'] = df['relative_error'].abs()

# Quartiles of the absolute relative error, picked out of describe() by row label.
quartile_rows = ['25%', '50%', '75%']
summary = df['abs_rel_error'].describe(percentiles = [0.25, 0.5, 0.75])
percentiles = summary.loc[quartile_rows].abs() # 25% : 21.358, 50% : 46.738, 75% : 89.396 (Unitless)
print(percentiles)


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

df = pd.read_csv('emiss_saudi_unet.csv')


def _plot_error_kde(values, title, xlabel, xlim, out_path):
    """Draw, save and show a filled kernel-density estimate of one error column.

    Args:
        values: the Series to plot.
        title: figure title.
        xlabel: x-axis label.
        xlim: (low, high) limits for the x axis.
        out_path: file the figure is saved to (300 dpi, tight bounding box).
    """
    plt.figure(figsize = (10, 6))
    # `fill` replaces the deprecated `shade` argument and matches the other
    # kdeplot calls in this notebook.
    sns.kdeplot(values, fill = True, color = 'b')
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel('Density')
    plt.xlim(*xlim)
    plt.savefig(out_path, dpi = 300, bbox_inches = 'tight')
    plt.show()


_plot_error_kde(df['absolute_error'], 'Kernel Density Estimate of Absolute Error', 'Absolute Error',
                (0, 25), 'Figures/Intern/absolute_error_plot_unet_saudi.png')

_plot_error_kde(df['relative_error'], 'Kernel Density Estimate of Relative Error', 'Relative Error',
                (-180, 180), 'Figures/Intern/relative_error_plot_unet_saudi.png')

In [None]:
import pandas as pd

df = pd.read_csv('emiss_saudi_unet.csv')

# Deliberately not named `abs`: rebinding the builtin to a Series would make
# every later `abs(...)` call in the same kernel fail with
# "'Series' object is not callable".
abs_err = df['absolute_error']

# Count samples per absolute-error band; bands are (lower, upper] except the
# first, which also includes 0.
in0_2 = (abs_err >= 0) & (abs_err <= 2)
in2_5 = (abs_err > 2) & (abs_err <= 5)
in5_10 = (abs_err > 5) & (abs_err <= 10)
ab10 = abs_err > 10
mean = round(abs_err.mean(), 3)
median = round(abs_err.median(), 3)
std = round(abs_err.std(), 3)

# Trailing comments are the values recorded from an earlier run.
print("in between 0 and 2 : ", in0_2.sum()) # 3863
print("in between 2 and 5 : ", in2_5.sum()) # 243
print("in between 5 and 10 : ", in5_10.sum()) # 216
print("above 10 : ", ab10.sum()) # 70
print("mean : ", mean) # 1.22
print("median : ", median) # 0.568
print("std : ", std) # 2.153

In [None]:
import pandas as pd

df = pd.read_csv('emiss_saudi_unet.csv')

rel = df['relative_error']

# Half-open bands (lower, upper] over the signed relative error in percent.
# The error is stored as true minus predicted, so negative values are
# over-predictions.
lessneg150 = rel.le(-150)
neg150_100 = rel.gt(-150) & rel.le(-100)
neg100_50 = rel.gt(-100) & rel.le(-50)
neg50_0 = rel.gt(-50) & rel.le(0)
in0_50 = rel.gt(0) & rel.le(50)
in50_100 = rel.gt(50) & rel.le(100)
ab100 = rel.gt(100)
mean, median, std = (round(stat, 3) for stat in (rel.mean(), rel.median(), rel.std()))

# Trailing comments are the values recorded from an earlier run.
for band_label, band_mask in (
    ("less than - 150% : ", lessneg150),            # 631
    ("in between -150% and -100% : ", neg150_100),  # 350
    ("in between -100% and -50% : ", neg100_50),    # 541
    ("in between -50% and 0% : ", neg50_0),         # 964
    ("in between 0% and 50% : ", in0_50),           # 1333
    ("in between 50% and 100% : ", in50_100),       # 573
    ("above 100% : ", ab100),                       # 0
):
    print(band_label, band_mask.sum())
print("mean : ", mean) # -48.342
print("median : ", median) # -11.966
print("std : ", std) # 126.753

In [None]:
import pandas as pd

df = pd.read_csv('emiss_saudi_unet.csv')
# Use the Series method rather than the builtin `abs`, so this cell keeps
# working even if the name `abs` has been rebound in the running kernel.
df['abs_rel_error'] = df['relative_error'].abs()

rel = df['abs_rel_error']

# Bands over the magnitude of the relative error, in percent.
above150 = (rel > 150)
in150_100 = (rel > 100) & (rel <= 150)
in100_50 = (rel > 50) & (rel <= 100)
in50_20 = (rel > 20) & (rel <= 50)
# Lowest band is closed at 0 so that an exactly-zero error is still counted
# (with `rel > 0` such a sample fell into no band at all).
in20_0 = (rel >= 0) & (rel <= 20)

mean = round(rel.mean(), 3)
median = round(rel.median(), 3)
std = round(rel.std(), 3)

# Trailing comments are the values recorded from an earlier run.
print("above 150% : ", above150.sum()) # 631
print("in between 100% and 150% : ", in150_100.sum()) # 350
print("in between 50% and 100% : ", in100_50.sum()) # 1114
print("in between 20% and 50% : ", in50_20.sum()) # 1276
print("in between 0% and 20% : ", in20_0.sum()) # 1021
print("mean : ", mean) # 81.15
print("median : ", median) # 46.738
print("std : ", std) # 108.705

In [None]:
import pandas as pd

# Side-by-side summary of the true emissions and the averaged prediction in
# emiss_saudi_unet.csv. All statistics are rounded to 3 decimals; each range
# is computed from the already-rounded min and max.
df = pd.read_csv('emiss_saudi_unet.csv')

true = df['True Emissions']
pred = df['Average Emiss']

min_true, min_pred = round(true.min(), 3), round(pred.min(), 3)
max_true, max_pred = round(true.max(), 3), round(pred.max(), 3)
mean_true, mean_pred = round(true.mean(), 3), round(pred.mean(), 3)
std_true, std_pred = round(true.std(), 3), round(pred.std(), 3)
median_true, median_pred = round(true.median(), 3), round(pred.median(), 3)
range_true, range_pred = round(max_true - min_true, 3), round(max_pred - min_pred, 3)

# Trailing comments are the values recorded from an earlier run.
true_report = (
    ("min : ", min_true),        # 0.065
    ("max : ", max_true),        # 22.805
    ("mean : ", mean_true),      # 2.043
    ("std : ", std_true),        # 2.7
    ("median : ", median_true),  # 1.118
    ("range : ", range_true),    # 22.74
)
pred_report = (
    ("min : ", min_pred),        # 0.37
    ("max : ", max_pred),        # 13.024
    ("mean : ", mean_pred),      # 1.503
    ("std : ", std_pred),        # 0.889
    ("median : ", median_pred),  # 1.323
    ("range : ", range_pred),    # 12.654
)

print("True statistics : ")
print()
for caption, number in true_report:
    print(caption, number)
print()
print("Predicted statistics : ")
print()
for caption, number in pred_report:
    print(caption, number)



In [None]:
# importing necessary packages
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

df1 = pd.read_csv('emiss_comb_cnn.csv')    # CNN results, combined dataset
df2 = pd.read_csv('emiss_saudi_cnn.csv')   # CNN results, Saudi dataset
df3 = pd.read_csv('emiss_comb_unet.csv')   # U-Net results, combined dataset
df4 = pd.read_csv('emiss_saudi_unet.csv')  # U-Net results, Saudi dataset

# One (frame, colour, legend label) entry per result file, shared by both
# figures so that each curve carries the same label everywhere. Previously the
# two figures labelled df1 differently and the U-Net file df3 was labelled as
# CNN. The labels below are derived from the file names.
# NOTE(review): the old labels also used "curated"/"original" for the
# datasets - confirm which wording is intended.
curves = (
    (df1, 'b', 'Using CNN regression on combined dataset'),
    (df2, 'r', 'Using CNN regression on Saudi dataset'),
    (df3, 'g', 'Using U-Net regression on combined dataset'),
    (df4, 'yellow', 'Using U-Net regression on Saudi dataset'),
)


def _plot_kde_comparison(column, title, xlabel, xlim, out_path):
    """Overlay the filled KDE of `column` for every result set, then save and show it.

    Args:
        column: name of the error column to plot from each frame.
        title: figure title.
        xlabel: x-axis label.
        xlim: (low, high) limits for the x axis.
        out_path: file the figure is saved to (300 dpi, tight bounding box).
    """
    plt.figure(figsize = (10, 6))
    for frame, colour, legend_label in curves:
        sns.kdeplot(frame[column], fill = True, color = colour, label = legend_label)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel('Density')
    plt.legend()
    plt.xlim(*xlim)
    plt.savefig(out_path, dpi = 300, bbox_inches = 'tight')
    plt.show()


# NOTE(review): both titles say "for Lippendorf" although two of the four
# files are the Saudi results - confirm the titles are still correct.
_plot_kde_comparison('absolute_error', 'Kernel Density Estimate of Absolute Error for Lippendorf',
                     'Absolute Error', (0, 25), 'absolute_error_comb.png')

_plot_kde_comparison('relative_error', 'Kernel Density Estimate of Relative Error (%) for Lippendorf',
                     'Relative Error (%)', (-180, 180), 'relative_error_comb.png')

In [None]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Fit Y ~ W * X + b by gradient descent on 12 hand-entered value pairs, then
# compare |X - Y| with |Y - Y_hat|.

# ANNUAL data (Y)
annual_values = [1.93, 0.95, 1.72, 1.34, 0.81, 0.28, 1.01, 3.71, 9.89, 0.86, 1.34, 0.68]
Y = np.array(annual_values, dtype = np.float32).reshape(-1, 1)

# ANNUAL PRED UNET data (X)
unet_values = [1.42, 0.83, 4.60, 2.37, 1.73, 1.76, 2.15, 6.63, 6.90, 1.58, 1.67, 1.61]
X = np.array(unet_values, dtype = np.float32).reshape(-1, 1)

# Trainable slope and intercept, randomly initialised (no seed is set).
W = tf.Variable(tf.random.normal([1]), dtype = np.float32, name = 'weight')
b = tf.Variable(tf.random.normal([1]), dtype = np.float32, name = 'bias')

def linear_regression(X):
    """Affine map W * X + b using the module-level variables W and b."""
    return W * X + b

def mean_squared_error(y_true, y_pred):
    """Mean of the squared element-wise difference."""
    squared = tf.square(y_true - y_pred)
    return tf.reduce_mean(squared)

optimizer = tf.optimizers.Adam(learning_rate = 0.01)

def train_step(X_batch, y_batch):
    """Run one optimiser update of W and b on the given batch; return the loss before the update."""
    trainables = [W, b]
    with tf.GradientTape() as tape:
        loss = mean_squared_error(y_batch, linear_regression(X_batch))
    optimizer.apply_gradients(zip(tape.gradient(loss, trainables), trainables))
    return loss

epochs = 5000
for epoch in range(epochs):
    loss = train_step(X, Y)
    if epoch % 100:
        continue
    # Report every 100th epoch only.
    print(f"Epoch {epoch}, Loss: {loss.numpy()}")

Y_hat = linear_regression(X).numpy()

# Per-sample absolute differences and their means.
error_X_Y = np.abs(X - Y).ravel()
error_Y_Y_hat = np.abs(Y - Y_hat).ravel()
mean_error_X_Y = error_X_Y.mean()
mean_error_Y_Y_hat = error_Y_Y_hat.mean()

table_columns = {
    'X (Actual)': X.ravel(),
    'Y (Initial)': Y.ravel(),
    'Y_hat (Predicted)': Y_hat.ravel(),
    'Error X - Y': error_X_Y,
    'Error Y - Y_hat': error_Y_Y_hat
}
df = pd.DataFrame(table_columns)

print("\nTable of X, Y, and Y_hat:")
print(df)

print(f"\nMean Error between X and Y: {mean_error_X_Y:.2f}")
print(f"Mean Error between Y and Y_hat: {mean_error_Y_Y_hat:.2f}")



In [None]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Same 12 value pairs as the linear fit, this time mapped X -> Y with a small
# fully connected network that is trained and evaluated on the same points.

# ANNUAL data (Y)
annual_values = [1.93, 0.95, 1.72, 1.34, 0.81, 0.28, 1.01, 3.71, 9.89, 0.86, 1.34, 0.68]
Y = np.array(annual_values, dtype=np.float32).reshape(-1, 1)

# ANNUAL PRED UNET data (X)
unet_values = [1.42, 0.83, 4.60, 2.37, 1.73, 1.76, 2.15, 6.63, 6.90, 1.58, 1.67, 1.61]
X = np.array(unet_values, dtype=np.float32).reshape(-1, 1)

# Network: input -> Dense(64, relu) -> Dense(64, relu) -> Dense(1)
n_features = X.shape[1]
network_layers = [
    tf.keras.layers.Input(shape=(n_features,)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1),
]
model_nn = tf.keras.Sequential(network_layers)

# Adam with default settings, mean-squared-error loss.
model_nn.compile(optimizer='adam', loss='mean_squared_error')

# Silent training for 5000 epochs on all 12 points.
history = model_nn.fit(X, Y, epochs = 5000, verbose = 0)

# Predictions for the training inputs themselves.
Y_hat = model_nn.predict(X)

# Per-sample absolute differences and their means.
error_X_Y = np.abs(X - Y).ravel()
error_Y_Y_hat = np.abs(Y - Y_hat).ravel()
mean_error_X_Y = error_X_Y.mean()
mean_error_Y_Y_hat = error_Y_Y_hat.mean()

# Collect everything in one table.
table_columns = {
    'X (Actual)': X.ravel(),
    'Y (Initial)': Y.ravel(),
    'Y_hat (Predicted)': Y_hat.ravel(),
    'Absolute Error X - Y': error_X_Y,
    'Absolute Error Y - Y_hat': error_Y_Y_hat
}
df = pd.DataFrame(table_columns)

print("\nTable of X, Y, Y_hat with Absolute Errors:")
print(df)

print(f"\nMean Absolute Error between X and Y: {mean_error_X_Y:.2f}")
print(f"Mean Absolute Error between Y and Y_hat: {mean_error_Y_Y_hat:.2f}")




In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam


# Learn a mapping from one prediction run ('Pred Emiss 2') to the true
# emissions with a small dense network and store its output for every row.
df = pd.read_csv('emiss_saudi_unet.csv')

X = df[['Pred Emiss 2']].to_numpy()
Y = df[['True Emissions']].to_numpy()

# 60 % train; the remaining 40 % is split again 60/40 into validation and test.
split_seed = 42
X_train, X_temp, Y_train, Y_temp = train_test_split(X, Y, test_size = 0.4, random_state = split_seed)
X_val, X_test, Y_val, Y_test = train_test_split(X_temp, Y_temp, test_size = 0.4, random_state = split_seed)

n_features = X_train.shape[1]
dense_stack = [
    Dense(64, activation = 'relu', input_shape = (n_features,)),
    Dense(128, activation = 'relu'),
    Dense(64, activation = 'relu'),
    Dense(1),
]
model = Sequential(dense_stack)

optimizer = Adam(learning_rate = 0.001)
model.compile(optimizer = optimizer, loss = 'mean_squared_error')

model.fit(X_train, Y_train, epochs = 100, validation_data = (X_val, Y_val), verbose = 0)

# Predictions are made for the full table (training rows included), not only
# for the held-out split.
Y_pred = model.predict(X)

results_df = pd.DataFrame({'True Emissions': Y.ravel(), 'Predicted Emissions': Y_pred.ravel()})

results_df.to_csv('emiss_saudi_reg.csv', index = False)


In [None]:
import numpy as np
# Quick sanity check of the saved test arrays: report the shape of the targets.
test_inputs, test_outputs = np.load('test_inputs.npy'), np.load('test_outputs.npy')

shape = np.shape(test_outputs)
print(shape)

In [None]:
import numpy as np

def _stack_box_and_lip(box_file, lip_file):
    """Load two .npy arrays and concatenate them along axis 0, `box` rows first."""
    return np.concatenate((np.load(box_file), np.load(lip_file)), axis = 0)


# Merge the "box" and "lip" arrays of every split. All six merged arrays are
# built before anything is written, so a missing input file aborts the cell
# without leaving a partial set of outputs behind.
tr_in = _stack_box_and_lip('train_inputs_box.npy', 'train_inputs_lip.npy')
va_in = _stack_box_and_lip('valid_inputs_box.npy', 'valid_inputs_lip.npy')
te_in = _stack_box_and_lip('test_inputs_box.npy', 'test_inputs_lip.npy')

tr_out = _stack_box_and_lip('train_outputs_box.npy', 'train_outputs_lip.npy')
va_out = _stack_box_and_lip('valid_outputs_box.npy', 'valid_outputs_lip.npy')
te_out = _stack_box_and_lip('test_outputs_box.npy', 'test_outputs_lip.npy')

# Targets first, then inputs.
for out_file, merged in (('tr_out.npy', tr_out), ('va_out.npy', va_out), ('te_out.npy', te_out)):
    np.save(out_file, merged)

for out_file, merged in (('tr_in.npy', tr_in), ('va_in.npy', va_in), ('te_in.npy', te_in)):
    np.save(out_file, merged)


In [None]:
import numpy as np
import os
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, LeakyReLU, Dropout, BatchNormalization, MaxPool2D, Flatten, Dense
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau

def model_arch(input_shape):
    """Build the plain CNN regressor as a Keras `Sequential` model.

    Per the original note for this cell, the inputs are
    (number of samples, 64, 64, 4) with channels xco2, no2, u, v, and the
    target is the CO2 emission rate.

    The network is eight un-padded 3x3 ELU convolutions (32, 32, 32, 64, 64,
    64, 64, 64 filters) interleaved with dropout, batch normalisation and
    three 2x2 max-pools, followed by Flatten, Dense(1) and a LeakyReLU.

    Args:
        input_shape: shape of one input sample, without the batch axis.

    Returns:
        The uncompiled `Sequential` model.
    """
    def conv(filters, **extra):
        # 3x3 ELU convolution with stride 1 (Keras' default padding).
        return Conv2D(filters, (3, 3), activation = "elu", strides = 1, **extra)

    def pool():
        # 2x2 max-pool that halves height and width.
        return MaxPool2D(pool_size = (2, 2), padding = "valid", strides = 2)

    layer_stack = [
        conv(32, input_shape = input_shape),
        Dropout(0.1),
        conv(32),
        pool(),
        BatchNormalization(),
        conv(32),
        Dropout(0.2),
        conv(64),
        BatchNormalization(),
        conv(64),
        Dropout(0.2),
        conv(64),
        pool(),
        conv(64),
        Dropout(0.2),
        conv(64),
        pool(),
        Flatten(),
        Dense(1),
        # Leaky output activation: negative pre-activations are scaled by 0.3.
        LeakyReLU(alpha = 0.3),
    ]
    return Sequential(layer_stack)

def rmse(y_true, y_pred):
    """Root-mean-square error taken over every element of `y_true - y_pred`."""
    squared_error = tf.square(y_true - y_pred)
    return tf.sqrt(tf.reduce_mean(squared_error))
    
# --- Load the merged arrays ---------------------------------------------------
train_inputs = np.load('tr_in.npy')
train_outputs = np.load('tr_out.npy')
val_inputs = np.load('va_in.npy')
val_outputs = np.load('va_out.npy')
test_inputs = np.load('te_in.npy')
test_outputs = np.load('te_out.npy')

# --- Checkpointing: weights are written after every epoch ---------------------
checkpoint_dir = './saved_models/' 
os.makedirs(checkpoint_dir, exist_ok = True)

check = os.path.join(checkpoint_dir, 'cnn_reg_sim.h5')
check_call = ModelCheckpoint(filepath = check, save_weights_only = True, verbose = 1)

# Geometric augmentation only (rotation, shear, zoom, flips); no translation.
data_augmentation = ImageDataGenerator(rotation_range = 180, width_shift_range = 0.0, height_shift_range = 0.0, shear_range = 90, zoom_range = 0.2, horizontal_flip = True, vertical_flip = True, fill_mode = 'nearest')

# --- Model, optimiser and learning-rate schedule ------------------------------
model = model_arch(input_shape = (64, 64, 4))
# model.load_weights('saved_models/cnn_reg_sim.h5')
optimizer = Adam(learning_rate = 1e-3)
# Multiply the learning rate by 0.75 after 20 epochs without a val_loss
# improvement of at least 5e-3, never going below 5e-6.
lr_check = ReduceLROnPlateau(monitor = 'val_loss', factor = 0.75, patience = 20, verbose = 1, min_delta = 5e-3, cooldown = 0, min_lr = 5e-6)
model.compile(optimizer = optimizer, loss = tf.keras.losses.Huber())
model.summary()

# --- Training -----------------------------------------------------------------
history = model.fit(data_augmentation.flow(train_inputs, train_outputs, batch_size = 32, shuffle = True), epochs = 200, steps_per_epoch = len(train_outputs) // 32,
                    validation_data = (val_inputs, val_outputs), validation_batch_size = 32,
                    callbacks = [lr_check, check_call])

del train_inputs, train_outputs, val_inputs, val_outputs

test_loss = model.evaluate(test_inputs, test_outputs, batch_size = None)
print(f'Test Loss: {test_loss}')

# --- Test-set predictions -----------------------------------------------------
# One batched predict() call instead of one call per sample: the per-sample
# loop paid Keras' per-call overhead (and printed a progress bar) for every
# test sample while producing the same per-sample predictions.
predicted_emissions = model.predict(test_inputs).reshape(-1)
true_emissions = np.asarray(test_outputs).reshape(-1)

del test_inputs, test_outputs

# --- Persist the results --------------------------------------------------------
# If the results file already exists, this run is stored as an extra column
# next to the earlier runs. Otherwise the file is created from scratch;
# previously `df` was undefined in that case (or silently reused a DataFrame
# left over from another cell).
# NOTE(review): the column name 'Pred Emiss 2' appears to be edited by hand
# between runs (the averaging cell also reads 'Pred Emiss 1' and
# 'Pred Emiss 3') - confirm before re-running.
csv_file = 'emiss_sim_cnn.csv'
if os.path.exists(csv_file):
    df = pd.read_csv(csv_file)
    df['Pred Emiss 2'] = predicted_emissions
else:
    df = pd.DataFrame({'True Emissions': true_emissions, 'Predicted Emissions': predicted_emissions})
df.to_csv(csv_file, index = False)


In [None]:
import pandas as pd

# Average the four prediction runs stored in the CSV and derive per-sample
# error columns, then write the enriched table back to the same file.
df = pd.read_csv('emiss_sim_cnn.csv')

df['Average Emiss'] = ((df['Predicted Emissions'] + df['Pred Emiss 2'] + df['Pred Emiss 1']  + df['Pred Emiss 3'])/4).round(3) 

# Signed error: positive when the averaged prediction is below the true value.
df['error'] = df['True Emissions'] - df['Average Emiss'] 

# Series.abs() rather than the builtin `abs`, so this cell keeps working even
# if the name `abs` has been rebound in the running kernel.
df['absolute_error'] = df['error'].abs().round(3)

# Error as a percentage of the true value; the 1e-15 term keeps an exactly
# zero true value from producing a division by zero.
df['relative_error'] = ((df['error'] / (df['True Emissions'] + 1e-15)) * 100).round(3)

df.to_csv('emiss_sim_cnn.csv', index = False)

In [None]:
import pandas as pd

df = pd.read_csv('emiss_sim_cnn.csv')

# Quartiles of the absolute error, picked out of describe() by row label.
percentiles = df['absolute_error'].describe(percentiles = [0.25, 0.5, 0.75])
# Series.abs() rather than `.apply(abs)`: the latter looks up the global name
# `abs`, which fails if that name has been rebound in the running kernel.
percentiles = percentiles.loc[['25%', '50%', '75%']].abs() # 25% : 1.201, 50% : 2.674, 75% : 4.700 (All are in Mt/yr)
print(percentiles)



In [None]:
import pandas as pd

df = pd.read_csv('emiss_sim_cnn.csv')
# Series.abs() is used instead of the builtin `abs` (here and below), so this
# cell keeps working even if the name `abs` has been rebound in the kernel.
df['abs_rel_error'] = df['relative_error'].abs()

# Quartiles of the absolute relative error, picked out of describe() by row label.
percentiles = df['abs_rel_error'].describe(percentiles = [0.25, 0.5, 0.75])
percentiles = percentiles.loc[['25%', '50%', '75%']].abs() # 25% : 7.367, 50% : 16.011, 75% : 27.763 (Unitless)
print(percentiles)


In [None]:
import pandas as pd

df = pd.read_csv('emiss_sim_cnn.csv')

# Deliberately not named `abs`: rebinding the builtin to a Series would make
# every later `abs(...)` call in the same kernel fail with
# "'Series' object is not callable".
abs_err = df['absolute_error']

# Count samples per absolute-error band; bands are (lower, upper] except the
# first, which also includes 0.
in0_2 = (abs_err >= 0) & (abs_err <= 2)
in2_5 = (abs_err > 2) & (abs_err <= 5)
in5_10 = (abs_err > 5) & (abs_err <= 10)
ab10 = abs_err > 10
mean = round(abs_err.mean(), 3)
median = round(abs_err.median(), 3)
std = round(abs_err.std(), 3)

# Trailing comments are the values recorded from an earlier run.
print("in between 0 and 2 : ", in0_2.sum()) # 897
print("in between 2 and 5 : ", in2_5.sum()) # 904
print("in between 5 and 10 : ", in5_10.sum()) # 463
print("above 10 : ", ab10.sum()) # 40
print("mean : ", mean) # 3.218
print("median : ", median) # 2.674
print("std : ", std) # 2.489

In [None]:
import pandas as pd

df = pd.read_csv('emiss_sim_cnn.csv')
# Use the Series method rather than the builtin `abs`, so this cell keeps
# working even if the name `abs` has been rebound in the running kernel.
df['abs_rel_error'] = df['relative_error'].abs()

rel = df['abs_rel_error']

# Bands over the magnitude of the relative error, in percent.
above150 = (rel > 150)
in150_100 = (rel > 100) & (rel <= 150)
in100_50 = (rel > 50) & (rel <= 100)
in50_20 = (rel > 20) & (rel <= 50)
# Lowest band is closed at 0 so that an exactly-zero error is still counted
# (with `rel > 0` such a sample fell into no band at all).
in20_0 = (rel >= 0) & (rel <= 20)

mean = round(rel.mean(), 3)
median = round(rel.median(), 3)
std = round(rel.std(), 3)

# Trailing comments are the values recorded from an earlier run.
print("above 150% : ", above150.sum()) # 0
print("in between 100% and 150% : ", in150_100.sum()) # 4
print("in between 50% and 100% : ", in100_50.sum()) # 128
print("in between 20% and 50% : ", in50_20.sum()) # 791
print("in between 0% and 20% : ", in20_0.sum()) # 1381
print("mean : ", mean) # 19.966
print("median : ", median) # 16.011
print("std : ", std) # 17.098

In [None]:
import pandas as pd

# Summary statistics of the true emissions vs. the ensemble-averaged
# prediction stored in emiss_sim_cnn.csv. Every statistic is rounded to
# 3 decimals; the range is derived from the already-rounded min and max.
df = pd.read_csv('emiss_sim_cnn.csv')


def _six_stats(column):
    """Return (min, max, mean, std, median, range) of `column`, each rounded to 3 decimals."""
    smallest = round(column.min(), 3)
    largest = round(column.max(), 3)
    average = round(column.mean(), 3)
    deviation = round(column.std(), 3)
    midpoint = round(column.median(), 3)
    return smallest, largest, average, deviation, midpoint, round(largest - smallest, 3)


true = df['True Emissions']
min_true, max_true, mean_true, std_true, median_true, range_true = _six_stats(true)

pred = df['Average Emiss']
min_pred, max_pred, mean_pred, std_pred, median_pred, range_pred = _six_stats(pred)

# Values recorded from an earlier run, in print order
# (min, max, mean, std, median, range):
#   true      : 7.493, 30.114, 17.09, 4.517, 16.725, 22.621
#   predicted : 14.071, 20.571, 16.614, 1.735, 16.412, 6.5
row_labels = ("min : ", "max : ", "mean : ", "std : ", "median : ", "range : ")

print("True statistics : ")
print()
for row_label, row_value in zip(row_labels, (min_true, max_true, mean_true, std_true, median_true, range_true)):
    print(row_label, row_value)
print()
print("Predicted statistics : ")
print()
for row_label, row_value in zip(row_labels, (min_pred, max_pred, mean_pred, std_pred, median_pred, range_pred)):
    print(row_label, row_value)

In [None]:
import numpy as np
import os
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Conv2D, LeakyReLU, Dropout, BatchNormalization, MaxPool2D, Flatten, Dense, Conv2DTranspose, UpSampling2D, Concatenate
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau

def unetreg(input_shape, dropout_rate = 0.2):
    """Build a U-Net-style encoder/decoder that regresses a single scalar.

    Three down-sampling stages (64, 128, 256 filters) feed a 512-filter
    bottleneck. Three up-sampling stages mirror them; each is concatenated
    with the encoder features of the same resolution taken *before* dropout.
    The final one-channel map is flattened and fed to a linear Dense(1).

    Args:
        input_shape: shape of one sample without the batch axis. The shape
            comments below assume a 64x64 spatial size (the value used by the
            caller in this cell). Height and width need to be divisible by 8
            so the up-sampled tensors line up with the skip connections.
        dropout_rate: rate shared by every Dropout layer.

    Returns:
        An uncompiled Keras `Model` with one output unit per sample.
    """
    def conv_bn(tensor, filters, initializer = 'he_uniform'):
        # 3x3 'same' ReLU convolution followed by batch normalisation.
        tensor = Conv2D(filters, 3, activation = 'relu', padding = 'same', kernel_initializer = initializer)(tensor)
        return BatchNormalization()(tensor)

    def double_conv(tensor, filters):
        # Two conv+BN layers, then dropout. Both tensors are returned because
        # the skip connections consume the features before dropout.
        features = conv_bn(conv_bn(tensor, filters), filters)
        return features, Dropout(dropout_rate)(features)

    def upsample_and_merge(tensor, skip, filters):
        # Transposed conv (changes channels only), 2x up-sampling, then
        # channel-wise concatenation with the matching encoder features.
        tensor = Conv2DTranspose(filters, 3, activation = 'relu', padding = 'same', kernel_initializer = 'he_uniform')(tensor)
        tensor = UpSampling2D((2, 2))(tensor)
        return Concatenate()([tensor, skip])

    inputs = tf.keras.Input(input_shape)

    # Encoder (down-sampling path)
    c1, d1 = double_conv(inputs, 64)              # Shape : (64, 64, 64)
    p1 = MaxPool2D(pool_size = (2, 2))(d1)        # Shape : (32, 32, 64)
    c2, d2 = double_conv(p1, 128)                 # Shape : (32, 32, 128)
    p2 = MaxPool2D(pool_size = (2, 2))(d2)        # Shape : (16, 16, 128)
    c3, d3 = double_conv(p2, 256)                 # Shape : (16, 16, 256)
    p3 = MaxPool2D(pool_size = (2, 2))(d3)        # Shape : (8, 8, 256)

    # Bottleneck
    _, d4 = double_conv(p3, 512)                  # Shape : (8, 8, 512)

    # Decoder (up-sampling path); concatenation doubles the channel count.
    concat5 = upsample_and_merge(d4, c3, 256)     # Shape : (16, 16, 512)
    _, d5 = double_conv(concat5, 256)             # Shape : (16, 16, 256)
    concat6 = upsample_and_merge(d5, c2, 128)     # Shape : (32, 32, 256)
    _, d6 = double_conv(concat6, 128)             # Shape : (32, 32, 128)
    concat7 = upsample_and_merge(d6, c1, 64)      # Shape : (64, 64, 128)
    c7 = conv_bn(concat7, 64)                     # Shape : (64, 64, 64)
    c7 = conv_bn(c7, 1, initializer = 'glorot_uniform')  # Shape : (64, 64, 1)

    # Output layer
    flat = Flatten()(c7)                          # Shape : 64*64*1
    output = Dense(1, activation = 'linear')(flat)  # Shape : 1

    return Model(inputs = inputs, outputs = output)
    
def rmse(y_true, y_pred):
    """Root-mean-square error between `y_true` and `y_pred`.

    The element-wise difference is squared, averaged over every element
    (`tf.reduce_mean` is called without an axis), and the square root of
    that single mean is returned.
    """
    squared_diff = tf.square(y_true - y_pred)
    return tf.sqrt(tf.reduce_mean(squared_diff))
    
# Load the train / validation / test input and target arrays from .npy files
# in the current working directory.
train_inputs = np.load('tr_in.npy')
train_outputs = np.load('tr_out.npy')
val_inputs = np.load('va_in.npy')
val_outputs = np.load('va_out.npy')
test_inputs = np.load('te_in.npy')
test_outputs = np.load('te_out.npy')

# Directory that receives the model weights written by the checkpoint callback.
checkpoint_dir = './saved_models/' 
os.makedirs(checkpoint_dir, exist_ok = True)

# Weights-only checkpoint written to a single fixed file name.
# NOTE(review): `save_best_only` is not passed, so with the Keras defaults the
# file is presumably rewritten after every epoch (it ends up holding the last
# epoch, not the best `val_loss`) — confirm this is intended.
check = os.path.join(checkpoint_dir, 'unet_reg_sim.h5')
check_call = ModelCheckpoint(filepath = check, save_weights_only = True, verbose = 1)

# On-the-fly augmentation: random rotation (up to 180 degrees), shear, zoom (0.2)
# and horizontal/vertical flips. Width/height shifts are disabled (0.0) and
# pixels exposed by a transform are filled with the nearest value.
data_augmentation = ImageDataGenerator(rotation_range = 180, width_shift_range = 0.0, height_shift_range = 0.0, shear_range = 90, zoom_range = 0.2, horizontal_flip = True, vertical_flip = True, fill_mode = 'nearest')

# Build the U-Net regressor for 64x64 inputs with 4 channels.
model = unetreg(input_shape = (64, 64, 4))
# Uncomment to resume from previously saved weights instead of training from scratch.
# model.load_weights('saved_models/unet_reg_sim.h5')
optimizer = Adam(learning_rate = 1e-3)
# Multiply the learning rate by 0.75 when `val_loss` has not improved by at
# least `min_delta` for 30 epochs, never going below 5e-6.
lr_check = ReduceLROnPlateau(monitor = 'val_loss', factor = 0.75, patience = 30, verbose = 1, min_delta = 5e-3, cooldown = 0, min_lr = 5e-6)
# Huber loss only; no `metrics` are passed, so the `rmse` helper defined above
# is not tracked during training.
model.compile(optimizer = optimizer, loss = tf.keras.losses.Huber())
# tf.keras.losses.Huber()
model.summary() # Around 5 million params

# Train on augmented batches of 32 for 150 epochs; `steps_per_epoch` is the
# number of full batches of 32 in the training set. Validation uses the
# un-augmented validation arrays.
history = model.fit(data_augmentation.flow(train_inputs, train_outputs, batch_size = 32, shuffle = True), epochs = 150, steps_per_epoch = len(train_inputs) // 32,
                    validation_data = (val_inputs, val_outputs), validation_batch_size = 32,
                    callbacks = [lr_check, check_call])

# Evaluate on the held-out test arrays; `batch_size = None` leaves the batch
# size to Keras' default. With no metrics compiled, the value reported is the
# Huber loss.
test_loss = model.evaluate(test_inputs, test_outputs, batch_size = None)
print(f'Test Loss: {test_loss}')

# The training/validation arrays are no longer needed; drop the references so
# their memory can be reclaimed before inference.
del train_inputs, train_outputs, val_inputs, val_outputs

# Predict the whole test set with ONE batched call. The previous version called
# model.predict() once per sample, which pays Keras' per-call overhead (and
# prints a progress bar) for every test example while producing the same values.
predicted_emissions = np.asarray(model.predict(test_inputs))

# Align the targets with the prediction array so both flatten to one value per
# test sample, in the same order.
true_emissions = np.asarray(test_outputs).reshape(predicted_emissions.shape)

del test_inputs, test_outputs

# One row per test sample: ground truth next to this run's prediction.
# NOTE(review): this overwrites 'emiss_sim_unet.csv'. The averaging cell further
# down reads additional 'Pred Emiss N' columns; to record another training run
# as an extra column instead of overwriting, use the commented lines below.
#csv_file = 'emiss_sim_unet.csv'
#if os.path.exists(csv_file):
#    df = pd.read_csv(csv_file)
#df['Pred Emiss 2'] = predicted_emissions.flatten()
df = pd.DataFrame({'True Emissions': true_emissions.flatten(), 'Predicted Emissions': predicted_emissions.flatten()})
df.to_csv('emiss_sim_unet.csv', index = False)

In [None]:
import pandas as pd

df = pd.read_csv('emiss_sim_unet.csv')

# Prediction columns that may be present, listed in the order the original
# expression summed them. Only 'Predicted Emissions' is written by the training
# cell; the 'Pred Emiss N' columns exist only if extra runs were appended by
# hand, so average whichever ones are actually in the file instead of failing
# with a KeyError on a fresh run.
prediction_columns = ['Predicted Emissions', 'Pred Emiss 2', 'Pred Emiss 3', 'Pred Emiss 1']
available_columns = [col for col in prediction_columns if col in df.columns]
if not available_columns:
    raise KeyError(f"none of the prediction columns {prediction_columns} found in 'emiss_sim_unet.csv'")
print(f"Averaging {len(available_columns)} prediction column(s): {available_columns}")

# Left-to-right Series addition: a NaN in any run still yields NaN for that
# row, exactly as the original chained `+` expression did.
prediction_total = df[available_columns[0]]
for col in available_columns[1:]:
    prediction_total = prediction_total + df[col]
df['Average Emiss'] = (prediction_total / len(available_columns)).round(3)

# Signed error: positive when the averaged prediction is below the true value.
df['error'] = df['True Emissions'] - df['Average Emiss']

# Series.abs() instead of the builtin abs(): a later cell rebinds the name
# `abs`, which would break this cell when it is re-run in the same kernel.
df['absolute_error'] = df['error'].abs().round(3)

# Relative error in percent; the 1e-15 term avoids dividing by exactly zero.
df['relative_error'] = ((df['error'] / (df['True Emissions'] + 1e-15)) * 100).round(3)

df.to_csv('emiss_sim_unet.csv', index = False)

In [None]:
import pandas as pd
df = pd.read_csv('emiss_sim_unet.csv')

# Quartiles of the absolute error, taken from describe() and reduced to the
# three quartile rows.
# Recorded results — 25% : 1.129, 50% : 2.351, 75% : 3.997 (All are in Mt/yr)
quartile_labels = ['25%', '50%', '75%']
abs_error_summary = df['absolute_error'].describe(percentiles = [0.25, 0.5, 0.75])
percentiles = abs_error_summary.loc[quartile_labels].abs()
print(percentiles)


In [None]:
import pandas as pd

df = pd.read_csv('emiss_sim_unet.csv')
# Magnitude of the percentage error (kept in memory only; the CSV is not rewritten).
df['abs_rel_error'] = df['relative_error'].abs()

# Quartiles of the absolute relative error, taken from describe().
# Recorded results — 25% : 7.084, 50% : 14.141, 75% : 23.980 (Unitless)
quartile_labels = ['25%', '50%', '75%']
rel_error_summary = df['abs_rel_error'].describe(percentiles = [0.25, 0.5, 0.75])
percentiles = rel_error_summary.loc[quartile_labels].abs()
print(percentiles)



In [None]:
import pandas as pd

df = pd.read_csv('emiss_sim_unet.csv')

# Deliberately NOT named `abs`: binding a Series to `abs` shadows the builtin
# for the rest of the kernel session, so every later `abs(...)` call fails with
# "TypeError: 'Series' object is not callable".
abs_err = df['absolute_error']

# Boolean masks for the absolute-error bins; boundaries are closed on the
# right, and the first bin also includes 0.
in0_2 = (abs_err >= 0) & (abs_err <= 2)
in2_5 = (abs_err > 2) & (abs_err <= 5)
in5_10 = (abs_err > 5) & (abs_err <= 10)
ab10 = abs_err > 10
mean = round(abs_err.mean(), 3)
median = round(abs_err.median(), 3)
std = round(abs_err.std(), 3)

# Trailing comments are the values recorded from a previous run.
print("in between 0 and 2 : ", in0_2.sum()) # 1016
print("in between 2 and 5 : ", in2_5.sum()) # 914
print("in between 5 and 10 : ", in5_10.sum()) # 340
print("above 10 : ", ab10.sum()) # 34
print("mean : ", mean) # 2.896
print("median : ", median) # 2.352
print("std : ", std) # 2.361

In [None]:
import pandas as pd

df = pd.read_csv('emiss_sim_unet.csv')
# Series.abs() rather than the builtin abs(): this keeps the cell working even
# when the name `abs` has been rebound to something else in the running kernel.
df['abs_rel_error'] = df['relative_error'].abs()

rel = df['abs_rel_error']

# Boolean masks for the absolute relative-error bins (percent). Boundaries are
# closed on the right; the lowest bin also includes exactly 0 so that a perfect
# prediction is counted and every non-NaN row lands in exactly one bin.
above150 = (rel > 150)
in150_100 = (rel > 100) & (rel <= 150)
in100_50 = (rel > 50) & (rel <= 100)
in50_20 = (rel > 20) & (rel <= 50)
in20_0 = (rel >= 0) & (rel <= 20)

mean = round(rel.mean(), 3)
median = round(rel.median(), 3)
std = round(rel.std(), 3)

# Trailing comments are the values recorded from a previous run.
print("above 150% : ", above150.sum()) # 0
print("in between 100% and 150% : ", in150_100.sum()) # 3
print("in between 50% and 100% : ", in100_50.sum()) # 40
print("in between 20% and 50% : ", in50_20.sum()) # 740
print("in between 0% and 20% : ", in20_0.sum()) # 1521
print("mean : ", mean) # 16.907
print("median : ", median) # 14.141
print("std : ", std) # 13.277

In [None]:
import pandas as pd

df = pd.read_csv('emiss_sim_unet.csv')


def _summary_stats(series):
    """Return (min, max, mean, std, median, range) of `series`, each rounded to 3 decimals.

    The range is computed from the already-rounded max and min and then
    rounded again.
    """
    lowest = round(series.min(), 3)
    highest = round(series.max(), 3)
    return (
        lowest,
        highest,
        round(series.mean(), 3),
        round(series.std(), 3),
        round(series.median(), 3),
        round(highest - lowest, 3),
    )


# Ground-truth values.
true = df['True Emissions']
min_true, max_true, mean_true, std_true, median_true, range_true = _summary_stats(true)

# Averaged predictions.
pred = df['Average Emiss']
min_pred, max_pred, mean_pred, std_pred, median_pred, range_pred = _summary_stats(pred)

# Trailing comments are the values recorded from a previous run.
print("True statistics : ")
print()
print("min : ", min_true) # 7.493
print("max : ", max_true) # 30.114
print("mean : ", mean_true) # 17.09
print("std : ", std_true) # 4.517
print("median : ", median_true) # 16.725
print("range : ", range_true) # 22.621
print()
print("Predicted statistics : ")
print()
print("min : ", min_pred) # 9.528
print("max : ", max_pred) # 24.886
print("mean : ", mean_pred) # 15.819
print("std : ", std_pred) # 2.28
print("median : ", median_pred) # 15.564
print("range : ", range_pred) # 15.358