In [None]:
import os

import pandas as pd
import netCDF4 as nc
import numpy as np

In [None]:
# Directory holding the daily input files; each one is named <YYYYMMDD>.nc4.
DATA_FILE_DIR = './data/'

# Year range to average over: START_YEAR is included, END_YEAR is excluded.
START_YEAR = 2010
END_YEAR = 2020
NUM_OF_YEARS = END_YEAR - START_YEAR

# Days in each month (1-based month number -> day count).
# February is fixed at 28 days, so Feb 29 is never looked up.
NUM_OF_DAYS = dict(
    enumerate((31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31), start=1)
)
NUM_OF_MONTHS = 12

In [None]:
# Open one reference file to read the grid axes and the valid-cell mask.
# The path is built from DATA_FILE_DIR so it follows the configured data
# directory instead of repeating it. The handle is closed further down,
# once the grid has been read.
file = nc.Dataset(os.path.join(DATA_FILE_DIR, '20110101.nc4'))

In [None]:
# First record of the temperature variable in the reference file; its mask
# marks the grid cells that carry no data.
tmp = file.variables['AvgSurfT_tavg'][0]

# Coordinate axes, converted from masked arrays to plain ndarrays.
lon = file.variables['lon'][:].filled()
lat = file.variables['lat'][:].filled()

# Everything needed is now in memory, so the file can be released.
file.close()

# Number of unmasked cells in the reference field.
num_of_loc = tmp.count()

In [None]:
# Grid dimensions: number of entries on the latitude and longitude axes.
LAT, LON = lat.size, lon.size

In [None]:
# Coordinates of every unmasked grid cell, one (lat, lon) row per cell.
# getmaskarray always returns a full boolean array, so this also works when
# the field has no masked entries and `tmp.mask` is the scalar `nomask`.
valid_cells = ~np.ma.getmaskarray(tmp)

# np.nonzero walks the array in row-major order, i.e. the same order as a
# nested "for each lat index, for each lon index" loop, without the
# per-cell Python overhead.
lat_idx, lon_idx = np.nonzero(valid_cells)
locations = np.column_stack((lat[lat_idx], lon[lon_idx])).astype(np.float64)

# Number of coordinate rows produced; retained in case later cells read it.
count = locations.shape[0]

yearly_mean = pd.DataFrame(locations, columns=['lat', 'lon'], dtype='float32')

In [None]:
def get_tmp(filepath):
    """Read the first ``AvgSurfT_tavg`` record from a NetCDF file.

    Parameters
    ----------
    filepath : str
        Path of the file to read.

    Returns
    -------
    numpy.ndarray
        ``AvgSurfT_tavg[0]`` with every masked entry replaced by ``np.nan``.

    Raises
    ------
    FileNotFoundError
        If ``filepath`` is not an existing regular file. An explicit raise is
        used instead of ``assert`` so the check survives ``python -O``.
    """
    if not os.path.isfile(filepath):
        raise FileNotFoundError('{} does not exist!'.format(filepath))

    # The context manager closes the dataset even if the variable lookup fails.
    with nc.Dataset(filepath) as dataset:
        temperature = dataset.variables['AvgSurfT_tavg'][0]
    return temperature.filled(np.nan)

In [None]:
%%time
date_temp = pd.DataFrame()
for month in range(1, NUM_OF_MONTHS+1):
    for day in range(1, NUM_OF_DAYS[month]+1):
        date = '{:02d}{:02d}'.format(month, day)
        print(date)
        temps = np.ndarray(shape=(NUM_OF_YEARS, LAT, LON))
        for year in range(START_YEAR, END_YEAR):
            day = str(year) + date
            filepath = DATA_FILE_DIR + day + '.nc4'
            temps[year-START_YEAR] = get_tmp(filepath)
            
        mean_temps = np.nanmean(temps, axis=0)
        mean_temps = mean_temps.reshape(-1)
        date_temp[date] = mean_temps[~np.isnan(mean_temps)]

date_temp -= 273.15
date_temp = date_temp.round(decimals=1)

yearly_mean = pd.concat([yearly_mean, date_temp], axis=1)

In [None]:
# Write the combined table (lat, lon, then one column per calendar day) to
# CSV, creating the output directory first if it does not exist.
output_dir = './processed_mean/'
os.makedirs(output_dir, exist_ok=True)
yearly_mean.to_csv(output_dir + 'processed_yearly_avg.csv', index=False)