In [315]:
import numpy as np
import pandas as pd
import os

# Missing-value sentinels, as they appear in the raw text, for every
# floating-point measurement column.
_GSOD_FLOAT_SENTINELS = {
    'TEMP': ('9999.9',),
    'DEWP': ('9999.9',),
    'SLP': ('9999.9',),
    'STP': ('9999.9',),
    'VISIB': ('9999.9', '999.9'),
    'WDSP': ('999.9',),
    'MXSPD': ('999.9',),
    'GUST': ('999.9',),
    'MAX': ('9999.9',),
    'MIN': ('9999.9',),
    'SNDP': ('999.9',),
}
# Columns whose value may carry a trailing '*' marker glued to the number.
_GSOD_STAR_FLAGGED = ('MAX', 'MIN')
# Observation-count columns that accompany the measurements.
_GSOD_COUNT_COLUMNS = ('nTEMP', 'nDEWP', 'nSLP', 'nSTP', 'nVISIB', 'nWDSP')
# PRCP sentinels: '99.99' is the format's missing marker; '999.9' is kept
# because earlier versions of this function treated it as missing too.
_GSOD_PRCP_SENTINELS = ('99.99', '999.9')
# Stored in the uint8 count/flag columns when no value is available.
_GSOD_MISSING_CODE = 255


def _gsod_float_column(column, sentinels, strip_star=False):
    """Convert a column of raw text tokens to float32, mapping sentinels to NaN."""
    tokens = column
    if strip_star:
        # Strip the marker before the sentinel test so '9999.9*' is also missing.
        tokens = tokens.map(lambda tok: tok.rstrip('*') if isinstance(tok, str) else tok)
    tokens = tokens.where(~tokens.isin(list(sentinels)))
    return pd.to_numeric(tokens).astype(np.float32)


def _gsod_split_prcp(token):
    """Split one raw PRCP token into ``(amount, flag_code)``.

    The flag is the trailing letter, encoded as ``ord(letter) - 65`` (A -> 0).
    Tokens that are absent, or that carry no letter, get flag code 255; absent
    or sentinel amounts become NaN.
    """
    if not isinstance(token, str):
        # Day added by the calendar reindex: nothing was reported at all.
        return np.nan, _GSOD_MISSING_CODE
    code = _GSOD_MISSING_CODE
    if token[-1:].isalpha():
        code = ord(token[-1]) - 65
        token = token[:-1]
    amount = np.nan if token in _GSOD_PRCP_SENTINELS else float(token)
    return amount, code


def get_gsod_dataframe(filename, data_dir="/Users/pablo/Downloads/gsod_2015/", year=2015):
    """Load one gzipped GSOD station file into a typed, calendar-complete frame.

    Parameters
    ----------
    filename : str
        Name of the gzip-compressed, whitespace-delimited station file.
    data_dir : str, optional
        Directory containing ``filename``. Defaults to the location this
        notebook has always read from.
    year : int, optional
        Calendar year to reindex to (1 January through 31 December).

    Returns
    -------
    pandas.DataFrame
        One row per day of ``year`` (unnamed DatetimeIndex). Measurement
        columns are float32 with NaN for missing values, the ``n*`` count
        columns and ``tPRCP`` are uint8 with 255 for missing values, and
        ``FRSHTT`` is left as the raw string. ``tPRCP`` is appended as the
        last column.

    Raises
    ------
    ValueError
        If a token cannot be parsed as a number, or if the file contains the
        same date twice (the reindex cannot align duplicate labels).
    """
    col_names = ['STN---', 'WBAN', 'YEARMODA', 'TEMP', 'nTEMP', 'DEWP', 'nDEWP',
                 'SLP', 'nSLP', 'STP', 'nSTP', 'VISIB', 'nVISIB', 'WDSP', 'nWDSP',
                 'MXSPD', 'GUST', 'MAX', 'MIN', 'PRCP', 'SNDP', 'FRSHTT']
    # Everything is read as text so the sentinels and the flags glued to
    # MAX/MIN/PRCP can be handled before numeric conversion.
    raw = pd.read_csv(os.path.join(data_dir, filename), compression='gzip',
                      header=None, names=col_names, skiprows=1, dtype=object,
                      sep=r'\s+')

    df = raw.drop(['STN---', 'WBAN', 'YEARMODA'], axis=1)
    # Explicit format: YEARMODA is YYYYMMDD, no parser inference involved.
    df.index = pd.DatetimeIndex(pd.to_datetime(raw['YEARMODA'], format='%Y%m%d'))
    # ISO dates avoid the day-first/month-first ambiguity of '31-12-2015'.
    df = df.reindex(pd.date_range('{}-01-01'.format(year), '{}-12-31'.format(year)))

    for name, sentinels in _GSOD_FLOAT_SENTINELS.items():
        df[name] = _gsod_float_column(df[name], sentinels,
                                      strip_star=name in _GSOD_STAR_FLAGGED)

    for name in _GSOD_COUNT_COLUMNS:
        df[name] = pd.to_numeric(df[name]).fillna(_GSOD_MISSING_CODE).astype(np.uint8)

    # PRCP carries its flag letter in the same token as the amount.
    amounts = []
    codes = []
    for token in df['PRCP']:
        amount, code = _gsod_split_prcp(token)
        amounts.append(amount)
        codes.append(code)
    df['tPRCP'] = np.array(codes, dtype=np.uint8)
    df['PRCP'] = np.array(amounts, dtype=np.float32)

    return df

In [None]:
import sys
import pickle

# Parse every station archive in the download directory.
#   d        : station id -> DataFrame returned by get_gsod_dataframe
#   stations : station ids in the order they were loaded
# The id is the integer value of the first six characters of the file name.
# If two files share those six characters, the later one replaces the earlier
# entry in `d` while `stations` records the id twice.
d = {}
stations = []

# sorted(): os.listdir returns entries in arbitrary order, which would make
# the station order differ between runs.
for filename in sorted(os.listdir("/Users/pablo/Downloads/gsod_2015/")):
    # Only gzip archives are loaded; get_gsod_dataframe opens its input with
    # compression='gzip', so any other file type would fail there.
    if not filename.endswith(".gz"):
        continue

    station_id = int(filename[:6])
    stations.append(station_id)
    d[station_id] = get_gsod_dataframe(filename)

#mydict_as_string = pickle.dumps(d)
#sys.getsizeof(mydict_as_string)/1024

  


In [313]:
import netCDF4

def _write_station_variable(dest, frames, name, nc_type, column, fill_value, long_name, units=None):
    """Create a (time, station) variable in ``dest`` and fill it from ``frames``.

    ``frames`` is a sequence of DataFrames, one per station and all the same
    length; ``column`` is read from each and becomes one column of the
    variable, in sequence order.
    """
    var = dest.createVariable(name, nc_type, ("time", "station"), fill_value=fill_value)
    var.long_name = long_name
    if units is not None:
        var.units = units
    var[:] = np.column_stack([frame[column].values for frame in frames])
    return var


# Snapshot ids and frames together so both axes use the same station order.
station_ids = list(d.keys())
station_frames = [d[station_id] for station_id in station_ids]
if not station_frames:
    raise ValueError("no station data loaded; nothing to write")

# The time axis comes from the first loaded frame. np.column_stack above
# rejects frames whose length differs from it.
time_index = station_frames[0].index
time_units = "seconds since 1970-01-01 00:00:00.0"
# Seconds since the epoch computed directly from the naive timestamps, so the
# result does not depend on the machine's local timezone.
epoch_seconds = (time_index - pd.Timestamp("1970-01-01")).total_seconds()

with netCDF4.Dataset("/Users/pablo/Downloads/B010010.nc", 'w', format='NETCDF4') as dest:
    dest.createDimension("time", len(time_index))
    dest.createDimension("station", len(station_ids))

    time_var = dest.createVariable("time", "f8", ("time",))
    time_var.units = time_units
    time_var.calendar = "standard"
    time_var.long_name = "Time, unix time-stamp"
    time_var.standard_name = "time"
    time_var[:] = np.asarray(epoch_seconds, dtype="f8")

    # 32-bit ids: six-digit station numbers do not fit the 16-bit "i2" type.
    station_var = dest.createVariable("station", "i4", ("station",))
    station_var.long_name = "WMO Station ID"
    station_var.standard_name = "station"
    station_var[:] = np.array(station_ids, dtype="i4")

    # NOTE(review): units follow the GSOD format description (inches, degrees
    # Fahrenheit); the values are written exactly as parsed, unconverted.
    _write_station_variable(dest, station_frames, "precip", "f4", "PRCP",
                            fill_value=np.nan, long_name="24h precipitation",
                            units="inches")
    _write_station_variable(dest, station_frames, "t_precip", "i", "tPRCP",
                            fill_value=255,
                            long_name="24h precipitation accumulation mode")
    _write_station_variable(dest, station_frames, "mean_temp", "f4", "TEMP",
                            fill_value=np.nan, long_name="24h mean temperature",
                            units="degF")
    
    

In [None]:
import xarray as xr

# Export one station's table to its own NetCDF file through xarray.
# NOTE(review): station 10010 is inferred from the output file name
# (010010.nc) — confirm. Reading it from `d` replaces the reliance on
# whatever `df` happened to be bound to by an earlier cell.
station_id = 10010
# from_dataframe names the dimension after the frame's index, so the index is
# named "time" here to make `ds.time` exist.
station_df = d[station_id].rename_axis("time")

ds = xr.Dataset.from_dataframe(station_df)
ds.time.encoding['units'] = 'days since 1970-01-01'
# standard_name is a variable attribute, not an encoding option.
ds.time.attrs['standard_name'] = "time"
print(ds.time.encoding)
ds.to_netcdf(path="/Users/pablo/Downloads/010010.nc", mode='w')#, unlimited_dims=["time"])