In [1]:
## import packages

import glob
import os
import shutil
import tarfile
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xarray as xr

## Extract only the u and v components for one month (one of the tar files)
The grid is 135 lat × 236 lon, with one time slice every 15 minutes.

In [2]:
# file of interest

def _load_component(path, var_name):
    """Read one wind-component field from a single extracted netCDF file.

    Parameters
    ----------
    path : str
        Path of the netCDF file to open.
    var_name : str
        Preferred data-variable name. If the file has no variable with this
        name but holds exactly one data variable, that one is used instead.
        NOTE(review): assumes the variable is named after the file type
        ('USHR1' / 'VSHR1') -- TODO confirm against a real file.

    Returns
    -------
    xarray.DataArray
        The field, fully loaded into memory so it stays usable after the file
        is closed and the extracted folder is deleted.

    Raises
    ------
    KeyError
        If the variable cannot be identified unambiguously.
    """
    with xr.open_dataset(path) as ds:
        if var_name in ds.data_vars:
            field=ds[var_name]
        elif len(ds.data_vars)==1:
            field=next(iter(ds.data_vars.values()))
        else:
            raise KeyError("Cannot find variable %r in %s (available: %s)"%(var_name,path,list(ds.data_vars)))
        return field.load()

## create a list of all tar files. Need to have saved the downloaded files into a folder titled ncar_data.
## Sorted so that the chosen file does not depend on directory listing order.
file_list=sorted(glob.glob('ncar_data/*'))
if not file_list:
    raise FileNotFoundError("No tar files found in 'ncar_data/'; save the downloaded archives there first.")
## choose tar file here. This is just the first one in the sorted list
temp_file=file_list[0]

## Six-character tag at the end of the archive name (before the extension).
## NOTE(review): assumes this tag is also the name of the top-level folder inside the archive -- TODO confirm
month_tag=os.path.basename(temp_file).split('.')[0][-6:]
extracted_dir=os.path.join('ncar_extracted',month_tag)

## initialize timer and dictionaries to store u and v values
start_time=time.time()
u_dict={}
v_dict={}

try:
    # extract tarfile; use the safe 'data' extraction filter where this Python version supports it
    extract_kwargs={'filter':'data'} if hasattr(tarfile,'data_filter') else {}
    with tarfile.open(temp_file, 'r') as tar:
        tar.extractall(path='ncar_extracted', **extract_kwargs)

    ## create list of all files within the extracted folder.
    ## They are separated into separate files for USHR (u comp), VSHR (v comp), and SRH001
    ## (storm relative helicity, which I ignored) every 15 minutes.
    ## Sorted so the slices are concatenated in file-name order rather than in arbitrary glob order.
    subfile_list=sorted(glob.glob(os.path.join(extracted_dir,'*')))

    ## iterate through list and add the slices for the u and v comps to the corresponding dictionaries
    for subfile in subfile_list:
        subfile_name=os.path.basename(subfile)
        if 'USHR1' in subfile_name:
            u_dict['u_'+subfile[-20:-3]]=_load_component(subfile,'USHR1')
        if 'VSHR1' in subfile_name:
            v_dict['v_'+subfile[-20:-3]]=_load_component(subfile,'VSHR1')

    if not u_dict or not v_dict:
        raise FileNotFoundError("No USHR1/VSHR1 files found in %r after extracting %r"%(extracted_dir,temp_file))

    ## concatenate saved xarrays across the 15 minute intervals
    all_u=xr.concat(list(u_dict.values()), dim='Time')
    all_v=xr.concat(list(v_dict.values()), dim='Time')

    ## combine the u and v components into 1 array
    combined_array=xr.Dataset({'u':all_u,'v':all_v})

    ## save file (can comment out); make sure the output folder exists first
    os.makedirs('cleaned_monthly',exist_ok=True)
    combined_array.to_netcdf('cleaned_monthly/'+month_tag+'.nc')
finally:
    ## remove the extracted files (they are really big), even if a step above failed
    shutil.rmtree(extracted_dir,ignore_errors=True)

## print elapsed time and size
end_time=time.time()
elapsed_time=end_time-start_time
file_size=combined_array.nbytes/1e9
print('The code took %.2f seconds, and the file is %.5f gigs'%(elapsed_time,file_size))

The code took 24.69 seconds, and the file is 0.37953 gigs


In [3]:
## scale the single-archive timing and size up to every archive in file_list
number_files=len(file_list)
total_size=number_files*file_size
## elapsed_time is in seconds; dividing by 3600 expresses the projection in hours
total_time=elapsed_time*number_files/3600
summary_template='If we run for all %i files, the code will take %.2f hours and  %.2f gigs of storage'
print(summary_template%(number_files,total_time,total_size))

If we run for all 516 files, the code will take 3.54 hours and  195.84 gigs of storage
