In [7]:
import numpy as np
import pandas as pd
import xarray as xr
import datetime as dt

import os
import re
import urllib.request
from urllib.request import Request, urlopen, urlretrieve
from bs4 import BeautifulSoup
from glob import glob


In [8]:
# pull HTML index file and make it into a list
# (modified from https://stackoverflow.com/questions/11023530/python-to-list-http-files-and-directories)
# filter list by .hdf files and extract those filenames
# open each of these .hdf files with xarray via urlretrieve()

def get_data_urls(url):
    """Return the URLs of the ``.hdf`` files linked from an HTML index page.

    The page at ``url`` is downloaded and every ``<a>`` element is inspected.
    The visible text of each link is treated as a file name and appended to
    ``url``, so ``url`` should end with ``/`` and the index page should show
    complete file names as link text (TODO confirm if used on another server).

    Parameters
    ----------
    url : str
        Address of the directory index. Spaces are percent-encoded.

    Returns
    -------
    list of str
        One URL per link whose text ends in ``.hdf``, in page order.

    Raises
    ------
    urllib.error.URLError
        If the index page cannot be fetched.
    """
    url = url.replace(" ", "%20")

    # Read the page inside a context manager so the HTTP response is closed.
    with urlopen(Request(url)) as response:
        page = response.read()
    soup = BeautifulSoup(page, 'html.parser')

    datafile_list = []
    for link in soup.find_all('a'):
        file_name = link.get_text().strip()
        # Require a literal ".hdf" suffix on the file name itself. The earlier
        # pattern ".*hdf" accepted any URL that merely contained "hdf"
        # (e.g. "x.hdf.xml", or every link if the base URL contained "hdf").
        if file_name.lower().endswith('.hdf'):
            datafile_list.append((url + file_name).replace(" ", "%20"))

    return datafile_list

        

    

In [9]:
# function to open a .hdf datafile from the Wimsoft site by running it through urlretrieve()

def pull_hdf(url):
    """Download an ``.hdf`` file and return its contents as an xarray dataset.

    ``urlretrieve`` stores the download in a temporary file. The data are
    loaded into memory and the temporary file is then closed and deleted, so
    repeated calls do not accumulate temporary files or open file handles.

    Parameters
    ----------
    url : str
        Address of the ``.hdf`` file.

    Returns
    -------
    xarray.Dataset
        The file contents, fully loaded into memory.
    """
    local_filename, _ = urllib.request.urlretrieve(url)
    try:
        # Load eagerly: a lazily opened dataset would still need the temp file.
        with xr.open_dataset(local_filename) as ds:
            return ds.load()
    finally:
        os.remove(local_filename)


In [10]:
# function to clean up the .hdf datafile from the Wimsoft site 

def clean_hdf(dataset, year, doy, coords=None):
    """Label, time-stamp and convert one daily SST dataset.

    Parameters
    ----------
    dataset : xarray.Dataset
        Daily file with dimensions ``fakeDim0``/``fakeDim1`` and a variable
        named ``sst_<year><doy>``. It is not modified.
    year : str or int
        Four-digit year of the file.
    doy : str or int
        Day of year (1-based); zero-padded to three digits for the variable name.
    coords : xarray.Dataset, optional
        Dataset holding 2-D ``Latitude`` and ``Longitude`` variables. Defaults
        to the notebook-level ``sst_coords``.

    Returns
    -------
    xarray.Dataset
        New dataset with ``lat``/``lon`` coordinates, a scalar ``time``
        coordinate and an ``sst`` variable converted to degC, with invalid
        pixels set to NaN.
    """
    if coords is None:
        coords = sst_coords

    var_name = 'sst_{}{:03d}'.format(int(year), int(doy))

    # Use the first column of Latitude and the first row of Longitude as the
    # 1-D axes (assumes a regular grid -- TODO confirm). assign_coords returns
    # a new object, so the caller's dataset is left untouched.
    ds = (dataset
          .assign_coords(fakeDim0=coords['Latitude'].values[:, 0],
                         fakeDim1=coords['Longitude'].values[0, :])
          .rename({'fakeDim0': 'lat',
                   'fakeDim1': 'lon',
                   var_name: 'sst'}))

    # A scalar time coordinate lets the daily datasets be concatenated on 'time'.
    date = dt.datetime(int(year), 1, 1) + dt.timedelta(days=int(doy) - 1)
    ds = ds.assign_coords(time=date)

    # The pixel bytes are meant to be unsigned but are read as signed.
    # Widen 1-byte integers first so that adding 256 cannot overflow.
    raw = ds['sst']
    if raw.dtype.kind == 'i' and raw.dtype.itemsize < 2:
        raw = raw.astype('int16')
    unsigned = raw.where(raw >= 0, raw + 256)

    # Mask invalid pixels on the corrected values. Testing the signed values
    # (as before) never matched 255, because it is stored as -1.
    # NOTE(review): the original header comment lists only 0 and 255 as
    # invalid, but the code also dropped 1; that is kept here -- confirm.
    invalid = (unsigned == 0) | (unsigned == 1) | (unsigned == 255)

    # convert sst pixel values to degC
    ds['sst'] = 0.15 * unsigned.where(~invalid) - 3.0

    return ds

In [11]:
# The latitude/longitude grid could be fetched straight from the Wimsoft site,
# but that is slow on a poor connection, so a local copy is read instead.
# Remote alternative:
# sst_coords = pull_hdf("http://wimsoft.com/CAL/files/cal_aco_3840_Latitude_Longitude.hdf")

coords_path = "../../../Raw_Data/Wimsoft_SST/cal_aco_3840_Latitude_Longitude.hdf"
sst_coords = xr.open_dataset(coords_path)





In [12]:
# Download every daily SST file for each year, merge the days along 'time'
# and write one netCDF file per year.

# year_list = [str(yr) for yr in range(2000, 2021)]

year_list = [str(yr) for yr in range(2015, 2020)]
out_dir = 'SST_winsoft_dataset'

# Create the output folder up front so to_netcdf() cannot fail on a missing
# directory after a long download.
os.makedirs(out_dir, exist_ok=True)

for year in year_list:
    index_url = 'https://www.wimsoft.com/CAL/' + year + '/M' + year + '_sst_day/'
    url_list = get_data_urls(index_url)
    print('found {} data files for {}'.format(len(url_list), year))

    # With no files there is nothing to merge; skip the year instead of
    # re-writing whatever the previous iteration left in ds_all.
    if not url_list:
        print('no data files for {}, skipping'.format(year))
        continue

    daily = []
    for url in url_list:
        # File names look like M<yyyy><ddd>_sst_comp.hdf; read the day of year
        # from the file name rather than from underscores in the whole URL.
        file_name = url.rsplit('/', 1)[-1]
        match = re.match(r'M\d{4}(\d{3})_', file_name)
        if match is None:
            raise ValueError('cannot read day of year from {}'.format(url))
        doy = match.group(1)

        daily.append(clean_hdf(pull_hdf(url), year, doy))
        print('concatenating {}_{}'.format(year, doy))

    # One concat per year avoids re-copying the accumulated data for every file.
    ds_all = xr.concat(daily, dim='time').sortby('time')
    print('writing year {} to netcdf'.format(year))
    ds_all.to_netcdf(out_dir + '/merged_sst_' + str(year) + '2.nc', mode='w')

found 365 data files for 2015
concatenating 2015_001
concatenating 2015_002
concatenating 2015_003
concatenating 2015_004
concatenating 2015_005
concatenating 2015_006
concatenating 2015_007
concatenating 2015_008
concatenating 2015_009
concatenating 2015_010
concatenating 2015_011
concatenating 2015_012
concatenating 2015_013
concatenating 2015_014
concatenating 2015_015
concatenating 2015_016
concatenating 2015_017
concatenating 2015_018
concatenating 2015_019
concatenating 2015_020
concatenating 2015_021
concatenating 2015_022
concatenating 2015_023
concatenating 2015_024
concatenating 2015_025
concatenating 2015_026
concatenating 2015_027
concatenating 2015_028
concatenating 2015_029
concatenating 2015_030
concatenating 2015_031
concatenating 2015_032
concatenating 2015_033
concatenating 2015_034
concatenating 2015_035
concatenating 2015_036
concatenating 2015_037
concatenating 2015_038
concatenating 2015_039
concatenating 2015_040
concatenating 2015_041
concatenating 2015_042
conc

KeyboardInterrupt: 

In [None]:
# Scratch cell: attach the latitude/longitude axes from sst_coords to a test dataset.
# NOTE(review): `test_sst` is not defined in any cell visible here, so this
# cell fails on a fresh kernel -- define it first or remove the cell.
test_sst['fakeDim0'] = sst_coords['Latitude'].values[:,0]
test_sst['fakeDim1'] = sst_coords['Longitude'].values[0,:]

# test_sst.rename({'fakeDim0': 'lat', 'fakeDim1': 'lon'})

In [None]:
# Check of the day-of-year extraction used in the download loop; evaluates to '001'.
'https://www.wimsoft.com/CAL/2011/M2011_sst_day/M2011001_sst_comp.hdf'.split('_')[2][-3:]

In [None]:
# Scratch cell: fetch the 2011 day-001 file on its own (presumably for manual inspection).
test_sst_2011 = pull_hdf('https://www.wimsoft.com/CAL/2011/M2011_sst_day/M2011001_sst_comp.hdf')

In [None]:
# Scratch cell: fetch the 2012 day-001 file on its own (presumably for manual inspection).
test_sst_2012 = pull_hdf('https://www.wimsoft.com/CAL/2012/M2012_sst_day/M2012001_sst_comp.hdf')

In [None]:
# Scratch cell: fetch the 2013 day-001 file on its own (presumably for manual inspection).
test_sst_2013 = pull_hdf('https://www.wimsoft.com/CAL/2013/M2013_sst_day/M2013001_sst_comp.hdf')

In [None]:
# Same day-of-year extraction check for a 2013 URL; evaluates to '001'.
'https://www.wimsoft.com/CAL/2013/M2013_sst_day/M2013001_sst_comp.hdf'.split('_')[2][-3:]