# Large data set

* DINEOF analysis of Western Mediterranean sea surface temperature.
* Download file and view content of NetCDF file with the following:

In [None]:
using PyPlot
using NCDatasets
using Missings

Helper function for plotting transposed arrays or arrays with missing data

In [None]:
using PyCall
using PyCall: PyObject

# Allow plotting of arrays that contain missing values: such an array is handed to
# Python as a numpy masked array in which every missing entry is replaced by zero(T)
# and flagged in the mask.
function PyObject(a::Array{Union{T,Missing},N}) where {T,N}
    ma = PyCall.pyimport("numpy").ma
    filled = coalesce.(a, zero(T))
    hidden = ismissing.(a)
    return pycall(ma.array, Any, filled; mask=hidden)
end


Download data file

In [None]:
# Fetch the data file only when it is not already present in the working directory.
isfile("WesternMedSST.nc") ||
    download("https://dox.ulg.ac.be/index.php/s/XkNUzGGVtnSCdT3/download", "WesternMedSST.nc")

# Display the content of the NetCDF file as the output of this cell.
Dataset("WesternMedSST.nc")

# Useful functions

 * display the content of a NetCDF file.
```julia
Dataset("WesternMedSST.nc")
```

 * Read a variable from a NetCDF file.
```julia
ds = Dataset("WesternMedSST.nc")
SST = ds["seviri_sst_filled"][:]
close(ds)
```

More info at https://github.com/Alexander-Barth/NCDatasets.jl

# Example

Load modules and setup some helper functions

In [None]:
using NCDatasets
using PyPlot

# Helper function for date ticks.
#
# Format the major tick labels of one axis of the current axes as dates.
#   axis: `:x` selects the x-axis; any other value selects the y-axis.
#   fmt:  format string passed to matplotlib's DateFormatter (default "%Y-%m-%d").
function datetick(axis, fmt = "%Y-%m-%d")
    ax = gca()
    # Python attributes are reached with `.`; the `obj[:name]` form is deprecated in PyCall.
    formatter = matplotlib.dates.DateFormatter(fmt)
    target = axis == :x ? ax.xaxis : ax.yaxis
    return target.set_major_formatter(formatter)
end



In [None]:
using DelimitedFiles
sl  = readdlm("8762075.sealevel.txt",comments=true, comment_char='%')


In [None]:
using Dates
DateTime([sl[1,1:5]; 0]...) 


In [None]:

DateTime(sl[1,1],sl[1,2],sl[1,3],sl[1,4],sl[1,5],0)

In [None]:
[i^2 for i = 1:10]

In [None]:
size(sl,1)

In [None]:
# One DateTime per row of `sl`, built from columns 1 to 5 with the seconds set to 0.
t = map(1:size(sl,1)) do i
    DateTime(sl[i,1], sl[i,2], sl[i,3], sl[i,4], sl[i,5], 0)
end;


In [None]:
plot(t[:],sl[:,6],"g")


In [None]:
fname = "WesternMedSST.nc";

# The do-block form closes the file even when one of the reads throws.
lon, lat, times, SST, mask = Dataset(fname) do ds
    (
        ds["lon"][:],
        ds["lat"][:],
        nomissing(ds["time"][:]),
        # SST is indexed as SST[i,j,k] further on, so load it with one colon per
        # dimension to be sure the array keeps its three dimensions.
        ds["seviri_sst"][:,:,:],
        ds["mask"][:],
    )
end;


In [None]:
times

In [None]:
# Second time instance of the SST field. The array is transposed so that its first
# dimension runs along lon (x-axis) and its second along lat (y-axis).
pcolor(lon,lat,SST[:,:,2]'); colorbar();

In [None]:
SSTdiff = SST[:,:,2] - SST[:,:,1]
pcolor(lon,lat,SSTdiff'); colorbar();

### Plot the first time instance of the data set with pcolor.

In [None]:
ds = Dataset("WesternMedSST.nc")
lon = ds["lon"][:]
lat = ds["lat"][:]
close(ds)

pcolor(lon,lat,SST[:,:,1]');

In [None]:
pcolor(lon,lat,SST[:,:,1]');
colorbar();


In [None]:
SST[1,1,1]

In [None]:
ismissing(SST[1,1,1])

In [None]:
k = 1
# Becomes 1 when the grid point (1,1) of time instance k holds a valid value, else stays 0.
count = 0
if !ismissing(SST[1,1,k])
    count += 1
end
count

### Plot the percentage of valid data grid points over time.

In [None]:
imax = size(SST,1)
jmax = size(SST,2)
kmax = size(SST,3)


In [None]:
imax, jmax, kmax = size(SST)

In [None]:
# Number of valid (non-missing) grid points for every time instance.
nbpixels = zeros(kmax)

for k = 1:kmax
    count = 0

    for j = 1:jmax
        for i = 1:imax
            if !ismissing(SST[i,j,k])
                count = count + 1
            end
        end
    end

    # Store the total once per time instance, after the whole field has been scanned.
    nbpixels[k] = count
end

# Percentage relative to all imax*jmax grid points of the domain.
percentage = 100 * nbpixels/(imax*jmax)


In [None]:
plot(percentage);

In [None]:
sum(sum(.!ismissing.(SST[:,:,1]),dims = 1),dims = 2)

In [None]:

sum(.!ismissing.(SST[:,:,1]),dims = [1,2])

In [None]:
# For every grid point, the percentage of time instances holding a valid value.
count = zeros(imax, jmax)
for j in 1:jmax, i in 1:imax
    nvalid = 0
    for k in 1:kmax
        ismissing(SST[i,j,k]) || (nvalid += 1)
    end

    count[i,j] = 100 * nvalid / kmax
end


In [None]:
count = 100 * dropdims(sum(.!ismissing.(SST),dims = 3),dims = 3) / kmax;


In [None]:
# Map of the per-pixel values in `count`; the two calls are separate statements.
pcolor(lon,lat,count'); colorbar();


### For all time instances, what is the percentage of sea grid points not covered by clouds?

In [None]:
# Number of valid (non-missing) grid points for each time instance.
count = zeros(Int, kmax)
for k in 1:kmax, j in 1:jmax, i in 1:imax
    ismissing(SST[i,j,k]) || (count[k] += 1)
end


In [None]:
count[1:4]

In [None]:
count = dropdims(sum(sum(.!ismissing.(SST),dims = 1),dims = 2), dims= (1,2))
count[1:4]

In [None]:
percentage = 100 * count / sum(mask)
percentage[1:10]

In [None]:
times[1:5]

In [None]:
plot(times,percentage);
datetick(:x,"%m-%d")


### Plot the time average of SST

In [None]:
# Copy SST and replace every missing value by 0 so that the entries can be summed.
SST2 = copy(SST);
SST2[ismissing.(SST)] .= 0;
# Number of valid (non-missing) values along the third dimension, for every grid point.
count = sum(.!ismissing.(SST),dims = 3)
# Sum divided by the number of valid values; allowmissing lets the result hold `missing`.
meanSST = allowmissing(sum(SST2,dims = 3) ./ count);
# Grid points without any valid value are marked as missing.
meanSST[count .== 0] .= missing;


pcolor(lon,lat,meanSST[:,:,1]');   colorbar()


### Plot the space average of SST 

* assuming that all pixels have the same area

In [None]:
meanSSTt = sum(sum(SST2,dims = 1),dims = 2) ./ sum(sum(.!ismissing.(SST),dims = 1),dims = 2);
meanSSTt = dropdims(meanSSTt,dims = (1,2));
plot(times,meanSSTt)
datetick(:x,"%m-%d")


### Make a time series with the number of pixels with a temperature larger than 25 degrees Celsius.

In [None]:
# For each time instance, the number of grid points warmer than 25 degrees Celsius.
count = zeros(kmax)
for k in 1:kmax, j in 1:jmax, i in 1:imax
    # `missing > 25` evaluates to `missing`; coalesce turns that into "not above".
    if coalesce(SST[i,j,k] > 25, false)
        count[k] += 1
    end
end


In [None]:
count[1:5]

In [None]:
plot(times,count)
datetick(:x,"%m-%d")


### Make a time series of the area (in km²) with a temperature larger than 25 degrees Celsius

In [None]:
# Earth Radius (in km)
R = 6371;
# surface of each cell
# Both lengths use an angular step of 0.05 degrees — presumably the grid spacing of
# lon and lat; TODO confirm against the coordinate variables.
# NOTE(review): dx has no latitude dependence while dy carries the cos(lat) factor, so
# the two names look swapped relative to the usual x = longitude convention; the
# product dx * dy[j] is not affected.
# dx: length (in km) of a 0.05 degree arc on a circle of radius R
dx = pi * 0.05 * R/180;
# dy: the same arc length scaled by the cosine of the latitude, one value per element of lat
dy = pi * 0.05 * R/180 * cos.(pi*lat/180);


In [None]:
# For each time instance, the summed cell area of the grid points warmer than 25 degrees Celsius.
area = zeros(kmax)
for k in 1:kmax, j in 1:jmax
    # The cell area depends only on the index j.
    cellarea = dx * dy[j]
    for i in 1:imax
        if coalesce(SST[i,j,k] > 25, false)
            area[k] += cellarea
        end
    end
end


In [None]:
plot(times,area)
datetick(:x,"%m-%d")


In [None]:
# Time average of SST at every grid point, skipping the missing values.
average = zeros(imax, jmax)

for i in 1:imax, j in 1:jmax
    mymean = 0.
    count = 0

    for k in 1:kmax
        value = SST[i,j,k]
        ismissing(value) && continue
        mymean += value
        count += 1
    end

    # A grid point without any valid value gives 0/0, i.e. NaN.
    average[i,j] = mymean / count
end



In [None]:
pcolor(average'); colorbar();


Ideas for exercises
* Compute the mean over time for every pixel
* Compute the standard deviation over time for every pixel
* Make a map with the minimum temperature
* Make a map with the time index at which the temperature is minimum
