In [36]:
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# LAB MODULE 2.
# Observational data in climate science
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# Topics covered:
# (1) Organization of geospatial data in python
# (2) Operations in time
# (3) Operations in space 

# The purpose of this exercise is to (partially) replicate the process of 
# building of a global mean tempertaure anomaly curve from observations 

# Links: 
#   https://journals.ametsoc.org/doi/pdf/10.1175/1520-0450%281986%29025%3C0161%3ANHSATV%3E2.0.CO%3B2
#   https://www.metoffice.gov.uk/hadobs/crutem4/data/download.html  >>>  CRUTEM.4.6.0.0.station_files.zip

In [37]:
# Libraries

import os

# A bunch of other libraries that may be useful

import numpy as np
import pandas as pd
import geopandas as gpd

import datetime as dtm
from datetime import datetime

import matplotlib.pyplot as plt
import cartopy.crs as ccrs

Qua vengono caricate tutte le informazioni sul dataset

In [38]:
#-------------------------------------------------
#-- get info on database and files organization --
#-------------------------------------------------

flist = [os.path.join(path, name) for path, subdirs, files in os.walk("./CRUTEM.4.6.0.0.station_files/") for name in files]
for i in range(0,5):
    print (flist[i])

# exclude first 2 items, not relevant
flist=flist[2:]
print ('\n',flist[0:5])

nst=len(flist)
print ("\n > Number of stations = ",nst)

./CRUTEM.4.6.0.0.station_files/Index
./CRUTEM.4.6.0.0.station_files/61/616120
./CRUTEM.4.6.0.0.station_files/61/612913
./CRUTEM.4.6.0.0.station_files/61/612770
./CRUTEM.4.6.0.0.station_files/61/614010

 ['./CRUTEM.4.6.0.0.station_files/61/612913', './CRUTEM.4.6.0.0.station_files/61/612770', './CRUTEM.4.6.0.0.station_files/61/614010', './CRUTEM.4.6.0.0.station_files/61/618470', './CRUTEM.4.6.0.0.station_files/61/614420']

 > Number of stations =  10295


Notiamo quindi che abbiamo circa 10k stazioni

In [39]:
# Now let's have a look at one of them
# You can open the text files direclty with a text editor 
# Check as well the template with explanations: "crutem4_data_station_file_format.txt"

In [40]:
# Task 1 
# We need to import the data in python and organize them for further analyses and operations, both in time and space.

# Try to start thinking of a suitable way to organize this dataset, consisting of ~10,000 stations, each one having:
# - a unique identifier
# - string metadata (e.g. name, country)
# - numerical metadata (e.g. latitude, longitude, elevation) >> lat & lon are also the spatial dimensions we'll need
# - other metadata
# - a time series at monthly resolution oragnized as 2D tabs (year, month)

# Consider the data structures you know of, keeping in mind the final purpose that will require somehow 
# aggregating this dataset in time and space. 

# Bear in mind that this is real raw data that could have incomplete/erroneous file coding etc.
# Other than that, we will assume that all the reported data are correct and directly usable, 
#   considering the reality that time series have missing data and different time span.

In [41]:
# We will consider the actual time & space operation in the next stages.

Questi sono dati molto molto grezzi, come prima cosa cerchiamo di capire come aprirli, assumiamo che onon ci siano errori nel modo in cui sono stati acquisiti i dati, poi ce ne occuperemo.

In [42]:
dati = pd.read_csv("CRUTEM.4.6.0.0.station_files/01/010010", sep = ",")

In [43]:
metadati = dati.iloc[0:19]
metadati_corretti = pd.DataFrame(metadati['Number= 010010'].str.split('=',1).tolist(),
                                 columns = ['metadato','description']) 

In [44]:
metadati_corretti

Unnamed: 0,metadato,description
0,Name,Jan Mayen
1,Country,NORWAY
2,Lat,70.9
3,Long,8.7
4,Height,10
5,Start year,1921
6,End year,2020
7,First Good year,1921
8,Source ID,56
9,Source file,Jones


In [60]:
def opens_file(file):
    file_aperto = pd.read_csv(file, sep = '\s+', skiprows=21, engine = 'python', header = None)
    file_aperto_2 = pd.read_csv(file, sep = ",")
    osservazioni = file_aperto.iloc[21:]
    metadati = file_aperto_2.iloc[0:19]
    metadati_corretti = pd.DataFrame(metadati.iloc[:,0].str.split('=',1).tolist(),
                                 columns = ['metadato','description']) 
    matadati_dict = metadati_corretti.set_index('metadato').to_dict()
    matadati_dict['filename'] = str(file)
    matadati_dict['osservazioni'] = osservazioni
    return matadati_dict

In [87]:
metadati_tot = [opens_file(file) for file in flist[0:10]]

In [94]:
metadati_tot[4][1]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
21,1962,22.8,23.1,25.9,25.2,25.7,28.1,27.8,29.3,29.5,...,506,506,506,506,506,506,506,506,506,506
22,1963,23.5,23.6,26.3,24.8,28.1,28.3,28.8,29.8,30.7,...,506,506,506,506,506,506,506,506,506,506
23,1964,21.2,23.2,24.9,25.3,26.5,26.6,27.9,28.0,28.4,...,506,506,506,506,506,506,506,506,506,506
24,1965,18.6,21.8,25.2,24.2,27.2,25.6,27.0,27.9,29.4,...,506,506,506,506,506,506,506,506,506,506
25,1966,21.1,24.1,24.3,25.6,26.0,28.2,26.6,28.6,29.1,...,506,506,506,506,506,506,506,506,506,506
26,1967,22.1,23.7,24.7,23.1,26.6,27.5,26.9,28.7,29.3,...,506,506,506,506,506,506,506,506,506,506
27,1968,19.0,21.1,23.0,24.1,24.7,25.8,26.6,28.1,30.0,...,506,506,506,506,506,506,506,506,506,506
28,1969,22.9,26.0,26.1,26.3,29.3,27.5,28.2,27.8,29.9,...,506,506,506,506,506,506,506,506,506,506
29,1970,22.8,25.0,25.1,27.1,22.4,29.2,26.3,28.6,29.6,...,506,506,506,506,506,506,506,506,506,506
30,1971,21.8,21.5,21.1,22.8,23.8,-99.0,-99.0,27.4,28.7,...,506,506,506,-99,-99,506,506,-99,506,-99
