# Climate Data Preprocessing

## Download data

In [1]:
import pandas as pd

# Render floats in displayed frames with thousands separators and two decimals.
pd.set_option("display.float_format", "{:,.2f}".format)

In [2]:
import glob
import os
import zipfile

def unpack_data(data_dir='../data/climate'):
    """Extract every ``.zip`` archive found in ``data_dir``, then delete it.

    Args:
        data_dir: Directory that is scanned for ``*.zip`` files and that also
            receives the extracted members. Defaults to ``'../data/climate'``.

    Returns:
        None.

    Raises:
        zipfile.BadZipFile: If a matched file is not a valid zip archive. The
            offending archive is left on disk so it can be inspected.
    """
    # sorted() keeps the processing order deterministic across file systems.
    for archive_path in sorted(glob.glob(os.path.join(data_dir, '*.zip'))):
        # The zipfile module replaces the former `tar -xf` shell call: no
        # external tool is required and paths with spaces or shell
        # metacharacters are handled safely.
        with zipfile.ZipFile(archive_path) as archive:
            archive.extractall(data_dir)
        # Only reached when extraction succeeded, so a failed extraction
        # never deletes the downloaded archive.
        os.remove(archive_path)

In [3]:
import cdsapi

def data_download(year):
    """Request one year of global CRU gridded observations from the CDS.

    The request asks for monthly precipitation and temperature (maximum and
    mean statistics) on the 0.5 x 0.5 degree grid, version v4.03, delivered as
    a zip file saved to ``../data/climate/climate_<year>.zip``. After the
    download, ``unpack_data()`` is called.

    Args:
        year: Year to request; it is formatted into both the request and the
            target file name.

    Returns:
        Whatever ``unpack_data()`` returns.
    """
    client = cdsapi.Client()

    request = {
        'origin': 'cru',
        'region': 'global',
        'variable': ['precipitation', 'temperature'],
        'statistic': ['maximum', 'mean'],
        'time_aggregation': 'monthly',
        'horizontal_aggregation': '0_5_x_0_5',
        'year': f'{year}',
        'version': 'v4.03',
        'format': 'zip',
    }
    target = f'../data/climate/climate_{year}.zip'

    client.retrieve(
        'insitu-gridded-observations-global-and-regional',
        request,
        target,
    )

    return unpack_data()

In [4]:
# Inclusive range of years to fetch; one CDS request is sent per year.
start_year = 2000
end_year = 2000

for year in range(start_year, end_year + 1):
    data_download(year)

2023-06-13 09:45:18,376 INFO Welcome to the CDS
2023-06-13 09:45:18,378 INFO Sending request to https://cds.climate.copernicus.eu/api/v2/resources/insitu-gridded-observations-global-and-regional
2023-06-13 09:45:18,688 INFO Request is queued
2023-06-13 09:45:19,714 INFO Request is running
2023-06-13 09:45:21,249 INFO Request is completed
2023-06-13 09:45:21,258 INFO Downloading https://download-0010-clone.copernicus-climate.eu/cache-compute-0010/cache/data4/dataset-insitu-gridded-observations-global-and-regional-7c3a4b4f-51c0-4df6-9258-de6b7fec3e0e.zip to ../data/climate/climate_2000.zip (3.9M)
2023-06-13 09:45:22,592 INFO Download rate 3M/s     


## Select data

In [5]:
import xarray as xr
import pandas as pd

# Bounding box of the study area, in degrees (used as inclusive limits when
# filtering grid points).
lat_low, lat_high = 32.75, 52
lon_low, lon_high = -10.25, 50

def climate_preprocessing(file_path):
    """Load one NetCDF file into a flat DataFrame limited to the bounding box.

    The dataset is converted to a DataFrame, its index levels become regular
    columns, the ``time`` column is replaced by integer ``year`` and ``month``
    columns, and only rows whose ``lat``/``lon`` lie inside the module-level
    bounds ``lat_low``/``lat_high``/``lon_low``/``lon_high`` (inclusive) are
    kept.

    Args:
        file_path: Path of the NetCDF file to read.

    Returns:
        pandas.DataFrame: The filtered rows. Row labels are those of the
        unfiltered frame (the index is not reset).
    """
    # The context manager closes the underlying file handle, which was
    # previously left open for every file. to_dataframe() materialises the
    # data while the file is still open.
    with xr.open_dataset(file_path, engine='netcdf4') as dataset:
        frame = dataset.to_dataframe().reset_index()

    frame['year'] = frame['time'].dt.year
    frame['month'] = frame['time'].dt.month
    frame = frame.drop(columns='time')

    # Series.between is inclusive on both ends, matching the >= / <= bounds.
    inside_lat = frame['lat'].between(lat_low, lat_high)
    inside_lon = frame['lon'].between(lon_low, lon_high)
    return frame[inside_lat & inside_lon]

In [6]:
def merge_years(climate_var, data_dir):
    """Concatenate the preprocessed frames of all NetCDF files of one variable.

    Args:
        climate_var: Substring that a file name must contain to be selected,
            e.g. ``'CRU_mean_temperature'``.
        data_dir: Directory that is searched (non-recursively) for ``.nc``
            files.

    Returns:
        pandas.DataFrame: Row-wise concatenation of ``climate_preprocessing``
        applied to each matching file, with a fresh integer index.

    Raises:
        ValueError: If no ``.nc`` file in ``data_dir`` contains
            ``climate_var`` in its name.
    """
    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order of the result reproducible between runs and machines.
    matching_files = sorted(
        name
        for name in os.listdir(data_dir)
        if name.endswith('.nc') and climate_var in name
    )
    if not matching_files:
        # pd.concat([]) would raise an opaque "No objects to concatenate";
        # fail with the same exception type but an actionable message.
        raise ValueError(
            f"No '.nc' file containing {climate_var!r} found in {data_dir!r}"
        )

    frames = [
        climate_preprocessing(os.path.join(data_dir, name))
        for name in matching_files
    ]
    return pd.concat(frames, ignore_index=True)

In [7]:
data_dir = '../data/climate'

# One frame per climate variable, each covering every downloaded year.
df_mean_temperature = merge_years('CRU_mean_temperature', data_dir)
#df_maximum_temperature = merge_years('CRU_maximum_temperature', data_dir)
df_total_precipitation = merge_years('CRU_total_precipitation', data_dir)

# Start from the temperature frame without its 'tas' column, then join each
# variable back on the grid/time keys and order rows by location and time.
df_all = (
    df_mean_temperature
    .drop(columns='tas')
    .merge(df_mean_temperature, on=['lon','lat','month','year'])
    #.merge(df_maximum_temperature, on=['lon','lat','month','year'])
    .merge(df_total_precipitation, on=['lon','lat','month','year'])
    .sort_values(['lon','lat','year','month'])
)

df_all

Unnamed: 0,lon,lat,year,month,tas,pr
339768,-10.25,32.75,2000,1,,
344487,-10.25,32.75,2000,2,,
349206,-10.25,32.75,2000,3,,
353925,-10.25,32.75,2000,4,,
358644,-10.25,32.75,2000,5,,
...,...,...,...,...,...,...
604031,49.75,51.75,2019,8,19.60,30.20
608750,49.75,51.75,2019,9,12.90,25.50
613469,49.75,51.75,2019,10,9.50,24.20
618188,49.75,51.75,2019,11,-2.60,12.20


## Increase spatial resolution

In [22]:
# Replicate every grid point at +0.25 degrees in longitude, in latitude, and
# in both. A scalar offset is used so that the upstream frame `df_all` is not
# modified (no temporary helper column is added to it).
cell_offset = 0.25

df_all_bis = df_all.assign(lon=df_all['lon'] + cell_offset)
df_all_tris = df_all.assign(lat=df_all['lat'] + cell_offset)
df_all_quatris = df_all.assign(
    lon=df_all['lon'] + cell_offset,
    lat=df_all['lat'] + cell_offset,
)

dfs = [df_all, df_all_bis, df_all_tris, df_all_quatris]

df_final = (
    pd.concat(dfs, ignore_index=True)
    # Drop the points lying exactly on the lower longitude / latitude bound.
    .loc[lambda d: (d['lon'] != lon_low) & (d['lat'] != lat_low)]
    .sort_values(['lon','lat','year','month'])
    .reset_index(drop=True)
    .rename(columns={'tas':'temperature', 'pr':'precipitation'})
)
df_final.head(15)

Unnamed: 0,lon,lat,year,month,temperature,precipitation
0,-10.0,33.0,2000,1,,
1,-10.0,33.0,2000,2,,
2,-10.0,33.0,2000,3,,
3,-10.0,33.0,2000,4,,
4,-10.0,33.0,2000,5,,
5,-10.0,33.0,2000,6,,
6,-10.0,33.0,2000,7,,
7,-10.0,33.0,2000,8,,
8,-10.0,33.0,2000,9,,
9,-10.0,33.0,2000,10,,


In [17]:
# Row count expected for a complete grid: distinct longitudes x distinct
# latitudes x 20 years x 12 months. df_final has 'lon'/'lat' columns; the
# previously referenced 'lon_bounds'/'lat_bounds' columns do not exist in it.
df_final['lon'].nunique() * df_final['lat'].nunique() * 20 * 12

4453680

In [25]:
# Lag features: the value observed 1..4 rows earlier at the SAME grid point.
# Shifting per (lon, lat) group prevents the first rows of one location from
# inheriting the last months of the previous location, which a plain
# frame-wide shift does. Relies on df_final being sorted by
# lon, lat, year, month so that consecutive rows of a group are consecutive
# months.
values_by_cell = df_final.groupby(['lon', 'lat'], sort=False)[
    ['temperature', 'precipitation']
]
for lag in range(1, 5):
    lagged = values_by_cell.shift(lag)
    df_final[f'temperature{lag}'] = lagged['temperature']
    df_final[f'precipitation{lag}'] = lagged['precipitation']
df_final

Unnamed: 0,lon,lat,year,month,temperature,precipitation,temperature1,precipitation1,temperature2,precipitation2,temperature3,precipitation3,temperature4,precipitation4
0,-10.00,33.00,2000,1,,,,,,,,,,
1,-10.00,33.00,2000,2,,,,,,,,,,
2,-10.00,33.00,2000,3,,,,,,,,,,
3,-10.00,33.00,2000,4,,,,,,,,,,
4,-10.00,33.00,2000,5,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4453675,50.00,52.00,2019,8,19.60,30.20,22.40,40.50,22.40,22.80,18.80,29.50,8.90,19.20
4453676,50.00,52.00,2019,9,12.90,25.50,19.60,30.20,22.40,40.50,22.40,22.80,18.80,29.50
4453677,50.00,52.00,2019,10,9.50,24.20,12.90,25.50,19.60,30.20,22.40,40.50,22.40,22.80
4453678,50.00,52.00,2019,11,-2.60,12.20,9.50,24.20,12.90,25.50,19.60,30.20,22.40,40.50


In [24]:
# Keep every row except those of year 2000.
# NOTE(review): presumably removed because the lag columns of the first
# months have no preceding data - confirm.
df_end = df_final.loc[df_final['year'].ne(2000)]
df_end

Unnamed: 0,lon,lat,year,month,temperature,precipitation,temperature1,precipitation1,temperature2,precipitation2,temperature3,precipitation3,temperature4,precipitation4
12,-10.00,33.00,2001,1,,,,,,,,,,
13,-10.00,33.00,2001,2,,,,,,,,,,
14,-10.00,33.00,2001,3,,,,,,,,,,
15,-10.00,33.00,2001,4,,,,,,,,,,
16,-10.00,33.00,2001,5,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4453675,50.00,52.00,2019,8,19.60,30.20,22.40,40.50,22.40,22.80,18.80,29.50,8.90,19.20
4453676,50.00,52.00,2019,9,12.90,25.50,19.60,30.20,22.40,40.50,22.40,22.80,18.80,29.50
4453677,50.00,52.00,2019,10,9.50,24.20,12.90,25.50,19.60,30.20,22.40,40.50,22.40,22.80
4453678,50.00,52.00,2019,11,-2.60,12.20,9.50,24.20,12.90,25.50,19.60,30.20,22.40,40.50


## Save data in CSV file

In [26]:
# index=True is the pandas default: the row labels are written as the first,
# unnamed column of the file.
df_end.to_csv('../data/climate.csv', index=True)