## Data preprocessing

In [2]:
# load necessary packages
import netCDF4 as nc
import numpy as np
import pandas as pd
import xarray as xr

### Temperature data

- This data is pre-processed in `main.ipynb` in order to avoid a large intermediate `.csv` file.
- Here we just extract the coordinates for later use.

In [3]:
# Open the NetCDF4 file inside a context manager so the underlying file handle
# is released as soon as the values have been copied into pandas.
with xr.open_dataset("../data/downloads/temperature_ORIG.nc") as temperature_ds:
    # flatten the `tempanomaly` variable into one row per (lat, lon, time)
    # combination; `dim_order` makes lat/lon the leading index levels, so they
    # become the first two columns after `reset_index()`
    globalT = (
        temperature_ds["tempanomaly"]
        .to_dataframe(dim_order=["lat", "lon", "time"])["tempanomaly"]
        .reset_index()
    )

# extract lat/lon: the label slice picks the columns from "lat" through "lon",
# and drop_duplicates keeps each (lat, lon) pair once
coords = globalT.loc[:, "lat":"lon"].drop_duplicates()

# export to csv (the pandas row index is not written)
coords.to_csv("../data/processed/temp_coordinates.csv", index=False)

### CO<sub>2</sub> Data

- Removes the preamble from the `.csv` files

In [4]:
# Read both CO2 downloads, discarding the preamble lines at the top of each
# file (55 lines for the global file, 56 for the mauna file).
world = pd.read_csv(
    "../data/downloads/global_co2_ORIG.csv",
    sep=",",
    skiprows=55,
)
mauna = pd.read_csv(
    "../data/downloads/mauna_co2_ORIG.csv",
    sep=",",
    skiprows=56,
)

# Write each trimmed table to the processed folder without the pandas row index.
for frame, destination in (
    (world, "../data/processed/global_co2_PRO.csv"),
    (mauna, "../data/processed/mauna_co2_PRO.csv"),
):
    frame.to_csv(destination, index=False)

### Volcano Data

- Counts the number of volcanic eruptions per year.

- Note: before running this cell, you must manually copy the "Start Year" column from `volcano_RAW.xls` into `../data/downloads/volcano_ORIG.csv`, using `year` as the column header.

In [6]:
# Load the manually prepared eruption table.
volcano = pd.read_csv("../data/downloads/volcano_ORIG.csv")

# The `year` column holds one entry per row of the input file.
years = volcano["year"]

# Tally how often each year occurs (value_counts skips missing values).
counts = years.value_counts()

# Reshape the tally into a two-column table (year, count) and order it
# chronologically.
data = (
    counts
    .rename_axis("year")
    .reset_index(name="count")
    .sort_values("year", ascending=True)
)

# Save the per-year counts without the pandas row index.
data.to_csv("../data/processed/volcano_PRO.csv", index=False)

### Irradiance Data

- Replaces the decimal `time` variable with a calendar `year` column that starts at 1610 (one row per year).
- Converts the `.nc` file to a more usable `.csv` file.

In [7]:
# first calendar year of the irradiance record
# NOTE(review): assumes the file holds one record per consecutive year
# starting in 1610 — confirm against the `time` variable of the download.
IRRADIANCE_START_YEAR = 1610

# import NetCDF4 file; the context manager closes the file handle once the
# variables have been read into memory
with nc.Dataset("../data/downloads/irradiance_ORIG.nc") as data:
    # create pandas series for each variable (slicing with [:] reads the values)
    TSI = pd.Series(data["TSI"][:])
    TSI_UNC = pd.Series(data["TSI_UNC"][:])
    # `time` is read for reference only; it is not written to the CSV
    time = pd.Series(data["time"][:])
    # lower / upper bound of each record's time interval
    time_bnds_lwr = pd.Series(data["time_bnds"][:, 0])
    time_bnds_upr = pd.Series(data["time_bnds"][:, 1])

# year variable: one year per TSI record. Deriving the end from the data length
# (instead of hard-coding 2023) keeps the cell working when the download
# contains a different number of records.
year = np.arange(IRRADIANCE_START_YEAR, IRRADIANCE_START_YEAR + len(TSI))

# create dictionary to convert to pandas DataFrame
for_df = {
    "year": year,
    "TSI": TSI,
    "TSI_UNC": TSI_UNC,
    "time_bnds_lwr": time_bnds_lwr,
    "time_bnds_upr": time_bnds_upr,
}

# dataframe
df = pd.DataFrame(for_df)

# export dataframe to CSV (the pandas row index is not written)
df.to_csv("../data/processed/irradiance_PRO.csv", index=False)