In [2]:
import requests

import pandas as pd

All GPM data (in data/gpm) is downloaded from Giovanni prior to this EDA.

The metadata needed to reproduce each dataset is included at the top of the corresponding GPM CSV.

In [98]:
# Explore one state's GPM export on its own before combining every state
# into the final precipitation dataset.

# The first 8 lines of the file hold metadata rather than data rows, so skip them.
df = pd.read_csv(
    'data/gpm/iowa_gpm.csv',
    skiprows=8,
)
df.head()

Unnamed: 0,time,mean_GPM_3IMERGM_06_precipitation
0,2013-04-01 00:00:00,163.784744
1,2013-05-01 00:00:00,217.038086
2,2013-06-01 00:00:00,112.705574
3,2013-07-01 00:00:00,48.736252
4,2013-08-01 00:00:00,39.590923


In [99]:
# Keep only the growing-season months (April through November).
validMonths = ['04', '05', '06', '07', '08', '09', '10', '11']
# A hyphen needs no escaping in a regex; the previous "\-" was an invalid
# Python string escape and triggers a warning on recent interpreters.
regex = "|".join(f"-{month}-" for month in validMonths)

# Build the result in one chain so the new column is added to a fresh object
# rather than to a slice of the original frame (avoids SettingWithCopyWarning).
df = (
    df[df['time'].str.contains(regex)]
    # Simplify the precipitation column name (the raw header has a leading space).
    .rename(columns={' mean_GPM_3IMERGM_06_precipitation': 'precip'})
    # The year is the first four characters of the timestamp string.
    .assign(year=lambda frame: frame['time'].str[:4])
)


It appears that this dataset is missing values beyond '2021-09-01'.

This is due to an upgrade currently occurring on the MODIS satellite.

Luckily, we can simply impute the values for 2021-10-01 and 2021-11-01 using mean-value imputation.

In [107]:

# Mean-value imputation for now. If results become messy, the Late Run data
# should contain the missing months, albeit at a lower quality.
precip_october = df[df['time'].str.contains('-10-')]
precip_november = df[df['time'].str.contains('-11-')]

# Each missing 2021 month is given that month's average over the rows present in df.
imputed_rows = pd.DataFrame({
    'time': ['2021-10-01 00:00:00', '2021-11-01 00:00:00'],
    'precip': [precip_october['precip'].mean(), precip_november['precip'].mean()],
    'year': ['2021', '2021'],
})
df_full = pd.concat([df, imputed_rows], ignore_index=True)

In [None]:
# Collapse to one row per year, gathering that year's monthly values into a
# list in row order.
df_full = (
    df_full
    .groupby('year')['precip']
    .apply(list)
    .to_frame('precip')
    .reset_index()
)

# Spread each year's list across one labelled column per growing-season month.
months = ['April', 'May', 'June', 'July', 'August', 'September', 'October', 'November']
month_columns = [f'{month}_precip' for month in months]
split = pd.DataFrame(df_full['precip'].to_list(), columns=month_columns)
df_final = pd.concat([df_full, split], axis=1)

In [88]:
# The list-valued 'precip' column is redundant once the per-month columns exist.
df_final = df_final.drop(columns='precip')

In [89]:
# Preview the first rows of the wide table (one row per year, one column per month).
df_final.head()

Unnamed: 0,year,April_precip,May_precip,June_precip,July_precip,August_precip,September_precip,October_precip,November_precip
0,2013,163.784744,217.038086,112.705574,48.736252,39.590923,51.454273,81.460571,44.676483
1,2014,126.722443,72.005409,255.499573,69.04921,170.884445,99.224899,76.443123,21.251619
2,2015,78.606812,128.137024,166.513763,150.968811,126.89608,110.829582,48.176628,123.23291
3,2016,92.733719,102.606918,107.820251,138.8078,161.669571,158.565735,55.978916,46.940487
4,2017,100.84565,108.114075,96.086922,69.039795,99.764114,62.90258,138.481659,13.746578


In [93]:
# Spot-check the long-format rows for 2013 and 2014 against the wide table
# to confirm the month order matches the column labels.
df[df['year'].isin(['2013', '2014'])]

Unnamed: 0,time,precip,year
0,2013-04-01 00:00:00,163.784744,2013
1,2013-05-01 00:00:00,217.038086,2013
2,2013-06-01 00:00:00,112.705574,2013
3,2013-07-01 00:00:00,48.736252,2013
4,2013-08-01 00:00:00,39.590923,2013
5,2013-09-01 00:00:00,51.454273,2013
6,2013-10-01 00:00:00,81.460571,2013
7,2013-11-01 00:00:00,44.676483,2013
12,2014-04-01 00:00:00,126.722443,2014
13,2014-05-01 00:00:00,72.005409,2014


Alright, it seems that the months align properly with our column labeling.

Let's put it all together and create one final precipitation dataset.

In [94]:
def parse_df(data):
    """Reshape one state's monthly GPM precipitation into one row per year.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a 'time' column of strings shaped like
        'YYYY-MM-DD hh:mm:ss' and a precipitation column named either
        ' mean_GPM_3IMERGM_06_precipitation' (the raw header, leading space
        included) or 'precip'. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per year, sorted by year, with a string 'year' column followed
        by 'precip_april' ... 'precip_november'.

    Raises
    ------
    ValueError
        If the same (year, month) pair appears more than once.

    Notes
    -----
    Mean-value imputation: any growing-season month that has no value for a
    given year is filled with that month's mean over the years that do have
    one. This covers the missing 2021-10 and 2021-11 values without
    duplicating them when a file already contains those rows, and it keeps
    every value under its own month even when a year is incomplete.
    If results become messy, the Late Run data could be used instead; it
    should contain the missing months, albeit at a lower quality.
    """
    validMonths = ['04', '05', '06', '07', '08', '09', '10', '11']
    months = ['April', 'May', 'June', 'July', 'August', 'September', 'October', 'November']

    # The month is the two digits between the first pair of hyphens in 'time'.
    # A raw string is used so the regex contains no invalid Python escapes.
    month_pattern = r'-(\d{2})-'
    in_season = data['time'].str.extract(month_pattern, expand=False).isin(validMonths)

    # Chain non-mutating calls so neither the caller's frame nor a slice of it
    # is written to (avoids SettingWithCopyWarning).
    season = (
        data[in_season]
        .rename(columns={' mean_GPM_3IMERGM_06_precipitation': 'precip'})
        .assign(
            year=lambda frame: frame['time'].str[:4],
            month=lambda frame: frame['time'].str.extract(month_pattern, expand=False),
        )
    )

    # Pivot on the actual month code instead of relying on list position, then
    # force all eight month columns to exist so gaps show up as NaN.
    wide = (
        season
        .pivot(index='year', columns='month', values='precip')
        .reindex(columns=validMonths)
    )

    # Fill each gap with the mean of its month's column (NaNs are skipped by mean()).
    wide = wide.fillna(wide.mean())

    wide.columns = [f'precip_{month.lower()}' for month in months]
    return wide.reset_index()

In [95]:
# Sanity-check parse_df on the single-state frame currently bound to `df`.
parse_df(df)

  df = df.append(pd.DataFrame({


Unnamed: 0,year,precip_april,precip_may,precip_june,precip_july,precip_august,precip_september,precip_october,precip_november
0,2013,163.784744,217.038086,112.705574,48.736252,39.590923,51.454273,81.460571,44.676483
1,2014,126.722443,72.005409,255.499573,69.04921,170.884445,99.224899,76.443123,21.251619
2,2015,78.606812,128.137024,166.513763,150.968811,126.89608,110.829582,48.176628,123.23291
3,2016,92.733719,102.606918,107.820251,138.8078,161.669571,158.565735,55.978916,46.940487
4,2017,100.84565,108.114075,96.086922,69.039795,99.764114,62.90258,138.481659,13.746578
5,2018,29.928348,115.669472,203.443878,72.116142,147.996704,185.802032,118.554222,51.983082
6,2019,85.282715,202.432373,112.899689,93.766853,97.762611,172.890442,116.120567,48.997845
7,2020,44.466381,114.195923,130.814133,88.415787,37.848484,106.867851,57.914566,60.707256
8,2021,44.71244,103.651543,96.828659,94.236015,131.151352,49.886787,86.641282,51.442033


Everything seems to be in order. The summer months are expected to have the most precipitation since most evaporation
occurs during that time.

Now, let's create the final precipitation dataset.

In [96]:
# Parse every state's GPM export into its wide, one-row-per-year form, keyed by state name.
stateToDf = {}
# 'iowa' was previously listed twice, which re-read and re-parsed the same file.
# NOTE(review): 'illinois' may have been the intended extra entry — confirm that
# data/gpm/illinois_gpm.csv exists before adding it.
states = ["iowa", "indiana", 'kansas', 'minnesota', 'missouri', 'nebraska', 'ohio', 'south dakota', 'wisconsin']

for state in states:
    # Use a loop-local name so the exploratory `df` from the earlier cells is
    # not overwritten with a raw, unfiltered frame.
    state_raw = pd.read_csv(f'data/gpm/{state}_gpm.csv', skiprows=8)
    stateToDf[state] = parse_df(state_raw)

  df = df.append(pd.DataFrame({
  df = df.append(pd.DataFrame({
  df = df.append(pd.DataFrame({
  df = df.append(pd.DataFrame({
  df = df.append(pd.DataFrame({
  df = df.append(pd.DataFrame({
  df = df.append(pd.DataFrame({
  df = df.append(pd.DataFrame({
  df = df.append(pd.DataFrame({
  df = df.append(pd.DataFrame({
