In [1]:
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme(style="darkgrid")
import urllib
import urllib.parse as urlp
import io
import warnings
warnings.filterwarnings("ignore")

%matplotlib inline

def get_time_series(start_date,end_date,latitude,longitude,variable):
    """
    Calls the data rods service to get a time series.

    Builds a GET request against the ``timeseries.cgi`` endpoint on
    hydro1.gesdisc.eosdis.nasa.gov for a single point and returns the raw
    ASCII (``type=asc2``) response body. The full URL is printed before the
    request is sent.

    Parameters
    ----------
    start_date, end_date : str
        Sent as ``startDate`` / ``endDate`` (the notebook passes values such
        as "2012-06-01T00").
    latitude, longitude : float
        Point location, sent as ``GEOM:POINT(longitude, latitude)``.
    variable : str
        Variable identifier, e.g. "GLDAS2:GLDAS_NOAH025_3H_v2.1:Rainf_f_tavg".

    Returns
    -------
    str
        Response text of the first attempt that answered with HTTP 200.

    Raises
    ------
    Exception
        If none of the (at most 5) attempts succeeded; the message describes
        the last failure.
    """
    base_url = "https://hydro1.gesdisc.eosdis.nasa.gov/daac-bin/access/timeseries.cgi"
    query_parameters = {
        "variable": variable,
        "type": "asc2",
        "location": f"GEOM:POINT({longitude}, {latitude})",
        "startDate": start_date,
        "endDate": end_date,
    }
    # urlp.quote only accepts str/bytes; coerce anything else (e.g. a date
    # object passed as start_date) instead of failing with a TypeError.
    query_string = "&".join(
        "{}={}".format(
            key,
            urlp.quote(value if isinstance(value, (str, bytes)) else str(value)),
        )
        for key, value in query_parameters.items()
    )
    full_url = base_url + "?" + query_string
    print(full_url)

    max_attempts = 5
    failure = f"No request was attempted for url {full_url}"
    for _ in range(max_attempts):
        try:
            # (connect, read) timeouts: without them a stalled server would
            # block the notebook forever.
            r = requests.get(full_url, timeout=(10, 300))
        except requests.RequestException as error:
            # Network-level failures count as a failed attempt and are retried.
            failure = f"Request to url {full_url} failed: {error}"
            continue
        if r.status_code == 200:
            return r.text
        failure = f"Error code {r.status_code} from url {full_url} : {r.text}"

    raise Exception(failure)

def parse_time_series(ts_str):
    """
    Parses the response from data rods.

    Parameters
    ----------
    ts_str : str
        Raw ASCII response text, as returned by ``get_time_series``.

    Returns
    -------
    tuple
        ``(parameters, df)``. ``parameters`` is a dict built from the
        ``key=value`` lines at positions 2 through 10 of the text. ``df`` is a
        DataFrame with the columns "time" (parsed as dates) and "data", read
        from the tab-separated rows that follow the header.

    Raises
    ------
    ValueError
        If one of the expected metadata lines contains no "=".
    """
    lines = ts_str.split("\n")
    parameters = {}
    for line in lines[2:11]:
        # Split on the first "=" only, so values that themselves contain "="
        # (e.g. a URL with a query string) are kept whole instead of raising.
        key,value = line.split("=", 1)
        parameters[key] = value

    # header=10 marks the header row of the table; its labels are replaced by
    # the explicit `names`.
    df = pd.read_table(io.StringIO(ts_str),sep="\t",
                       names=["time","data"],
                       header=10,parse_dates=["time"])
    return parameters, df



In [2]:

# Every request below targets the same point.
# (Original note next to the coordinates: 43.674901654747046, -79.53730583391066
#  -- presumably the unrounded location; confirm.)
_POINT = {"latitude": 43.67, "longitude": -79.54}


def _fetch_point_series(variable, start_date, end_date):
    """Download and parse one data rods series at `_POINT`; returns (parameters, df)."""
    return parse_time_series(
        get_time_series(
            start_date=start_date,
            end_date=end_date,
            variable=variable,
            **_POINT,
        )
    )


# 2012-2022 precipitation series; not used by the cells visible below.
df_ts = _fetch_point_series(
    "GLDAS2:GLDAS_NOAH025_3H_v2.1:Rainf_f_tavg", "2012-06-01T00", "2022-05-31T23"
)

# Series used for the model: rain rate, humidity and surface pressure.
df_ts1 = _fetch_point_series(
    "GLDAS2:GLDAS_NOAH025_3H_v2.1:Rainf_tavg", "2000-01-01T00", "2025-06-27T23"
)
df_ts2 = _fetch_point_series(
    "GLDAS2:GLDAS_NOAH025_3H_v2.1:Qair_f_inst", "2000-01-01T00", "2025-06-27T23"
)
df_ts3 = _fetch_point_series(
    "GLDAS2:GLDAS_NOAH025_3H_v2.1:Psurf_f_inst", "2000-01-01T00", "2025-06-27T23"
)

# Element [1] of each result is the DataFrame; give every value column a
# descriptive name and keep the extra timestamp columns distinguishable.
df1 = df_ts1[1].rename(columns={'data': 'rain average'})
df2 = df_ts2[1].rename(columns={'data': 'Humidity', 'time': 't2'})
df3 = df_ts3[1].rename(columns={'data': 'Pressure (Pa)', 'time': 't3'})

# Join on the timestamp rather than on row position: a positional concat would
# silently pair values from different times if one series had a gap.
# left_on/right_on with different names keeps 'time', 't2' and 't3' in the
# result; validate= raises if a timestamp is duplicated within a series.
combinedDF = (
    df1.merge(df2, left_on='time', right_on='t2', how='inner', validate='one_to_one')
       .merge(df3, left_on='time', right_on='t3', how='inner', validate='one_to_one')
)


https://hydro1.gesdisc.eosdis.nasa.gov/daac-bin/access/timeseries.cgi?variable=GLDAS2%3AGLDAS_NOAH025_3H_v2.1%3ARainf_f_tavg&type=asc2&location=GEOM%3APOINT%28-79.54%2C%2043.67%29&startDate=2012-06-01T00&endDate=2022-05-31T23
https://hydro1.gesdisc.eosdis.nasa.gov/daac-bin/access/timeseries.cgi?variable=GLDAS2%3AGLDAS_NOAH025_3H_v2.1%3ARainf_tavg&type=asc2&location=GEOM%3APOINT%28-79.54%2C%2043.67%29&startDate=2000-01-01T00&endDate=2025-06-27T23
https://hydro1.gesdisc.eosdis.nasa.gov/daac-bin/access/timeseries.cgi?variable=GLDAS2%3AGLDAS_NOAH025_3H_v2.1%3AQair_f_inst&type=asc2&location=GEOM%3APOINT%28-79.54%2C%2043.67%29&startDate=2000-01-01T00&endDate=2025-06-27T23
https://hydro1.gesdisc.eosdis.nasa.gov/daac-bin/access/timeseries.cgi?variable=GLDAS2%3AGLDAS_NOAH025_3H_v2.1%3APsurf_f_inst&type=asc2&location=GEOM%3APOINT%28-79.54%2C%2043.67%29&startDate=2000-01-01T00&endDate=2025-06-27T23


## Import data and convert it into a DataFrame

### Combine different Raw Data Sets

### Splitting the timestamp into Year, Month, Day and Hour columns

In [3]:
# Drop the extra timestamp columns 't2' and 't3'; 'time' is kept.
# Reassigning (instead of inplace=True) with errors='ignore' keeps this cell
# re-runnable: a second run no longer raises KeyError for the missing columns.
combinedDF = combinedDF.drop(columns=['t2', 't3'], errors='ignore')
combinedDF.head()

Unnamed: 0,time,rain average,Humidity,Pressure (Pa)
0,2000-01-01 03:00:00,3e-06,0.003743,100311.0
1,2000-01-01 06:00:00,1e-06,0.003228,100450.0
2,2000-01-01 09:00:00,0.0,0.00321,100373.0
3,2000-01-01 12:00:00,0.0,0.00331,100242.0
4,2000-01-01 15:00:00,2e-06,0.00369,100213.0


In [4]:
# Break the 'time' column into separate Year / Month / Day / Hour columns.
# The column is converted to datetimes once and the accessor is reused.
_time_parts = pd.to_datetime(combinedDF['time']).dt
combinedDF['Year'] = _time_parts.year
combinedDF['Month'] = _time_parts.month
combinedDF['Day'] = _time_parts.day
combinedDF['Hour'] = _time_parts.hour

In [5]:
# The raw timestamp is no longer needed once it has been split into
# Year/Month/Day/Hour. Reassigning with errors='ignore' keeps the cell
# re-runnable (a second run would otherwise raise KeyError).
combinedDF = combinedDF.drop(columns=['time'], errors='ignore')
combinedDF.head()

Unnamed: 0,rain average,Humidity,Pressure (Pa),Year,Month,Day,Hour
0,3e-06,0.003743,100311.0,2000,1,1,3
1,1e-06,0.003228,100450.0,2000,1,1,6
2,0.0,0.00321,100373.0,2000,1,1,9
3,0.0,0.00331,100242.0,2000,1,1,12
4,2e-06,0.00369,100213.0,2000,1,1,15


In [6]:
# Number of missing values in each column.
combinedDF.isna().sum()

rain average     0
Humidity         0
Pressure (Pa)    0
Year             0
Month            0
Day              0
Hour             0
dtype: int64

## Adding Latitude and Longitude

In [7]:
# Attach the location as two constant-valued columns.
# NOTE(review): these values (-79.59, 43.80) differ from the point used in the
# data requests (longitude -79.54, latitude 43.67) -- confirm which is intended.
for _column, _constant in (('Longitude', -79.59), ('Latitude', 43.80)):
    combinedDF[_column] = _constant
combinedDF.head()

Unnamed: 0,rain average,Humidity,Pressure (Pa),Year,Month,Day,Hour,Longitude,Latitude
0,3e-06,0.003743,100311.0,2000,1,1,3,-79.59,43.8
1,1e-06,0.003228,100450.0,2000,1,1,6,-79.59,43.8
2,0.0,0.00321,100373.0,2000,1,1,9,-79.59,43.8
3,0.0,0.00331,100242.0,2000,1,1,12,-79.59,43.8
4,2e-06,0.00369,100213.0,2000,1,1,15,-79.59,43.8


## Creating the Average Humidity/Pressure Data

In [8]:
# Mean humidity and surface pressure per calendar month (1-12); each month's
# rows are pooled across all years in the data.
_by_month = combinedDF.groupby('Month')
monthly_avg_humidity = _by_month['Humidity'].mean()
monthly_avg_air_pressure = _by_month['Pressure (Pa)'].mean()

print(monthly_avg_humidity)




Month
1     0.002530
2     0.002533
3     0.003324
4     0.004628
5     0.007107
6     0.009991
7     0.011687
8     0.011424
9     0.009347
10    0.006429
11    0.004385
12    0.003142
Name: Humidity, dtype: float64


In [9]:
import pickle

# Persist both monthly-average Series as pickle files in the working directory.
# The `with` block closes each file, so no explicit close() is needed.
for _path, _series in (
    ('avg_humidity.pkl', monthly_avg_humidity),
    ('avg_air_pressure.pkl', monthly_avg_air_pressure),
):
    with open(_path, 'wb') as f:
        pickle.dump(_series, f)

### Create Linear Regression Model

In [10]:
# Features: every column except the target. Target: the rain column as a NumPy array.
_target_column = 'rain average'
X = combinedDF.drop(columns=[_target_column])
y = combinedDF[_target_column].to_numpy()

In [11]:
from sklearn.model_selection import train_test_split

# Hold out 30 % of the rows for testing; the fixed random_state makes the
# split reproducible.
_split = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=89,
)
X_train, X_test, y_train, y_test = _split

In [12]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares fit on the training split; fit() returns the
# estimator itself, so construction and fitting can be chained.
linear_model = LinearRegression().fit(X_train, y_train)
linear_model

In [13]:
# Predictions for the held-out rows.
predictions = linear_model.predict(X_test)

# R^2 on the test split, scored against the true targets. Scoring against
# `predictions` would compare the model with itself and always give 1.0.
print(linear_model.score(X_test, y_test))

0.028543713404361792


In [14]:
# Fitted coefficients, one per feature column of X (in column order).
# These are raw slopes, not significance measures: their magnitude depends on
# each feature's units.
_coefficients = linear_model.coef_
print(_coefficients)

[ 2.32375515e-03 -1.23581611e-08  3.56577790e-07  6.16966608e-07
  1.04949709e-07 -1.03223854e-07  1.38184847e-34  6.90924236e-35]


In [15]:
# Build a one-row input for a single prediction: 15 June 2023 at hour 12,
# using the monthly averages for humidity and pressure. Keys are listed in the
# same order as the training columns.
_sample_month = 6

sample = {
    'Humidity': [monthly_avg_humidity.loc[_sample_month]],
    'Pressure (Pa)': [monthly_avg_air_pressure.loc[_sample_month]],
    'Year': [2023],
    'Month': [_sample_month],
    'Day': [15],
    'Hour': [12],
    'Longitude': [-79.59],
    'Latitude': [43.80],
}

sample_df = pd.DataFrame.from_dict(sample)

linear_model.predict(sample_df)


array([4.23871793e-05])

In [16]:
# Count the rows whose rain value exceeds the heavy-precipitation threshold.
threshold = 3.96e-05  # cut-off for "heavy" precipitation, same units as 'rain average'
_is_heavy = combinedDF['rain average'].gt(threshold)
heavy_precip_events = combinedDF.loc[_is_heavy]
print(f"Number of heavy precipitation events: {len(heavy_precip_events)}")
print("number of total events:", len(combinedDF))

Number of heavy precipitation events: 11259
number of total events: 74479


In [18]:
# Serialize the fitted model to 'rain_model.pkl' in the working directory.
# The `with` block closes the file on exit.
import pickle

with open('rain_model.pkl', 'wb') as f:
    pickle.dump(linear_model, f)
    

In [37]:
# Reload the pickled model and repeat the sample prediction as a round-trip check.
# The context manager closes the file even if unpickling raises.
# NOTE: pickle.load can execute arbitrary code -- only load files written by
# this notebook or another trusted source.
with open('rain_model.pkl', 'rb') as file:
    linear_model = pickle.load(file)

linear_model.predict(sample_df)

array([4.23871793e-05])