In [1]:
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme(style="darkgrid")
import urllib
import urllib.parse as urlp
import io
import warnings
warnings.filterwarnings("ignore")

%matplotlib inline

In [2]:
def get_time_series(start_date,end_date,latitude,longitude,variable):
    """
    Calls the data rods service to get a time series
    """
    base_url = "https://hydro1.gesdisc.eosdis.nasa.gov/daac-bin/access/timeseries.cgi"
    query_parameters = {
        "variable": variable,
        "type": "asc2",
        "location": f"GEOM:POINT({longitude}, {latitude})",
        "startDate": start_date,
        "endDate": end_date,
    }
    
    full_url = base_url+"?"+ \
         "&".join(["{}={}".format(key,urlp.quote(query_parameters[key])) for key in query_parameters])
    print(full_url)
    iteration = 0
    done = False
    while not done and iteration < 5:
        r=requests.get(full_url)
        if r.status_code == 200:
            done = True
        else:
            iteration +=1
    
    if not done:
        raise Exception(f"Error code {r.status_code} from url {full_url} : {r.text}")
    
    return r.text

In [3]:
def parse_time_series(ts_str):
    """
    Parses the response from data rods.
    """
    lines = ts_str.split("\n")
    parameters = {}
    for line in lines[2:11]:
        key,value = line.split("=")
        parameters[key] = value
    
    
    df = pd.read_table(io.StringIO(ts_str),sep="\t",
                       names=["time","data"],
                       header=10,parse_dates=["time"])
    return parameters, df


In [4]:
lat = 43.67
long = -79.54

# Wind Speed
var1 = "GLDAS2:GLDAS_NOAH025_3H_v2.1:Wind_f_inst"
# Pressure
var2 = "GLDAS2:GLDAS_NOAH025_3H_v2.1:Psurf_f_inst"
# Temperature
var3 = "GLDAS2:GLDAS_NOAH025_3H_v2.1:Qair_f_inst"

In [5]:
df_ts1 = parse_time_series(
        get_time_series(
            start_date="2000-01-01T00", 
            end_date="2025-06-27T23",
            latitude=lat,
            longitude=long,
            variable=var1
        )
    )

df_ts2 = parse_time_series(
        get_time_series(
            start_date="2000-01-01T00", 
            end_date="2025-06-27T23",
            latitude=lat,
            longitude=long,
            variable=var2
        )
    )

df_ts3 = parse_time_series(
        get_time_series(
            start_date="2000-01-01T00", 
            end_date="2025-06-27T23",
            latitude=lat,
            longitude=long,
            variable=var3
        )
    )

df1 = df_ts1[1].rename({'data': 'Surface Air Temp (K)'},axis='columns')
df2 = df_ts2[1].rename({'data': 'Surface Wind Speed (m/s)','time':'t2'},axis='columns')
df3 = df_ts3[1].rename({'data': 'Humidity (g/kg)','time':'t3'},axis='columns')

combinedDF = pd.concat([df1,df2,df3], axis=1, join="inner")
combinedDF.head()

https://hydro1.gesdisc.eosdis.nasa.gov/daac-bin/access/timeseries.cgi?variable=GLDAS2%3AGLDAS_NOAH025_3H_v2.1%3AWind_f_inst&type=asc2&location=GEOM%3APOINT%28-79.54%2C%2043.67%29&startDate=2000-01-01T00&endDate=2025-06-27T23
https://hydro1.gesdisc.eosdis.nasa.gov/daac-bin/access/timeseries.cgi?variable=GLDAS2%3AGLDAS_NOAH025_3H_v2.1%3APsurf_f_inst&type=asc2&location=GEOM%3APOINT%28-79.54%2C%2043.67%29&startDate=2000-01-01T00&endDate=2025-06-27T23
https://hydro1.gesdisc.eosdis.nasa.gov/daac-bin/access/timeseries.cgi?variable=GLDAS2%3AGLDAS_NOAH025_3H_v2.1%3AQair_f_inst&type=asc2&location=GEOM%3APOINT%28-79.54%2C%2043.67%29&startDate=2000-01-01T00&endDate=2025-06-27T23


Unnamed: 0,time,Surface Air Temp (K),t2,Surface Wind Speed (m/s),t3,Humidity (g/kg)
0,2000-01-01 03:00:00,5.30299,2000-01-01 03:00:00,100311.0,2000-01-01 03:00:00,0.003743
1,2000-01-01 06:00:00,4.00595,2000-01-01 06:00:00,100450.0,2000-01-01 06:00:00,0.003228
2,2000-01-01 09:00:00,5.6,2000-01-01 09:00:00,100373.0,2000-01-01 09:00:00,0.00321
3,2000-01-01 12:00:00,6.20713,2000-01-01 12:00:00,100242.0,2000-01-01 12:00:00,0.00331
4,2000-01-01 15:00:00,8.40192,2000-01-01 15:00:00,100213.0,2000-01-01 15:00:00,0.00369


In [6]:
combinedDF.drop(columns=['t2', 't3'], inplace=True)
combinedDF.head()

Unnamed: 0,time,Surface Air Temp (K),Surface Wind Speed (m/s),Humidity (g/kg)
0,2000-01-01 03:00:00,5.30299,100311.0,0.003743
1,2000-01-01 06:00:00,4.00595,100450.0,0.003228
2,2000-01-01 09:00:00,5.6,100373.0,0.00321
3,2000-01-01 12:00:00,6.20713,100242.0,0.00331
4,2000-01-01 15:00:00,8.40192,100213.0,0.00369


In [7]:
#split the date column into year, month, day, hour
combinedDF['Year'] = pd.to_datetime(combinedDF['time']).dt.year
combinedDF['Month'] = pd.to_datetime(combinedDF['time']).dt.month
combinedDF['Day'] = pd.to_datetime(combinedDF['time']).dt.day
combinedDF['Hour'] = pd.to_datetime(combinedDF['time']).dt.hour

combinedDF.drop(columns=['time'], inplace=True)
combinedDF.head()

Unnamed: 0,Surface Air Temp (K),Surface Wind Speed (m/s),Humidity (g/kg),Year,Month,Day,Hour
0,5.30299,100311.0,0.003743,2000,1,1,3
1,4.00595,100450.0,0.003228,2000,1,1,6
2,5.6,100373.0,0.00321,2000,1,1,9
3,6.20713,100242.0,0.00331,2000,1,1,12
4,8.40192,100213.0,0.00369,2000,1,1,15


In [8]:
#add a column of constant values 
combinedDF['Longitude'] = long
combinedDF['Latitude'] = lat
combinedDF.head()

Unnamed: 0,Surface Air Temp (K),Surface Wind Speed (m/s),Humidity (g/kg),Year,Month,Day,Hour,Longitude,Latitude
0,5.30299,100311.0,0.003743,2000,1,1,3,-79.54,43.67
1,4.00595,100450.0,0.003228,2000,1,1,6,-79.54,43.67
2,5.6,100373.0,0.00321,2000,1,1,9,-79.54,43.67
3,6.20713,100242.0,0.00331,2000,1,1,12,-79.54,43.67
4,8.40192,100213.0,0.00369,2000,1,1,15,-79.54,43.67


In [9]:
combinedDF.isnull().sum()

Surface Air Temp (K)        0
Surface Wind Speed (m/s)    0
Humidity (g/kg)             0
Year                        0
Month                       0
Day                         0
Hour                        0
Longitude                   0
Latitude                    0
dtype: int64

## Linear Regression

In [10]:
X = combinedDF.drop('Surface Wind Speed (m/s)', axis=1)  
y = combinedDF['Surface Wind Speed (m/s)'].values

In [11]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size =0.3 , random_state = 89)

In [12]:
from sklearn.linear_model import LinearRegression

linear_model = LinearRegression() 
linear_model.fit(X_train, y_train)

In [13]:
predictions = linear_model.predict(X_test)

linear_model.score(X_test, predictions)

print(linear_model.score(X_test, y_test))

0.21684239618466328


In [14]:
monthly_avg_temp = combinedDF.groupby('Month')['Surface Air Temp (K)'].mean()
monthly_avg_humidity = combinedDF.groupby('Month')['Humidity (g/kg)'].mean()

print(monthly_avg_humidity)

Month
1     0.002530
2     0.002533
3     0.003324
4     0.004628
5     0.007107
6     0.009991
7     0.011687
8     0.011424
9     0.009347
10    0.006429
11    0.004385
12    0.003142
Name: Humidity (g/kg), dtype: float64


In [15]:
import pickle 
with open('monthly_avg_humidity.pkl', 'wb') as f:
    pickle.dump(monthly_avg_humidity, f)
    f.close()
with open('monthly_avg_wind_speed.pkl', 'wb') as f:
    pickle.dump(monthly_avg_temp, f)
    f.close()

In [16]:
# create a sample prediction for our matrix

sample = {'Surface Air Temp (K)': [monthly_avg_temp[6]],
          'Humidity (g/kg)': [monthly_avg_humidity[6]],
          'Year': [2023],
          'Month': [6],
          'Day': [15],
          'Hour': [12],
          'Longitude': [-79.59],
          'Latitude': [43.80],
         }

sample_df = pd.DataFrame(sample)

linear_model.predict(sample_df)

array([99785.91155499])

In [18]:
# print significance of feature of linear model
print(linear_model.coef_)

[-1.38282390e+02 -7.10916410e+04  8.99229410e+00  1.96826825e+01
 -6.20111282e-01  4.83694752e+00]


In [52]:
combinedDF['Surface Wind Speed (m/s)'].describe()

count     74479.000000
mean      99888.481295
std         783.682160
min       96141.400000
25%       99404.250000
50%       99900.100000
75%      100397.000000
max      102932.000000
Name: Surface Wind Speed (m/s), dtype: float64

In [54]:
threshold = 100000  
heavy_temp_events = combinedDF[combinedDF['Surface Wind Speed (m/s)'] > threshold]
print(f"Number of High Winds: {len(heavy_temp_events)}")
print("number of total events:", len(combinedDF))

Number of High Winds: 33169
number of total events: 74479


In [17]:
# pickle the model 
import pickle
with open('wind_model.pkl', 'wb') as f:
    pickle.dump(linear_model, f)
    f.close()