In [121]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

## Import Data, Convert into dataframe

In [122]:
# Read the three raw series (one CSV per variable) into DataFrames.
air_pressure, humidity, precipitation = map(
    pd.read_csv,
    (
        "43_-80.AirPressure.csv",
        "43_-80.Humidity.csv",
        "43_-80.Precipitation.csv",
    ),
)


In [123]:
# pd.read_csv already returns DataFrames, so re-wrapping each one here does
# not change its contents; the step is kept only to mirror the original flow.
humidity, precipitation, air_pressure = (
    pd.DataFrame(frame) for frame in (humidity, precipitation, air_pressure)
)


In [124]:
# Preview the first rows to confirm which columns were read from the CSV.
humidity.head()

Unnamed: 0,Date2,Humidity
0,2000-01-01T08:00:00Z,0.003553
1,2000-01-01T11:00:00Z,0.002988
2,2000-01-01T14:00:00Z,0.00297
3,2000-01-01T17:00:00Z,0.00306
4,2000-01-01T20:00:00Z,0.00354


### Combine different Raw Data Sets

In [125]:
# Join the three series on their timestamp columns rather than on row position.
# A positional concat(axis=1) silently pairs values from different times as
# soon as one file has a missing, duplicated or reordered reading; a keyed
# inner merge keeps only timestamps present in all three files.
# left_on/right_on keeps every original date column ('Date2', 'Date', 'Date1')
# so the resulting column set and order match the positional concat.
# validate='one_to_one' raises if any file contains duplicate timestamps.
combinedDF = (
    humidity
    .merge(precipitation, how='inner', left_on='Date2', right_on='Date',
           validate='one_to_one')
    .merge(air_pressure, how='inner', left_on='Date', right_on='Date1',
           validate='one_to_one')
)
combinedDF.head()

Unnamed: 0,Date2,Humidity,Date,Precipitation Rate,Date1,Pressure (Pa)
0,2000-01-01T08:00:00Z,0.003553,2000-01-01T08:00:00Z,4e-07,2000-01-01T08:00:00Z,98992.2
1,2000-01-01T11:00:00Z,0.002988,2000-01-01T11:00:00Z,0.0,2000-01-01T11:00:00Z,99125.5
2,2000-01-01T14:00:00Z,0.00297,2000-01-01T14:00:00Z,0.0,2000-01-01T14:00:00Z,99039.4
3,2000-01-01T17:00:00Z,0.00306,2000-01-01T17:00:00Z,0.0,2000-01-01T17:00:00Z,98901.5
4,2000-01-01T20:00:00Z,0.00354,2000-01-01T20:00:00Z,0.0,2000-01-01T20:00:00Z,98886.6


### Splitting Date into Year, Month, Day, and Hour Columns

In [126]:
# Each source file brought its own timestamp column; keep 'Date' and discard
# the other two ('Date1' and 'Date2').
combinedDF = combinedDF.drop(columns=['Date1', 'Date2'])
combinedDF.head()

Unnamed: 0,Humidity,Date,Precipitation Rate,Pressure (Pa)
0,0.003553,2000-01-01T08:00:00Z,4e-07,98992.2
1,0.002988,2000-01-01T11:00:00Z,0.0,99125.5
2,0.00297,2000-01-01T14:00:00Z,0.0,99039.4
3,0.00306,2000-01-01T17:00:00Z,0.0,98901.5
4,0.00354,2000-01-01T20:00:00Z,0.0,98886.6


In [127]:
# Split the timestamp into calendar parts usable as numeric features.
# Parse the 'Date' strings once and reuse the result; converting the whole
# column separately for each part repeats the same work four times.
timestamps = pd.to_datetime(combinedDF['Date'])
combinedDF['Year'] = timestamps.dt.year
combinedDF['Month'] = timestamps.dt.month
combinedDF['Day'] = timestamps.dt.day
# The sample rows carry a trailing 'Z', so Hour is presumably in UTC -- confirm.
combinedDF['Hour'] = timestamps.dt.hour

In [128]:
# The raw timestamp string is not used as a feature; remove it now that
# Year/Month/Day/Hour are separate columns.
combinedDF = combinedDF.drop(columns=['Date'])
combinedDF.head()

Unnamed: 0,Humidity,Precipitation Rate,Pressure (Pa),Year,Month,Day,Hour
0,0.003553,4e-07,98992.2,2000,1,1,8
1,0.002988,0.0,99125.5,2000,1,1,11
2,0.00297,0.0,99039.4,2000,1,1,14
3,0.00306,0.0,98901.5,2000,1,1,17
4,0.00354,0.0,98886.6,2000,1,1,20


In [129]:
# Count missing values per column before modelling.
combinedDF.isna().sum()

Humidity              0
Precipitation Rate    0
Pressure (Pa)         0
Year                  0
Month                 0
Day                   0
Hour                  0
dtype: int64

## Adding Latitude and Longitude

In [130]:
# Attach the site coordinates as constant columns (same value on every row).
combinedDF = combinedDF.assign(Longitude=-79.59, Latitude=43.80)
combinedDF.head()

Unnamed: 0,Humidity,Precipitation Rate,Pressure (Pa),Year,Month,Day,Hour,Longitude,Latitude
0,0.003553,4e-07,98992.2,2000,1,1,8,-79.59,43.8
1,0.002988,0.0,99125.5,2000,1,1,11,-79.59,43.8
2,0.00297,0.0,99039.4,2000,1,1,14,-79.59,43.8
3,0.00306,0.0,98901.5,2000,1,1,17,-79.59,43.8
4,0.00354,0.0,98886.6,2000,1,1,20,-79.59,43.8


## Computing Monthly Average Humidity and Air Pressure

In [131]:
# Average humidity and air pressure per calendar month (1-12), pooled over
# all years in the data.
by_month = combinedDF.groupby('Month')
monthly_avg_humidity = by_month['Humidity'].mean()
monthly_avg_air_pressure = by_month['Pressure (Pa)'].mean()

print(monthly_avg_humidity)




Month
1     0.002437
2     0.002386
3     0.003198
4     0.004535
5     0.007079
6     0.009965
7     0.011614
8     0.011230
9     0.009147
10    0.006277
11    0.004308
12    0.002992
Name: Humidity, dtype: float64


### Create Linear Regression Model

In [132]:
# Target: precipitation rate as a NumPy array.
y = combinedDF['Precipitation Rate'].to_numpy()
# Features: every remaining column of the combined frame.
X = combinedDF.drop(columns=['Precipitation Rate'])

In [133]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed random_state makes the
# shuffle (and therefore the reported score) reproducible.
# NOTE(review): rows are time-ordered readings, so a random split mixes past and
# future observations -- confirm this is acceptable for the intended use.
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size =0.3 , random_state = 89)

In [134]:
from sklearn.linear_model import LinearRegression

# Fit an ordinary least-squares regression of precipitation rate on all
# feature columns, using default settings (no feature scaling is applied).
linear_model = LinearRegression() 
linear_model.fit(X_train, y_train)

In [135]:
# Predict precipitation rate for the held-out rows.
predictions = linear_model.predict(X_test)

# Report R^2 on the held-out set. Always score against the true targets
# (y_test): scoring against `predictions` compares the model with its own
# output, which is 1.0 by construction and says nothing about fit quality.
print(linear_model.score(X_test, y_test))

0.02683084201274566


In [136]:
# Print the fitted coefficients, one per feature column of X in training order.
# These are raw slopes on unscaled features, so their magnitudes are not
# comparable with each other and are not a measure of statistical significance.
print(linear_model.coef_)

[ 2.32319849e-03 -1.25499142e-08  2.89688824e-07  7.25084970e-07
  9.47272805e-08 -2.37014646e-07 -1.89599769e-35 -9.47998847e-36]


In [138]:
# Build a single-row feature frame for a hypothetical reading (15 June 2023,
# 12:00) using the June averages for humidity and pressure, then predict its
# precipitation rate. Column names and order mirror the training features.
sample = {
    'Humidity': [monthly_avg_humidity.loc[6]],
    'Pressure (Pa)': [monthly_avg_air_pressure.loc[6]],
    'Year': [2023],
    'Month': [6],
    'Day': [15],
    'Hour': [12],
    'Longitude': [-79.59],
    'Latitude': [43.80],
}
sample_df = pd.DataFrame.from_dict(sample)

linear_model.predict(sample_df)


array([3.96472653e-05])

In [140]:
# Count how many observations exceed the chosen precipitation-rate cutoff
# and compare that with the total number of observations.
threshold = 3.96e-05  # cutoff used here to label an observation as "heavy"
is_heavy = combinedDF['Precipitation Rate'].gt(threshold)
heavy_precip_events = combinedDF.loc[is_heavy]
print(f"Number of heavy precipitation events: {len(heavy_precip_events)}")
print("number of total events:", len(combinedDF))

Number of heavy precipitation events: 10733
number of total events: 74486


In [141]:
# Persist the fitted model to disk so it can be reloaded without retraining.
import pickle

# The with-block closes the file on exit (even if dump raises), so no
# explicit close() call is needed.
with open('rain_model.pkl', 'wb') as f:
    pickle.dump(linear_model, f)
    

In [None]:
# Reload the pickled model and check that it still predicts for the sample row.
# NOTE: pickle.load can execute arbitrary code while unpickling; only load
# files that this notebook (or another trusted source) produced.
with open('rain_model.pkl', 'rb') as file:
    linear_model = pickle.load(file)

# Predict with the reloaded estimator. No object named `data` is defined
# anywhere in this notebook, so calling `data.predict` would raise NameError.
linear_model.predict(sample_df)