In [389]:
import pandas as pd


In [390]:
file_path = "/Users/emiliodulay/Documents/Programming/Math-42-Final-Project/GLHYD_data_metric.csv"
df = pd.read_csv(file_path)
df.shape

(1284, 7)

# Formatting

In [391]:
df.head(15)


Unnamed: 0,# Coordinated Monthly Mean Lakewide Average Water Levels,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6
0,# Period of record: 1918-2023,,,,,,
1,"# Units: meters, IGLD 1985",,,,,,
2,# Calculated using the coordinated gage networ...,,,,,,
3,"# Superior: Marquette and Point Iroquois, MI; ...",,,,,,
4,"# Michigan-Huron: Harbor Beach, Mackinaw City ...",,,,,,
5,"# St. Clair: St. Clair Shores, MI and Belle Ri...",,,,,,
6,"# Erie: Toledo and Cleveland, OH; Port Stanley...",,,,,,
7,"# Ontario: Oswego and Rochester, NY; Cobourg, ...",,,,,,
8,#,,,,,,
9,# Last modified March 2024 Contact: Deanna.C.F...,,,,,,


In [392]:
notes = []
for i in range(0,10):
    notes.append(df.iloc[i,0])
    
notes


['# Period of record: 1918-2023',
 '# Units: meters, IGLD 1985',
 '# Calculated using the coordinated gage network, consisting of:',
 '# Superior: Marquette and Point Iroquois, MI; Duluth, MN; Michipicoten and Thunder Bay, Ontario',
 '# Michigan-Huron: Harbor Beach, Mackinaw City and Ludington, MI; Milwaukee, WI; Thessalon and Tobermory, Ontario',
 '# St. Clair: St. Clair Shores, MI and Belle River, Ontario',
 '# Erie: Toledo and Cleveland, OH; Port Stanley and Port Colborne, Ontario',
 '# Ontario: Oswego and Rochester, NY; Cobourg, Port Weller, Toronto, and Kingston, Ontario',
 '#',
 '# Last modified March 2024 Contact: Deanna.C.Fielder@usace.army.mil']

In [393]:
df.columns = df.iloc[11,:] # Rename columns to month, year and lake
df = df.iloc[12:,:].reset_index()
df = df.drop('index', axis = 1)
df.columns.name = 'index' # rename index column to index

In [394]:
df['month'] = df['month'].map({
                'jan' : 1,
                'feb' : 2,
                'mar' : 3,
                'apr' : 4,
                'may' : 5,
                'jun' : 6,
                'jul' : 7,
                'aug' : 8,
                'sep' : 9,
                'oct' : 10,
                'nov' : 11,
                'dec' : 12
                })

df.head()

index,month,year,Superior,Michigan-Huron,St. Clair,Erie,Ontario
0,1,1918,183.25,176.71,174.59,173.9,74.74
1,2,1918,183.2,176.73,174.74,173.82,74.72
2,3,1918,183.17,176.8,174.74,174.01,74.92
3,4,1918,183.14,176.89,174.84,174.02,75.1
4,5,1918,183.22,176.99,175.0,173.98,75.09


In [395]:
# Makes Lake a column 
df_melt = pd.melt(frame = df, 
                  id_vars = ["month", "year"], 
                  value_vars = ["Superior", "Michigan-Huron", "St. Clair", "Erie", "Ontario"],
                  var_name = "Lake",
                  value_name = "Water Level"
                  )

In [396]:
df_melt.shape

(6360, 4)

In [397]:
df.tail(5)

index,month,year,Superior,Michigan-Huron,St. Clair,Erie,Ontario
1267,8,2023,183.67,176.7,175.52,174.59,75.06
1268,9,2023,183.63,176.64,175.44,174.55,74.88
1269,10,2023,183.55,176.56,175.28,174.39,74.66
1270,11,2023,183.46,176.51,175.17,174.29,74.49
1271,12,2023,183.37,176.44,175.13,174.26,74.49


# Data Cleaning

In [398]:
df_melt['year'] = pd.to_numeric(df_melt['year'], errors = "raise")
df_melt['Water Level'] = pd.to_numeric(df_melt['Water Level'], errors = "raise")
df_melt.dtypes

month            int64
year             int64
Lake            object
Water Level    float64
dtype: object

In [399]:
df_melt.isna().sum()

month          0
year           0
Lake           0
Water Level    0
dtype: int64

In [400]:
df_melt.isnull().sum()

month          0
year           0
Lake           0
Water Level    0
dtype: int64

# Exploratory Data Analysis

In [421]:
# Average Water Level for every Lake for the entire time period.
mean_water_level_total = df_melt.groupby("Lake")["Water Level"].mean() 

# Average Water Level for every Lake for every month for the entire time period.
mean_water_level_month = df_melt.groupby(["Lake", "month"]).mean().reset_index().drop(columns = "year")
lakes_mean = []


for i in df_melt["Lake"].unique():
    i = mean_water_level_month[mean_water_level_month["Lake"] == i].reset_index().drop(columns = "index")
    lakes_mean.append(i)

isinstance(lakes_mean[0], pd.DataFrame) # A list of DataFrames. Each entry contains the mean water level per month for each lake over the entire period.

mean_water_level_total


Lake
Erie              174.176792
Michigan-Huron    176.450362
Ontario            74.769214
St. Clair         175.043569
Superior          183.414756
Name: Water Level, dtype: float64

In [420]:
# Creates a new data frame called "Lakes Summary Stats" which has the summary stats for every lake for all time
summary_stats = ['mean', 'std', 'median', 'min', 'max']
lakes_summary_stats = pd.DataFrame()

for i in range(0, len(lakes_mean)):    
    temp = lakes_mean[i].agg({
            'Water Level' : summary_stats
        })
    name = df_melt["Lake"].unique()[i]
    lakes_summary_stats[name] = temp
    

lakes_summary_stats.index.name = "water level"
lakes_summary_stats['Superior']

water level
mean      183.414756
std         0.112345
median    183.434387
min       183.242547
max       183.545660
Name: Superior, dtype: float64

In [417]:
lakes_summary_stats_2 = df_melt.groupby(['Lake', 'month']).agg({'Water Level' : summary_stats}).reset_index()
lakes_summary_stats_2

Unnamed: 0_level_0,Lake,month,Water Level,Water Level,Water Level,Water Level,Water Level
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,mean,std,median,min,max
0,Erie,1,174.030189,0.351895,173.995,173.21,174.86
1,Erie,2,174.024906,0.367471,174.01,173.18,174.9
2,Erie,3,174.111321,0.383746,174.11,173.2,174.95
3,Erie,4,174.255283,0.363494,174.25,173.38,175.05
4,Erie,5,174.336792,0.347515,174.355,173.44,175.08
5,Erie,6,174.369434,0.343611,174.4,173.45,175.14
6,Erie,7,174.35434,0.339751,174.395,173.45,175.13
7,Erie,8,174.289528,0.328855,174.315,173.43,175.02
8,Erie,9,174.198962,0.323313,174.235,173.38,174.87
9,Erie,10,174.098868,0.323738,174.08,173.3,174.94


# Our Model