In [389]:
import pandas as pd


In [390]:
file_path = "/Users/emiliodulay/Documents/Programming/Math-42-Final-Project/GLHYD_data_metric.csv"
df = pd.read_csv(file_path)
df.shape

(1284, 7)

# Formatting

In [391]:
df.head(15)


Unnamed: 0,# Coordinated Monthly Mean Lakewide Average Water Levels,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6
0,# Period of record: 1918-2023,,,,,,
1,"# Units: meters, IGLD 1985",,,,,,
2,# Calculated using the coordinated gage networ...,,,,,,
3,"# Superior: Marquette and Point Iroquois, MI; ...",,,,,,
4,"# Michigan-Huron: Harbor Beach, Mackinaw City ...",,,,,,
5,"# St. Clair: St. Clair Shores, MI and Belle Ri...",,,,,,
6,"# Erie: Toledo and Cleveland, OH; Port Stanley...",,,,,,
7,"# Ontario: Oswego and Rochester, NY; Cobourg, ...",,,,,,
8,#,,,,,,
9,# Last modified March 2024 Contact: Deanna.C.F...,,,,,,


In [392]:
notes = []
for i in range(0,10):
    notes.append(df.iloc[i,0])
    
notes


['# Period of record: 1918-2023',
 '# Units: meters, IGLD 1985',
 '# Calculated using the coordinated gage network, consisting of:',
 '# Superior: Marquette and Point Iroquois, MI; Duluth, MN; Michipicoten and Thunder Bay, Ontario',
 '# Michigan-Huron: Harbor Beach, Mackinaw City and Ludington, MI; Milwaukee, WI; Thessalon and Tobermory, Ontario',
 '# St. Clair: St. Clair Shores, MI and Belle River, Ontario',
 '# Erie: Toledo and Cleveland, OH; Port Stanley and Port Colborne, Ontario',
 '# Ontario: Oswego and Rochester, NY; Cobourg, Port Weller, Toronto, and Kingston, Ontario',
 '#',
 '# Last modified March 2024 Contact: Deanna.C.Fielder@usace.army.mil']

In [393]:
df.columns = df.iloc[11,:] # Rename columns to month, year and lake
df = df.iloc[12:,:].reset_index()
df = df.drop('index', axis = 1)
df.columns.name = 'index' # rename index column to index

In [394]:
df['month'] = df['month'].map({
                'jan' : 1,
                'feb' : 2,
                'mar' : 3,
                'apr' : 4,
                'may' : 5,
                'jun' : 6,
                'jul' : 7,
                'aug' : 8,
                'sep' : 9,
                'oct' : 10,
                'nov' : 11,
                'dec' : 12
                })

df.head()

index,month,year,Superior,Michigan-Huron,St. Clair,Erie,Ontario
0,1,1918,183.25,176.71,174.59,173.9,74.74
1,2,1918,183.2,176.73,174.74,173.82,74.72
2,3,1918,183.17,176.8,174.74,174.01,74.92
3,4,1918,183.14,176.89,174.84,174.02,75.1
4,5,1918,183.22,176.99,175.0,173.98,75.09


In [395]:
# Makes Lake a column 
df_melt = pd.melt(frame = df, 
                  id_vars = ["month", "year"], 
                  value_vars = ["Superior", "Michigan-Huron", "St. Clair", "Erie", "Ontario"],
                  var_name = "Lake",
                  value_name = "Water Level"
                  )

In [396]:
df_melt.shape

(6360, 4)

In [397]:
df.tail(5)

index,month,year,Superior,Michigan-Huron,St. Clair,Erie,Ontario
1267,8,2023,183.67,176.7,175.52,174.59,75.06
1268,9,2023,183.63,176.64,175.44,174.55,74.88
1269,10,2023,183.55,176.56,175.28,174.39,74.66
1270,11,2023,183.46,176.51,175.17,174.29,74.49
1271,12,2023,183.37,176.44,175.13,174.26,74.49


# Data Cleaning

In [398]:
df_melt['year'] = pd.to_numeric(df_melt['year'], errors = "raise")
df_melt['Water Level'] = pd.to_numeric(df_melt['Water Level'], errors = "raise")
df_melt.dtypes

month            int64
year             int64
Lake            object
Water Level    float64
dtype: object

In [399]:
df_melt.isna().sum()

month          0
year           0
Lake           0
Water Level    0
dtype: int64

In [400]:
df_melt.isnull().sum()

month          0
year           0
Lake           0
Water Level    0
dtype: int64

# Exploratory Data Analysis

In [401]:
# Average Water Level for every Lake for the entire time period.
mean_water_level_total = df_melt.groupby("Lake")["Water Level"].mean() 

# Average Water Level for every Lake for every month for the entire time period.
mean_water_level_month = df_melt.groupby(["Lake", "month"]).mean().reset_index().drop(columns = "year")
lakes_mean = []


for i in df_melt["Lake"].unique():
    i = mean_water_level_month[mean_water_level_month["Lake"] == i].reset_index().drop(columns = "index")
    lakes_mean.append(i)

isinstance(lakes_mean[0], pd.DataFrame) # A list of DataFrames. Each entry contains the mean water level per month for each lake over the entire period.

lakes_mean[0]


Unnamed: 0,Lake,month,Water Level
0,Superior,1,183.337642
1,Superior,2,183.276887
2,Superior,3,183.242547
3,Superior,4,183.271132
4,Superior,5,183.37283
5,Superior,6,183.456981
6,Superior,7,183.520094
7,Superior,8,183.544717
8,Superior,9,183.54566
9,Superior,10,183.52


In [None]:
# Creates a new data frame called "Lakes Summary Stats" which has the summary stats for every lake for all time
summary_stats = ['mean', 'std', 'median', 'min', 'max']
lakes_summary_stats = pd.DataFrame()

for i in range(0, len(lakes_mean)):    
    temp = lakes_mean[i].agg({
            'Water Level' : summary_stats
        })
    name = df_melt["Lake"].unique()[i]
    lakes_summary_stats[name] = temp
    

lakes_summary_stats.index.name = "water level"
lakes_summary_stats

Unnamed: 0_level_0,Superior,Michigan-Huron,St. Clair,Erie,Ontario
water level,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
mean,183.414756,176.450362,175.043569,174.176792,74.769214
std,0.112345,0.108566,0.138796,0.139496,0.200786
median,183.434387,176.437972,175.051415,174.155142,74.719623
min,183.242547,176.308585,174.821321,174.022547,74.534057
max,183.54566,176.603585,175.227547,174.369434,75.065


# Our Model