In [None]:
import pandas as pd
import hvplot.pandas
from pathlib import Path
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

In [None]:
# Load an entire table from the local Postgres database into the DataFrame `df`.
# psycopg2 is imported in this cell because no other cell in the notebook uses it.
import psycopg2

# Connect to the database
conn_string = "host='localhost' dbname='glacier_data' user='postgres' password='REDACTED'"
# password has been removed for security reasons
# NOTE(review): prefer reading the password from an environment variable or
# getpass instead of embedding it in the notebook.
query = 'select * from yourdatabase'  # NOTE(review): looks like a placeholder table name - confirm

conn = psycopg2.connect(conn_string)
print("Database opened successfully")
try:
    cursor = conn.cursor()
    try:
        cursor.execute(query)
        # cursor.description holds one entry per result column; item 0 is its name
        colnames = [desc[0] for desc in cursor.description]
        df = pd.DataFrame(cursor.fetchall(), columns=colnames)
    finally:
        # Release the cursor even when the query fails
        cursor.close()
finally:
    # Close the connection as well as the cursor so no session is left open
    conn.close()
print('Database closed successfully!')

In [None]:
# Read the prepared CSV (path is relative to the notebook) and preview the first rows
csv_path = Path("../Data/SecUMLprep.csv")
raw_df = pd.read_csv(csv_path)
raw_df.head()

In [None]:
# Remove every row that contains a missing value
raw_df = raw_df.dropna()

# Columns handed to the scaler
raw_scale_columns = [
    'Lat',
    'Lon',
    'Monthly Average Mean Temperature (degF)',
    'Monthly Total Precipitation (in)',
    'Monthly Total Snowfall (in)',
    'Area (mi^2)_1986',
    'Area (mi^2)_2020',
    'Area Shrinkage (mi^2)',
    'Height (ft)',
    'Length (ft)',
]

# Standardise the selected columns with StandardScaler.fit_transform
raw_scaler = StandardScaler()
raw_df_scaled = raw_scaler.fit_transform(raw_df[raw_scale_columns])

# Display the first three rows of the scaled data
raw_df_scaled[:3]

In [None]:
# Fit KMeans for k = 1..10 on the scaled data and record each model's inertia
# so the values can be drawn as an elbow curve
k = [*range(1, 11)]
inertia = []
for n_clusters in k:
    # fit() returns the fitted estimator, so construction and fitting can be chained
    k_model = KMeans(n_clusters=n_clusters, random_state=1).fit(raw_df_scaled)
    inertia.append(k_model.inertia_)

# One row per k value, alongside its inertia
elbow_data = dict(k=k, inertia=inertia)
elbow_df = pd.DataFrame(elbow_data)
elbow_df.head()

In [None]:
# Draw inertia against k as a line chart, with one x tick per tested k value
elbow_plot_options = dict(x="k", y="inertia", title="Elbow Curve", xticks=k)
elbow_df.hvplot.line(**elbow_plot_options)

In [None]:
# Cluster the rows into 4 groups.
# NOTE(review): k=4 is presumably read off the elbow curve - confirm.
# random_state=1 keeps the clustering reproducible between runs.
model = KMeans(n_clusters=4, random_state=1)
# Fit and label on the standardised features rather than the unscaled frame.
# KMeans is distance based, so unscaled columns with large magnitudes would
# otherwise dominate the clustering. The elbow curve was also computed on
# raw_df_scaled, so the chosen k only applies to that data.
predict = model.fit_predict(raw_df_scaled)
# raw_df_scaled was built from raw_df row for row, so the labels line up
# with a copy of the original (unscaled) frame, which is kept for plotting
predictions_df = raw_df.copy()
# Add the cluster label of each row as the 'predict' column
predictions_df['predict'] = predict
predictions_df.head()

In [None]:
# Cluster scatter - area shrinkage (x) against mean temperature (y),
# grouped by the 'predict' column; y tick labels use the "%.0f" format
temp_scatter = predictions_df.hvplot.scatter(
    x="Area Shrinkage (mi^2)",
    y="Monthly Average Mean Temperature (degF)",
    by="predict",
)
temp_scatter.opts(yformatter="%.0f")

In [None]:
# Cluster scatter - area shrinkage (x) against total precipitation (y),
# grouped by the 'predict' column; y tick labels use the "%.0f" format
precip_scatter = predictions_df.hvplot.scatter(
    x="Area Shrinkage (mi^2)",
    y="Monthly Total Precipitation (in)",
    by="predict",
)
precip_scatter.opts(yformatter="%.0f")

In [None]:
# Cluster scatter - area shrinkage (x) against total snowfall (y),
# grouped by the 'predict' column; y tick labels use the "%.0f" format
snow_scatter = predictions_df.hvplot.scatter(
    x="Area Shrinkage (mi^2)",
    y="Monthly Total Snowfall (in)",
    by="predict",
)
snow_scatter.opts(yformatter="%.0f")

In [None]:
# Length appears to disproportionately affect the clustering above.
# Optimisation attempt: compare only the weather and area columns.
area_weather_columns = [
    'Monthly Average Mean Temperature (degF)',
    'Monthly Total Precipitation (in)',
    'Monthly Total Snowfall (in)',
    'Area (mi^2)_1986',
    'Area (mi^2)_2020',
    'Area Shrinkage (mi^2)',
]
areaonly = raw_df[area_weather_columns]
areaonly.head()

In [None]:
# Columns handed to the scaler (weather and area measurements)
areaonly_scale_columns = [
    'Monthly Average Mean Temperature (degF)',
    'Monthly Total Precipitation (in)',
    'Monthly Total Snowfall (in)',
    'Area (mi^2)_1986',
    'Area (mi^2)_2020',
    'Area Shrinkage (mi^2)',
]

# Standardise the selected columns with StandardScaler.fit_transform
areaonly_scaler = StandardScaler()
areaonly_scaled = areaonly_scaler.fit_transform(areaonly[areaonly_scale_columns])

# Display the first three rows of the scaled data
areaonly_scaled[:3]

In [None]:
# Compute the KMeans inertia for k = 1..10 on the scaled area/weather data,
# for use in the second elbow graph
inertia = []
k = list(range(1, 11))
for i in k:
    k_model2 = KMeans(n_clusters=i, random_state=1)
    k_model2.fit(areaonly_scaled)
    # Record the inertia of the model fitted in THIS iteration (k_model2);
    # reading the first experiment's k_model would repeat one stale value
    inertia.append(k_model2.inertia_)
elbow_data2 = {"k": k, "inertia": inertia}
# Build the frame from this experiment's data (elbow_data2), not the
# first experiment's elbow_data
elbow_df2 = pd.DataFrame(elbow_data2)
elbow_df2.head()

In [None]:
# Draw the second elbow curve: inertia against k, with one x tick per tested k value
elbow2_plot_options = dict(x="k", y="inertia", title="Elbow Curve", xticks=k)
elbow_df2.hvplot.line(**elbow2_plot_options)

In [None]:
# Cluster the area/weather rows into 4 groups.
# NOTE(review): k=4 is presumably read off the second elbow curve - confirm.
# random_state=1 keeps the clustering reproducible between runs.
model = KMeans(n_clusters=4, random_state=1)
# Fit and label on the standardised features rather than the unscaled frame.
# KMeans is distance based, so unscaled columns with large magnitudes would
# otherwise dominate the clustering.
predict = model.fit_predict(areaonly_scaled)
# areaonly_scaled was built from areaonly row for row, so the labels line up
# with a copy of the unscaled frame, which is kept for plotting
predictions_df2 = areaonly.copy()
# Add the cluster label of each row as the 'predict' column
predictions_df2['predict'] = predict
predictions_df2.head()

In [None]:
# Area/weather clustering - area shrinkage (x) against mean temperature (y),
# grouped by the 'predict' column; y tick labels use the "%.0f" format
temp_scatter2 = predictions_df2.hvplot.scatter(
    x="Area Shrinkage (mi^2)",
    y="Monthly Average Mean Temperature (degF)",
    by="predict",
)
temp_scatter2.opts(yformatter="%.0f")

In [None]:
# Area/weather clustering - area shrinkage (x) against total precipitation (y),
# grouped by the 'predict' column; y tick labels use the "%.0f" format
precip_scatter2 = predictions_df2.hvplot.scatter(
    x="Area Shrinkage (mi^2)",
    y="Monthly Total Precipitation (in)",
    by="predict",
)
precip_scatter2.opts(yformatter="%.0f")

In [None]:
# Area/weather clustering - area shrinkage (x) against total snowfall (y),
# grouped by the 'predict' column; y tick labels use the "%.0f" format
snow_scatter2 = predictions_df2.hvplot.scatter(
    x="Area Shrinkage (mi^2)",
    y="Monthly Total Snowfall (in)",
    by="predict",
)
snow_scatter2.opts(yformatter="%.0f")

In [None]:
# Maybe the initial (1986) area has an impact?
# Area/weather clustering - area shrinkage (x) against initial area (y),
# grouped by the 'predict' column; y tick labels use the "%.0f" format
initial_area_scatter2 = predictions_df2.hvplot.scatter(
    x="Area Shrinkage (mi^2)",
    y="Area (mi^2)_1986",
    by="predict",
)
initial_area_scatter2.opts(yformatter="%.0f")

In [None]:
# That is a much more useful plot.
# The same view using the clustering from before length & height were dropped:
# area shrinkage (x) against initial area (y), grouped by the 'predict' column
initial_area_scatter = predictions_df.hvplot.scatter(
    x="Area Shrinkage (mi^2)",
    y="Area (mi^2)_1986",
    by="predict",
)
initial_area_scatter.opts(yformatter="%.0f")

In [None]:
# In both cases, glaciers appear to be shrinking relative to their size; larger glaciers are shrinking faster.
# However, as evidenced by the horizontal spread of the data, this is just one factor.

In [None]:
#..