In [None]:
from pathlib import Path

import hvplot.pandas  # imported for its side effect: registers the .hvplot accessor on pandas objects
import pandas as pd
import psycopg2
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the full result of `query` from PostgreSQL into the DataFrame `df`.
# The password is redacted in this shared copy; supply real credentials locally
# (ideally from an environment variable rather than a literal in the notebook).
conn_string = "host='localhost' dbname='glacier_data' user='postgres' password='REDACTED'"
#password has been removed for security reasons
query = 'select * from yourdatabase'  # NOTE(review): table name looks like a placeholder — confirm

conn = psycopg2.connect(conn_string)
print("Database opened successfully")
try:
    # The cursor context manager closes the cursor even if the query raises.
    with conn.cursor() as cursor:
        cursor.execute(query)
        # cursor.description holds one entry per result column; element 0 is its name.
        colnames = [desc[0] for desc in cursor.description]
        df = pd.DataFrame(cursor.fetchall(), columns=colnames)
finally:
    # Always release the connection itself (not just the cursor) so the
    # "closed" message below is accurate and no session is left open.
    conn.close()
print('Database closed successfully!')

In [None]:
# Read the prepared CSV into a DataFrame and preview the first rows.
csv_path = Path("../Data") / "UMLprep.csv"
raw_df = pd.read_csv(csv_path)
raw_df.head()

In [None]:
# Drop rows with missing values, then standardize the numeric feature columns
# with StandardScaler (each column is centred and scaled to unit variance).
raw_df = raw_df.dropna()
feature_columns = [
    'Year',
    'Monthly Average Mean Temperature (degF)',
    'Monthly Total Precipitation (in)',
    'Monthly Total Snowfall (in)',
    'Area (mi^2)',
    'Height (ft)',
    'Length (ft)',
]
scaler = StandardScaler()
raw_df_scaled = scaler.fit_transform(raw_df[feature_columns])

# Preview the first three rows of the scaled array
raw_df_scaled[:3]

In [None]:
# Compute the K-means inertia for k = 1..10; the result feeds the elbow plot.
# The models are fitted on the standardized matrix (raw_df_scaled), not the raw
# frame: K-means minimizes Euclidean distances, so unscaled columns with larger
# numeric ranges would otherwise dominate the clustering.
inertia = []
k = list(range(1, 11))
for i in k:
    # random_state=1 keeps the centroid initialisation reproducible across runs
    k_model = KMeans(n_clusters=i, random_state=1)
    k_model.fit(raw_df_scaled)
    inertia.append(k_model.inertia_)
elbow_data = {"k": k, "inertia": inertia}
elbow_df = pd.DataFrame(elbow_data)
elbow_df.head()

In [None]:
# Elbow curve: inertia as a function of k, with one x tick per candidate k.
elbow_plot_kwargs = dict(
    x="k",
    y="inertia",
    title="Elbow Curve",
    xticks=k,
)
elbow_df.hvplot.line(**elbow_plot_kwargs)

In [None]:
# Fit the K-means model with 4 clusters and label every row with its cluster.
# random_state=1 keeps the centroid initialisation reproducible.
model = KMeans(n_clusters=4, random_state=1)
# Fit and predict on the standardized features (raw_df_scaled) rather than the
# raw frame, so every feature contributes on a comparable scale.
model.fit(raw_df_scaled)
predict = model.predict(raw_df_scaled)
# Attach the labels to a copy of the unscaled frame so that the scatter plots
# below keep the original units. raw_df_scaled was built from raw_df, so the
# label array lines up row-for-row with the copy.
predictions_df = raw_df.copy()
predictions_df['predict'] = predict
predictions_df.head()

In [None]:
# Cluster scatter: "Area (mi^2)" against monthly mean temperature,
# grouped by the predicted cluster label; y ticks shown without decimals.
scatter_kwargs = dict(
    x="Area (mi^2)",
    y="Monthly Average Mean Temperature (degF)",
    by="predict",
)
predictions_df.hvplot.scatter(**scatter_kwargs).opts(yformatter="%.0f")

In [None]:
# Cluster scatter: "Area (mi^2)" against monthly total precipitation,
# grouped by the predicted cluster label; y ticks shown without decimals.
scatter_kwargs = dict(
    x="Area (mi^2)",
    y="Monthly Total Precipitation (in)",
    by="predict",
)
predictions_df.hvplot.scatter(**scatter_kwargs).opts(yformatter="%.0f")

In [None]:
# Cluster scatter: "Area (mi^2)" against monthly total snowfall,
# grouped by the predicted cluster label; y ticks shown without decimals.
scatter_kwargs = dict(
    x="Area (mi^2)",
    y="Monthly Total Snowfall (in)",
    by="predict",
)
predictions_df.hvplot.scatter(**scatter_kwargs).opts(yformatter="%.0f")

In [None]:
# Cluster scatter: "Height (ft)" against monthly mean temperature,
# grouped by the predicted cluster label; y ticks shown without decimals.
scatter_kwargs = dict(
    x="Height (ft)",
    y="Monthly Average Mean Temperature (degF)",
    by="predict",
)
predictions_df.hvplot.scatter(**scatter_kwargs).opts(yformatter="%.0f")

In [None]:
# Cluster scatter: "Height (ft)" against monthly total precipitation,
# grouped by the predicted cluster label; y ticks shown without decimals.
scatter_kwargs = dict(
    x="Height (ft)",
    y="Monthly Total Precipitation (in)",
    by="predict",
)
predictions_df.hvplot.scatter(**scatter_kwargs).opts(yformatter="%.0f")

In [None]:
# Cluster scatter: "Height (ft)" against monthly total snowfall,
# grouped by the predicted cluster label; y ticks shown without decimals.
scatter_kwargs = dict(
    x="Height (ft)",
    y="Monthly Total Snowfall (in)",
    by="predict",
)
predictions_df.hvplot.scatter(**scatter_kwargs).opts(yformatter="%.0f")

In [None]:
# Cluster scatter: "Length (ft)" against monthly mean temperature,
# grouped by the predicted cluster label; y ticks shown without decimals.
scatter_kwargs = dict(
    x="Length (ft)",
    y="Monthly Average Mean Temperature (degF)",
    by="predict",
)
predictions_df.hvplot.scatter(**scatter_kwargs).opts(yformatter="%.0f")

In [None]:
# Cluster scatter: "Length (ft)" against monthly total precipitation,
# grouped by the predicted cluster label; y ticks shown without decimals.
scatter_kwargs = dict(
    x="Length (ft)",
    y="Monthly Total Precipitation (in)",
    by="predict",
)
predictions_df.hvplot.scatter(**scatter_kwargs).opts(yformatter="%.0f")

In [None]:
# Cluster scatter: "Length (ft)" against monthly total snowfall,
# grouped by the predicted cluster label; y ticks shown without decimals.
scatter_kwargs = dict(
    x="Length (ft)",
    y="Monthly Total Snowfall (in)",
    by="predict",
)
predictions_df.hvplot.scatter(**scatter_kwargs).opts(yformatter="%.0f")

In [None]:
# Model tuning: keep only the rows whose "Area (mi^2)" is at most 600,
# treating larger values as apparent outliers, then report the remaining shape.
within_area_limit = raw_df['Area (mi^2)'] <= 600
df2 = raw_df[within_area_limit]
df2.shape

In [None]:
# Standardize the numeric feature columns of the outlier-filtered frame
# with StandardScaler (each column is centred and scaled to unit variance).
filtered_feature_columns = [
    'Year',
    'Monthly Average Mean Temperature (degF)',
    'Monthly Total Precipitation (in)',
    'Monthly Total Snowfall (in)',
    'Area (mi^2)',
    'Height (ft)',
    'Length (ft)',
]
filtered_scaler = StandardScaler()
df2_scaled = filtered_scaler.fit_transform(df2[filtered_feature_columns])

# Preview the first three rows of the scaled array
df2_scaled[:3]

In [None]:
# Inertia of a K-means fit on the scaled, filtered data for each k in 1..10;
# collected into a small frame for the elbow plot.
k = list(range(1, 11))
inertia = [
    KMeans(n_clusters=n_clusters, random_state=1).fit(df2_scaled).inertia_
    for n_clusters in k
]
elbow_df = pd.DataFrame({"k": k, "inertia": inertia})
elbow_df.head()

In [None]:
# Elbow curve for the filtered data: inertia as a function of k,
# with one x tick per candidate k.
elbow_plot_kwargs = dict(
    x="k",
    y="inertia",
    title="Elbow Curve",
    xticks=k,
)
elbow_df.hvplot.line(**elbow_plot_kwargs)

In [None]:
# Fit the K-means model with 5 clusters on the outlier-filtered data and label
# every row with its cluster. random_state=1 keeps the initialisation reproducible.
model = KMeans(n_clusters=5, random_state=1)
# Fit and predict on the standardized features (df2_scaled) — the same matrix
# the elbow curve for this pass was computed from — rather than the raw frame.
model.fit(df2_scaled)
predict = model.predict(df2_scaled)
# Attach the labels to a copy of the unscaled frame so that the scatter plots
# below keep the original units. df2_scaled was built from df2, so the label
# array lines up row-for-row with the copy.
predictions_df = df2.copy()
predictions_df['predict'] = predict
predictions_df.head()

In [None]:
# Cluster scatter: "Area (mi^2)" against monthly mean temperature,
# grouped by the predicted cluster label; y ticks shown without decimals.
scatter_kwargs = dict(
    x="Area (mi^2)",
    y="Monthly Average Mean Temperature (degF)",
    by="predict",
)
predictions_df.hvplot.scatter(**scatter_kwargs).opts(yformatter="%.0f")

In [None]:
# Cluster scatter: "Area (mi^2)" against monthly total precipitation,
# grouped by the predicted cluster label; y ticks shown without decimals.
scatter_kwargs = dict(
    x="Area (mi^2)",
    y="Monthly Total Precipitation (in)",
    by="predict",
)
predictions_df.hvplot.scatter(**scatter_kwargs).opts(yformatter="%.0f")

In [None]:
# Cluster scatter: "Area (mi^2)" against monthly total snowfall,
# grouped by the predicted cluster label; y ticks shown without decimals.
scatter_kwargs = dict(
    x="Area (mi^2)",
    y="Monthly Total Snowfall (in)",
    by="predict",
)
predictions_df.hvplot.scatter(**scatter_kwargs).opts(yformatter="%.0f")

In [None]:
# Cluster scatter: "Height (ft)" against monthly mean temperature,
# grouped by the predicted cluster label; y ticks shown without decimals.
scatter_kwargs = dict(
    x="Height (ft)",
    y="Monthly Average Mean Temperature (degF)",
    by="predict",
)
predictions_df.hvplot.scatter(**scatter_kwargs).opts(yformatter="%.0f")

In [None]:
# Cluster scatter: "Height (ft)" against monthly total precipitation,
# grouped by the predicted cluster label; y ticks shown without decimals.
scatter_kwargs = dict(
    x="Height (ft)",
    y="Monthly Total Precipitation (in)",
    by="predict",
)
predictions_df.hvplot.scatter(**scatter_kwargs).opts(yformatter="%.0f")

In [None]:
# Cluster scatter: "Height (ft)" against monthly total snowfall,
# grouped by the predicted cluster label; y ticks shown without decimals.
scatter_kwargs = dict(
    x="Height (ft)",
    y="Monthly Total Snowfall (in)",
    by="predict",
)
predictions_df.hvplot.scatter(**scatter_kwargs).opts(yformatter="%.0f")

In [None]:
# Cluster scatter: "Length (ft)" against monthly mean temperature,
# grouped by the predicted cluster label; y ticks shown without decimals.
scatter_kwargs = dict(
    x="Length (ft)",
    y="Monthly Average Mean Temperature (degF)",
    by="predict",
)
predictions_df.hvplot.scatter(**scatter_kwargs).opts(yformatter="%.0f")

In [None]:
# Cluster scatter: "Length (ft)" against monthly total precipitation,
# grouped by the predicted cluster label; y ticks shown without decimals.
scatter_kwargs = dict(
    x="Length (ft)",
    y="Monthly Total Precipitation (in)",
    by="predict",
)
predictions_df.hvplot.scatter(**scatter_kwargs).opts(yformatter="%.0f")

In [None]:
# Cluster scatter: "Length (ft)" against monthly total snowfall,
# grouped by the predicted cluster label; y ticks shown without decimals.
scatter_kwargs = dict(
    x="Length (ft)",
    y="Monthly Total Snowfall (in)",
    by="predict",
)
predictions_df.hvplot.scatter(**scatter_kwargs).opts(yformatter="%.0f")

In [None]:
# None of this data appears useful. From here we tried an SML approach.

In [None]:
#..