In [13]:
import pandas as pd
import hvplot.pandas
from pathlib import Path
from sklearn.cluster import KMeans
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [None]:
#Connect to the database
conn_string = "host='localhost' dbname='glacier_data' user='postgres' password='REDACTED'"
#password has been removed for security reasons
conn = psycopg2.connect(conn_string)
print("Database opened successfully")
cursor = conn.cursor();
query = 'select * from yourdatabase'
cursor.execute(query)
colnames = [desc[0] for desc in cursor.description]
df = pd.DataFrame(cursor.fetchall(), columns=colnames)
cursor.close()
print('Database closed successfully!')

In [14]:
#read in data
df = pd.read_csv(Path("../Data/SMLprep.csv"))
df.head()

Unnamed: 0,Id,Name,Lat,Lon,Area (mi^2)_1986,Area (mi^2)_2020,Area Shrinkage (mi^2),Shrinkage,Height (ft),Length (ft)
0,RGI60-01.00002,,63.404,-146.668,0.324903,0.224479,0.100425,True,1414.111,3927.357
1,RGI60-01.00003,,63.376,-146.08,0.4361,0.530965,-0.094865,False,1880.013,6909.786
2,RGI60-01.00004,,63.381,-146.12,0.763089,1.232896,-0.469807,False,3425.364,13698.175
3,RGI60-01.00005,,63.551,-147.057,0.061158,0.923282,-0.862124,False,2700.263,9780.661
4,RGI60-01.00006,,63.571,-146.244,2.799382,3.721622,-0.922239,False,7697.226,34509.558


In [15]:
#restricting to columns useful for analysis
df=df[['Lat','Lon','Area (mi^2)_1986','Area (mi^2)_2020','Shrinkage','Height (ft)','Length (ft)']]
df.head()

Unnamed: 0,Lat,Lon,Area (mi^2)_1986,Area (mi^2)_2020,Shrinkage,Height (ft),Length (ft)
0,63.404,-146.668,0.324903,0.224479,True,1414.111,3927.357
1,63.376,-146.08,0.4361,0.530965,False,1880.013,6909.786
2,63.381,-146.12,0.763089,1.232896,False,3425.364,13698.175
3,63.551,-147.057,0.061158,0.923282,False,2700.263,9780.661
4,63.571,-146.244,2.799382,3.721622,False,7697.226,34509.558


In [16]:
#defining variable and features
y=df['Shrinkage'] #target variable
X=df.drop(columns='Shrinkage') #features
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1, 
                                                    stratify=y)
X_train.shape

(10339, 6)

In [17]:
#shape of testing data
X_test.shape

(3447, 6)

In [18]:
#logistic regression
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='lbfgs', max_iter=200, random_state=1)
classifier

In [19]:
#logistic regression
classifier.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [20]:
#generating and summarizing results
predictions = classifier.predict(X_test)
results = pd.DataFrame({"Prediction": predictions, "Actual": y_test}).reset_index(drop=True)
cm=confusion_matrix(y_test,predictions)
cm_df=pd.DataFrame(cm, index=['Actual 0', 'Actual 1'], columns=['Predicted 0', 'Predicted 1'])
acc_score=accuracy_score(y_test, predictions)
print('--- --- ---    -    --- --- ---    -    --- --- ---')
print('Confusion Matrix')
display(cm_df)
print('--- --- ---    -    --- --- ---    -    --- --- ---')
print(f'Accuracy Score: {acc_score}')
print('--- --- ---    -    --- --- ---    -    --- --- ---')
print('Classification Report')
print(classification_report(y_test, predictions))
print('--- --- ---    -    --- --- ---    -    --- --- ---')

--- --- ---    -    --- --- ---    -    --- --- ---
Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,278,256
Actual 1,0,2913


--- --- ---    -    --- --- ---    -    --- --- ---
Accuracy Score: 0.9257325210327821
--- --- ---    -    --- --- ---    -    --- --- ---
Classification Report
              precision    recall  f1-score   support

       False       1.00      0.52      0.68       534
        True       0.92      1.00      0.96      2913

    accuracy                           0.93      3447
   macro avg       0.96      0.76      0.82      3447
weighted avg       0.93      0.93      0.92      3447

--- --- ---    -    --- --- ---    -    --- --- ---


In [21]:
#accurate with 92%; strangely enough 100% with identifying shrinkers but only about 50% with nonshrinkers
#this means the AI is overfitting to class 1.
#we can infer that 
#1- its easier to predict a glacier shrinking but when it doesnt shrink there is not a clear answer (likely because these are anomalous)
#and 2- the majority of glaciers are shrinking to the point the model itself overfits to the shrinking category

In [22]:
#optimizing the model: using class weight to help 
classifier2 = LogisticRegression(class_weight='balanced',solver='lbfgs', max_iter=200, random_state=1)
classifier2

In [23]:
#logistic regression
classifier2.fit(X_train, y_train)

In [24]:
#generating and summarizing results
predictions = classifier2.predict(X_test)
results = pd.DataFrame({"Prediction": predictions, "Actual": y_test}).reset_index(drop=True)
cm=confusion_matrix(y_test,predictions)
cm_df=pd.DataFrame(cm, index=['Actual 0', 'Actual 1'], columns=['Predicted 0', 'Predicted 1'])
acc_score2=accuracy_score(y_test, predictions)
acc_diff=acc_score-acc_score2
print('--- --- ---    -    --- --- ---    -    --- --- ---')
print('Confusion Matrix')
display(cm_df)
print('--- --- ---    -    --- --- ---    -    --- --- ---')
print(f'Accuracy Score: {acc_score2}')
print(f'Accuracy Score Loss: {acc_diff}')
print('--- --- ---    -    --- --- ---    -    --- --- ---')
print('Classification Report')
print(classification_report(y_test, predictions))
print('--- --- ---    -    --- --- ---    -    --- --- ---')

--- --- ---    -    --- --- ---    -    --- --- ---
Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,530,4
Actual 1,278,2635


--- --- ---    -    --- --- ---    -    --- --- ---
Accuracy Score: 0.918189730200174
Accuracy Score Loss: 0.007542790832608115
--- --- ---    -    --- --- ---    -    --- --- ---
Classification Report
              precision    recall  f1-score   support

       False       0.66      0.99      0.79       534
        True       1.00      0.90      0.95      2913

    accuracy                           0.92      3447
   macro avg       0.83      0.95      0.87      3447
weighted avg       0.95      0.92      0.92      3447

--- --- ---    -    --- --- ---    -    --- --- ---


In [25]:
#at a drop of only half a percent accuracy, we see a much more even distribution of overall precision

#False class performance has improved dramatically:
#Recall increased from 0.52 to 0.99, meaning fewer False Negatives.
#Precision is still moderate (0.66), but the trade-off is acceptable, considering the large improvement in recall.
#F1-Score for the False class has increased from 0.68 to 0.79, indicating a better balance between precision and recall.
#True class performance remains very strong:

#Precision is nearly perfect (above 0.99), meaning almost all predictions of True are correct.
#Recall dropped slightly (from 1.00 to 0.90), but this is not a huge issue since the model still catches a large proportion of True instances.
#F1-Score remains high (0.95).

In [26]:
#our experiences building and exploring the SML led us to want to return to UML, so we built a second UML as well
#..