In [None]:
# Need to uninstall these existing versions to upgrade libraries
%pip uninstall -y matplotlib
%pip uninstall -y plotly

In [None]:
# click restart runtime (below) after install
%pip install seaborn

In [None]:
%pip install --upgrade matplotlib
%pip install --upgrade plotly
%pip install --upgrade seaborn


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%pip install geopandas
import geopandas as gpd
import plotly.express as px

import os
import sys
from importlib_metadata import version


In [None]:

import sklearn

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, RepeatedKFold, RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_validate, cross_val_predict, cross_val_score

from sklearn.preprocessing import StandardScaler, MinMaxScaler

from sklearn.metrics import accuracy_score, recall_score, precision_score, roc_auc_score, f1_score
from sklearn.metrics import roc_curve, auc, precision_recall_curve
from sklearn.metrics import confusion_matrix, classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier

In [None]:

import imblearn 
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

from imblearn.pipeline import Pipeline as imbpipeline
from imblearn.pipeline import make_pipeline


In [None]:

from google.colab import data_table
%load_ext google.colab.data_table

In [None]:

import warnings
# Install a global "ignore" filter: every warning category is silenced from
# here on, including deprecation warnings raised by the libraries used below.
warnings.filterwarnings("ignore")


In [None]:

# Notebook-wide presentation defaults: 14pt matplotlib font and three
# decimals for every float pandas renders.
font = dict(size=14)
plt.rc('font', **font)
pd.set_option('display.float_format', '{:.3f}'.format)


In [None]:

# Check the currently installed version of a library
def get_version_number(lib, as_tuple=False):
  """Return the installed ``major.minor`` version of ``lib``.

  Parameters
  ----------
  lib : str
      Distribution name passed to ``version()``.
  as_tuple : bool, default False
      When True, return ``(major, minor)`` as a tuple of ints. Tuples order
      correctly for any minor number, e.g. ``(3, 10) > (3, 5)``.

  Returns
  -------
  float or tuple of int
      By default the float ``major.minor`` (unchanged behaviour).

  Notes
  -----
  The float form cannot distinguish a two-digit minor from a one-digit one:
  version 3.10 becomes ``3.1``, which compares *lower* than ``3.5``. Use
  ``as_tuple=True`` (or feature detection) for ordering checks.

  Only the leading digits of each component are used, so pre-release strings
  such as ``'3.5rc1'`` parse as 3.5 instead of raising ``ValueError``.
  """
  numbers = []
  for component in version(lib).split('.')[0:2]:
    digits = ''
    for ch in component:
      if not ch.isdigit():
        break
      digits += ch
    numbers.append(int(digits) if digits else 0)

  # A bare major version ('3') is treated as '3.0'
  while len(numbers) < 2:
    numbers.append(0)

  if as_tuple:
    return tuple(numbers)
  return float('{}.{}'.format(*numbers))


# Load and clean dataset

In [None]:

# Read the modelling table, discard the disposal-site count column, and keep
# only rows without missing values.
df = (
    pd.read_csv('../../data/modeling_data/release_risk/filtered_release_risk_modeling_dataset.csv')
      .drop(columns=['sum_disposal_sites'])
      .dropna()
)
df.shape

In [None]:
# remove columns that contain only zeros
# (a column is kept unless the share of its values equal to 0 is exactly 1)
keep_column = (df == 0).mean().ne(1)
df = df.loc[:, keep_column.to_numpy()]

In [None]:
# remove case with inf value
# (only +inf in pop_density_acs_2018 is filtered out)
df = df.loc[df['pop_density_acs_2018'] != np.inf]

# Data prep


In [None]:
# Build the column selection group by group; concatenating the groups in this
# order yields the column order of Xy.
_id_columns = ['response', 'ALAND10', 'AWATER10', 'county', 'places']

_industry_columns = [
    'ALL OTHER CONVERTED PAPER PRODUCT MANUFACTURING',
    'ALL OTHER MISCELLANEOUS TEXTILE PRODUCT MILLS',
    'ARTIFICIAL & SYNTHETIC FIBERS & FILAMENTS MFG',
    'BROADWOVEN FABRIC MILLS',
    'CARPET & RUG MILLS',
    'CORRUGATED & SOLID FIBER BOX MANUFACTURING',
    'CURTAIN & LINEN MILLS',
    'FABRIC COATING MILLS',
    'FIBER YARN & THREAD MILLS',
    'FOLDING PAPERBOARD BOX MANUFACTURING',
    'KNIT FABRIC MILLS',
    "MEN'S & BOY'S CUT & SEW APPAREL MANUFACTURING",
    'NARROW FABRIC MILLS & SCHIFFLI MACHINE EMBROIDERY',
    'NONWOVEN FABRIC MILLS',
    'OTHER APPAREL KNITTING MILLS',
    'OTHER CUT & SEW APPAREL MANUFACTURING',
    'OTHER PAPERBOARD CONTAINER MANUFACTURING',
    'PAPER BAG & COATED & TREATED PAPER MANUFACTURING',
    'PLASTICS MATERIAL & RESIN MANUFACTURING',
    'SANITARY PAPER PRODUCT MANUFACTURING',
    'STATIONERY PRODUCT MANUFACTURING',
    'TEXTILE & FABRIC FINISHING MILLS',
    'TEXTILE BAG & CANVAS MILLS',
    "WOMEN'S GIRLS' & INFANTS' CUT & SEW APPAREL MFG",
]

_site_columns = [
    'sum_firestations',
    'sum_industrial_sites',
    'pop_density_acs_2018',
    'sum_airports',
    'sum_army_bases',
    'sum_highways',
]

_land_use_columns = [
    'AGRICULTURAL',
    'COMMERCIAL',
    'EXEMPT',
    'FOREST',
    'INDUSTRIAL',
    'MULTIPLE-USE',
    'OPEN-SPACE',
    'RECREATIONAL',
    'RESIDENTIAL',
]

Xy = df[_id_columns + _industry_columns + _site_columns + _land_use_columns]




In [None]:

# Number of distinct values in each location-grouping column
for caption, column in (("Unique places:", 'places'), ("Unique counties:", 'county')):
    print(caption, df[column].nunique(dropna=False))



In [None]:
# how many cases are non-zero?
# (per column: number of rows with a value greater than 0)
Xy.gt(0).sum()

In [None]:
# which variables have low max values
# (columns summarised by describe() whose maximum is below 4)
column_max = df.describe().loc['max']
column_max.index[(column_max < 4).values]


In [None]:
# which variables do not have low max values
# (columns summarised by describe() whose maximum is 4 or more)
column_max = df.describe().loc['max']
list(column_max.index[(column_max >= 4).values])


In [None]:

# binary encode features with low max values

bin_features = ['ALL OTHER CONVERTED PAPER PRODUCT MANUFACTURING',
                'ALL OTHER MISCELLANEOUS TEXTILE PRODUCT MILLS',
                'ARTIFICIAL & SYNTHETIC FIBERS & FILAMENTS MFG',
                'BROADWOVEN FABRIC MILLS',
                'CARPET & RUG MILLS',
                'CORRUGATED & SOLID FIBER BOX MANUFACTURING',
                'CURTAIN & LINEN MILLS',
                'FABRIC COATING MILLS',
                'FIBER YARN & THREAD MILLS',
                'FOLDING PAPERBOARD BOX MANUFACTURING',
                'KNIT FABRIC MILLS',
                "MEN'S & BOY'S CUT & SEW APPAREL MANUFACTURING",
                'NARROW FABRIC MILLS & SCHIFFLI MACHINE EMBROIDERY',
                'NONWOVEN FABRIC MILLS',
                'OTHER APPAREL KNITTING MILLS',
                'OTHER CUT & SEW APPAREL MANUFACTURING',
                'OTHER PAPERBOARD CONTAINER MANUFACTURING',
                'PAPER BAG & COATED & TREATED PAPER MANUFACTURING',
                'PLASTICS MATERIAL & RESIN MANUFACTURING',
                'SANITARY PAPER PRODUCT MANUFACTURING',
                'STATIONERY PRODUCT MANUFACTURING',
                'TEXTILE & FABRIC FINISHING MILLS',
                'TEXTILE BAG & CANVAS MILLS',
                "WOMEN'S GIRLS' & INFANTS' CUT & SEW APPAREL MFG"]

# Presence flag per location: 1 where the value is above zero, else 0.
# astype(int) produces integer columns explicitly. Chaining
# replace(True, 1) / replace(False, 0) on a boolean frame depends on pandas'
# implicit downcasting (deprecated) and can leave object-dtype columns.
Xy = (df[bin_features] > 0).astype(int)


In [None]:
# attach remaining features

# Columns kept with their original (non-binarised) values
continuous_features = [
    'ALAND10',
    'AWATER10',
    'AGRICULTURAL',
    'COMMERCIAL',
    'EXEMPT',
    'FOREST',
    'INDUSTRIAL',
    'MULTIPLE-USE',
    'OPEN-SPACE',
    'RECREATIONAL',
    'RESIDENTIAL',
    'sum_firestations',
    'sum_industrial_sites',
    'sum_airports',
    'sum_army_bases',
    'sum_highways',
]

# Identifier / location columns carried along with the features
info_vars = ['GEOID10', 'county', 'places', 'INTPTLAT10', 'INTPTLON10']

# Resulting column order: binary flags, info columns, response, continuous features
Xy = (
    pd.concat([Xy, df[info_vars]], axis=1)
      .assign(response=df['response'])
      .pipe(lambda frame: pd.concat([frame, df[continuous_features]], axis=1))
)


In [None]:
# split by response value
is_positive = Xy['response'] == 1
is_negative = Xy['response'] == 0

# Feature rows of each class, with the response column itself removed
X_pos = Xy[is_positive].drop(columns='response')
X_neg = Xy[is_negative].drop(columns='response')


In [None]:

# Create table showing the proportion of features (binary encoded) are present in each response class

# Percentage of rows in each class where the feature flag sums up
pos_pct = X_pos[bin_features].sum().values / X_pos.shape[0] * 100
neg_pct = X_neg[bin_features].sum().values / X_neg.shape[0] * 100

bin_summary_df = pd.DataFrame({
    'feature': X_pos[bin_features].columns,
    'pos_pct': pos_pct,
    'neg_pct': neg_pct,
})
bin_summary_df['diff'] = bin_summary_df['pos_pct'] - bin_summary_df['neg_pct']
bin_summary_df['abs_diff'] = bin_summary_df['diff'].abs()

# Table view: largest positive difference first
bin_summary_df = bin_summary_df.sort_values('diff', ascending=False).reset_index(drop=True)

display(bin_summary_df)

# Plot order: smallest absolute difference first. pos_diff / neg_diff hold
# the signed difference on one side of zero and NaN elsewhere.
bin_summary_df = bin_summary_df.sort_values('abs_diff', ascending=True)
bin_summary_df['pos_diff'] = bin_summary_df['diff'].where(bin_summary_df['diff'] > 0)
bin_summary_df['neg_diff'] = bin_summary_df['diff'].where(bin_summary_df['diff'] < 0)



In [None]:
# plot differences in proportions of features present by class

fig, ax = plt.subplots(figsize=(8,10))

labels = list(bin_summary_df['feature'])
label_ticks = np.arange(len(labels))
bar_width = .4
bar_text = np.round(bin_summary_df['abs_diff'], 1)

# Detect Axes.bar_label directly rather than comparing a float version
# number: a float reads matplotlib 3.10 as 3.1, which would skip the labels.
can_label_bars = hasattr(ax, 'bar_label')

# Red bars: features more common where response == 1 (NaN rows draw nothing)
bars = ax.barh(label_ticks, bin_summary_df['pos_diff'], label='Response=1', color='red')
if can_label_bars:
  ax.bar_label(bars, bar_text, padding=3)

# Blue bars: features more common where response == 0
bars = ax.barh(label_ticks, bin_summary_df['neg_diff'], label='Response=0', color='blue')
if can_label_bars:
  ax.bar_label(bars, bar_text, padding=3)

ax.set_xlim(-10, 10)
ax.set_xlabel('Difference between percentages of features present, by response class')

ax.set_title('Percent of locations containing each feature')
ax.set_yticks(label_ticks)
ax.set_yticklabels(labels)

# Zero reference line spanning the full height of the chart
ax.vlines(0, -1, len(label_ticks), color='black', linewidth=.5, alpha=.75)
ax.set_ylim(-1, len(label_ticks))

ax.grid(True, axis='x', linestyle=':', linewidth=.5, color='green', )
ax.set_axisbelow(True)

ax.legend()

plt.show()

In [None]:

# Plot just the main feature percent differences

# Keep features whose positive-class surplus is at least 0.5 percentage points
bin_summary_plot = bin_summary_df.loc[bin_summary_df['pos_diff']>=.5,]

labels = list(bin_summary_plot['feature'])
label_ticks = np.arange(len(labels))
bar_width = .4

fig, ax = plt.subplots(figsize=(8,10))

bars = ax.barh(label_ticks, bin_summary_plot['pos_diff'], label='Response=1')
# Detect Axes.bar_label directly rather than comparing a float version
# number: a float reads matplotlib 3.10 as 3.1, which would skip the labels.
if hasattr(ax, 'bar_label'):
  ax.bar_label(bars, np.round(bin_summary_plot['abs_diff'],1), padding=3, fontsize=22)

ax.set_xlim(-0, 10)
ax.set_xlabel('Percent of locations containing each feature', fontdict={'fontsize':22})

# Title spelling corrected ("Prevalence")
ax.set_title('Percent Higher Prevalence in PFAS Release Areas', fontdict={'fontsize':22})

ax.set_yticks(label_ticks)
ax.set_yticklabels(labels, fontsize=18)

ax.vlines(0, -1, len(label_ticks), color='black', linewidth=.5, alpha=.75)
ax.set_ylim(-1, len(label_ticks))

ax.grid(True, axis='x', linestyle=':', linewidth=.5, color='green', )
ax.set_axisbelow(True)

plt.show()

In [None]:

# Create table showing mean differences and z-score differences for continuous features in each response class

# Overall and per-class statistics, one value per continuous feature
overall_mean = Xy[continuous_features].mean().values
overall_std = Xy[continuous_features].std().values
pos_mean = X_pos[continuous_features].mean().values
neg_mean = X_neg[continuous_features].mean().values

# create summary table
continous_summary_df = pd.DataFrame({
    'feature': X_pos[continuous_features].columns,
    'Xy_mean': overall_mean,
    'Xy_std': overall_std,
    'pos_mean': pos_mean,
    'neg_mean': neg_mean,
    # class means expressed in standard deviations from the overall mean
    'pos_z': (pos_mean - overall_mean) / overall_std,
    'neg_z': (neg_mean - overall_mean) / overall_std,
})
continous_summary_df['diff'] = continous_summary_df['pos_z'] - continous_summary_df['neg_z']
continous_summary_df['abs_diff'] = continous_summary_df['diff'].abs()

# Table view: largest absolute difference first
continous_summary_df = continous_summary_df.sort_values('abs_diff', ascending=False).reset_index(drop=True)

display(continous_summary_df)

# Plot order: smallest absolute difference first. pos_diff / neg_diff hold
# the signed difference on one side of zero and NaN elsewhere.
continous_summary_df = continous_summary_df.sort_values('abs_diff', ascending=True)
continous_summary_df['pos_diff'] = continous_summary_df['diff'].where(continous_summary_df['diff'] > 0)
continous_summary_df['neg_diff'] = continous_summary_df['diff'].where(continous_summary_df['diff'] < 0)


In [None]:
# plot differences in z-scores for continuous features by class

fig, ax = plt.subplots(figsize=(8,10))

labels = list(continous_summary_df['feature'])
label_ticks = np.arange(len(labels))

bar_width = .4
bar_text = np.round(continous_summary_df['abs_diff'], 1)

# Detect Axes.bar_label directly rather than comparing a float version
# number: a float reads matplotlib 3.10 as 3.1, which would skip the labels.
can_label_bars = hasattr(ax, 'bar_label')

# Red bars: higher average in the response == 1 class (NaN rows draw nothing)
bars = ax.barh(label_ticks, continous_summary_df['pos_diff'], label='Response=1', color='red')
if can_label_bars:
  ax.bar_label(bars, bar_text, padding=3)

# Blue bars: higher average in the response == 0 class
bars = ax.barh(label_ticks, continous_summary_df['neg_diff'], label='Response=0', color='blue')
if can_label_bars:
  ax.bar_label(bars, bar_text, padding=3)

ax.set_xlim(-1, 1)
ax.set_title('Difference between z-scores of values for each response class (of overall mean)')

ax.set_xlabel('z-score difference')
ax.set_yticks(label_ticks)
ax.set_yticklabels(labels)

# Zero reference line spanning the full height of the chart
ax.vlines(0, -1, len(label_ticks), color='black', linewidth=.5, alpha=.75)
ax.set_ylim(-1, len(label_ticks))

ax.grid(True, axis='x', linestyle=':', linewidth=.5, color='green', )
ax.set_axisbelow(True)

ax.legend()

plt.show()

In [None]:
# plot just the main feature z score differences, for presentation

# Keep features whose class z-scores differ by more than 0.1
continous_summary_plot = continous_summary_df.loc[continous_summary_df['abs_diff'] > 0.1,]

fig, ax = plt.subplots(figsize=(8,10))

labels = list(continous_summary_plot['feature'])
label_ticks = np.arange(len(labels))

bar_width = .4
bar_text = np.round(continous_summary_plot['abs_diff'], 1)

# Detect Axes.bar_label directly rather than comparing a float version
# number: a float reads matplotlib 3.10 as 3.1, which would skip the labels.
can_label_bars = hasattr(ax, 'bar_label')

bars = ax.barh(label_ticks, continous_summary_plot['pos_diff'], label='PFAS Release Area', color='red')
if can_label_bars:
  ax.bar_label(bars, bar_text, padding=3)

bars = ax.barh(label_ticks, continous_summary_plot['neg_diff'], label='No Documented Release', color='blue')
if can_label_bars:
  ax.bar_label(bars, bar_text, padding=3)

ax.set_xlim(-1, 1)
ax.set_title('Relative difference in average values for area characteristics\n', fontdict={'fontsize':22})
ax.set_xlabel('z-score difference', fontsize=18)

ax.set_yticks(label_ticks)
ax.set_yticklabels(labels, fontsize=18)

ax.vlines(0, -1, len(label_ticks), color='black', linewidth=.5, alpha=.75)
ax.set_ylim(-1, len(label_ticks))

ax.grid(True, axis='x', linestyle=':', linewidth=.5, color='green', )
ax.set_axisbelow(True)

# Legend is anchored below the axes
plt.legend(bbox_to_anchor=(-0.03, -.1), loc='upper left', fontsize=22, markerscale=1.5);
plt.show()

In [None]:

# Binary features that occur more often in the positive class
more_common_in_positive = bin_summary_df['diff'] > 0
important_bin_features = bin_summary_df.loc[more_common_in_positive, 'feature'].tolist()

print('\nFeatures with positive difference in proportions > 0:')
display(important_bin_features)

# Continuous features whose class z-scores differ by more than 0.1;
# drop total land area variable to avoid bias due to census place size
large_z_gap = continous_summary_df['abs_diff'] > .1
important_continous_features = [
    name
    for name in continous_summary_df.loc[large_z_gap, 'feature'].tolist()
    if name != 'ALAND10'
]

print('\nFeatures with abs difference in mean z-scores > 0.1:')
display(important_continous_features)

print('\nImportant features selected for modeling')
important_features = important_bin_features + important_continous_features

display(important_features)


# Aggregate by location grouping

In [None]:


# Selected features plus the grouping key and the response, taken from df
Xy = df[important_features].assign(places=df['places'], response=df['response'])

# One row per place: every column is averaged over the rows of that place
Xy_aggr = Xy.groupby(['places'], as_index=False).mean()

# A place counts as positive when any of its rows has a positive response
Xy_aggr['response'] = Xy_aggr['response'].mask(Xy_aggr['response'] > 0, 1)

X = Xy_aggr[important_features]
y = Xy_aggr['response']
X.shape


# Train/test split size comparison

In [None]:

# Train default models on train/test splits of increasing size (same random
# state) and record the mean repeated cross-validation score of each.

seed = 42
k_folds = 5
n_repeats = 10

model_list = [
    LogisticRegression(random_state=seed),
    KNeighborsClassifier(),
    RandomForestClassifier(random_state=seed),
]

# results-table column -> key in the cross_validate() output
metric_columns = {
    'cv_accuracy': 'test_accuracy',
    'cv_precision': 'test_precision',
    'cv_recall': 'test_recall',
    'cv_f1': 'test_f1',
    'cv_ROC': 'test_roc_auc',
}

results = []
results_df = pd.DataFrame()

for train_size in np.arange(.5, .82, .02):

    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        train_size=train_size,
        stratify=y,
        random_state=seed)

    for clf in model_list:

        model_str = str(type(clf))
        print('Evaluating: {},  train_size: {:.2f} '.format(model_str, train_size))

        # scale -> oversample with SMOTE -> random undersample -> classifier
        model = make_pipeline(
            StandardScaler(),
            SMOTE(random_state=seed),
            RandomUnderSampler(random_state=seed),
            clf)

        rkf = RepeatedStratifiedKFold(
            n_splits=k_folds,
            n_repeats=n_repeats,
            random_state=seed)

        cv_results = cross_validate(
            model,
            X_train, y_train,
            cv=rkf,
            scoring=['accuracy', 'recall', 'precision', 'f1', 'roc_auc'])

        # one record per (model, train_size) holding the mean of every metric
        record = {'model': model_str, 'train_size': train_size}
        for column, cv_key in metric_columns.items():
            record[column] = cv_results[cv_key].mean()
        results.append(record)


results_df = pd.DataFrame(results)


In [None]:

# plot train size results comparison
sns.set_style("whitegrid")

# Long format: one row per (model, train_size, metric)
results_long = results_df.melt(['model', 'train_size'])

# One panel per metric, one line per model
g = sns.FacetGrid(data=results_long, col='variable', hue='model', height=4, aspect=.9)
g.map_dataframe(sns.lineplot, x='train_size', y='value')

plt.ylim(0, 1)
plt.xticks(np.arange(.5, .85, .1))
plt.legend(bbox_to_anchor=(-4.5, -.4), loc='upper left');


# Parameter Tuning

In [None]:

seed = 42

train_size = .65
k_folds = 5
prob_threshold = .5
n_repeats = 50


# Fixed, stratified split used for tuning and for the test-set evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                        stratify=y,
                                        train_size=train_size,
                                        random_state=seed
                                        )

model_list = [
              LogisticRegression(random_state=seed),
              KNeighborsClassifier(),
              RandomForestClassifier(random_state=seed)
              ]


# One search space per entry of model_list (same order). Keys use the
# lower-cased step names that make_pipeline assigns.
params = [
          {'smote__sampling_strategy': [0.5, 0.6, 0.8, 1.0],
          'logisticregression__C': (0.01, 0.1, 0.2, 0.5, 0.8, 1, 2, 5, 10),
          'logisticregression__solver': ['lbfgs', 'liblinear', 'sag'],
          'logisticregression__penalty': ['l1', 'l2'],
          'logisticregression__class_weight': ['balanced', None],},

          {'smote__sampling_strategy': [0.5, 0.6, 0.8, 1.0],
           'kneighborsclassifier__n_neighbors': np.arange(1,25,1),},

          {'smote__sampling_strategy': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
          'randomforestclassifier__n_estimators': [5,10,20,50],
          'randomforestclassifier__criterion': ['gini', 'entropy'],
          'randomforestclassifier__max_depth': [1, 2, 3, 4, 5],
          'randomforestclassifier__min_samples_split': [2, 3, 5, 10],
          'randomforestclassifier__min_samples_leaf': [1, 2, 4, 8],
           }]


models = []
best_params = []
cv_result_frames = []

print('\ntrain_size:', train_size,
      '\nk_folds:', k_folds,
      '\nn_repeats:', n_repeats)

# (cross_validate key, summary line) in the order the summary is printed
summary_lines = [
    ('test_accuracy',  'cv_accuracy   mean: {:.2f} | sd: {:.2f}'),
    ('test_recall',    'cv_recall     mean: {:.2f} | sd: {:.2f}'),
    ('test_precision', 'cv_precision  mean: {:.2f} | sd: {:.2f}'),
    ('test_f1',        'cv_f1         mean: {:.2f} | sd: {:.2f}'),
    ('test_roc_auc',   'cv_ROC        mean: {:.2f} | sd: {:.2f}'),
]

# Both splitters are seeded with an int, so they can be built once and reused
kf = StratifiedKFold(n_splits=k_folds,
                     random_state=seed,
                     shuffle=True)

rkf = RepeatedStratifiedKFold(n_splits=k_folds,
                              n_repeats=n_repeats,
                              random_state=seed
                              )

for clf, param in zip(model_list, params):

    model_str = str(type(clf))
    print('\nmodel: ',model_str)

    model = make_pipeline(
                          StandardScaler(),
                          SMOTE(random_state=seed),
                          RandomUnderSampler(random_state=seed),
                          clf)

    # Randomised hyper-parameter search (inner CV = kf, optimising recall).
    # NOTE: no random_state is passed, so the sampled candidates differ
    # between runs.
    model = RandomizedSearchCV(model,
                               param,
                               cv=kf,
                               n_iter=10,
                               scoring='recall')

    # Nested cross validation: the whole search is re-fitted inside every
    # outer repeated stratified fold, so the scores describe the tuning
    # procedure and not one fixed parameter set.
    cv_results = cross_validate(model,
                                X_train, y_train,
                                cv=rkf,
                                scoring=['precision', 'accuracy', 'recall', 'f1', 'roc_auc']
                                )

    # output results (NaN scores are left out of mean and sd)
    for metric_key, line in summary_lines:
        scores = cv_results[metric_key]
        scores = scores[~np.isnan(scores)]
        print(line.format(scores.mean(), scores.std()))

    # save cv results for comparison of models and creation of ensembles
    cv_result_frames.append(pd.DataFrame({
        'model': model_str,
        'test_accuracy': cv_results['test_accuracy'],
        'test_recall': cv_results['test_recall'],
        'test_precision': cv_results['test_precision'],
        'test_f1': cv_results['test_f1'],
        'test_roc_auc': cv_results['test_roc_auc'],
    }))

    # save fitted models with best params for testing
    model = model.fit(X_train, y_train)
    models.append(model)

    print('\nbest params:')
    print(model.best_params_)
    best_params.append(model.best_params_)

# DataFrame.append was removed in pandas 2.0; concatenate the per-model
# frames once instead of growing the table inside the loop.
model_results_df = pd.concat(cv_result_frames, ignore_index=True)



In [None]:

# Shorten model names
# (str(type(estimator)) -> readable label)
short_model_names = {
    "<class 'sklearn.linear_model._logistic.LogisticRegression'>": 'Logistic Regression',
    "<class 'sklearn.neighbors._classification.KNeighborsClassifier'>": 'K-Nearest Neighbors',
    "<class 'sklearn.ensemble._forest.RandomForestClassifier'>": 'Random Forest',
}
for class_repr, short_name in short_model_names.items():
    model_results_df.loc[model_results_df['model'] == class_repr, 'model'] = short_name


In [None]:
# CV results table
# (mean of each metric per model, best recall first, rounded to 3 decimals)
(
    model_results_df
    .groupby('model', as_index=False)
    .mean()
    .sort_values('test_recall', ascending=False)
    .round(3)
)


In [None]:

# plot cross validation scoring results

font = dict(size=14)
plt.rc('font', **font)

fig = plt.figure(figsize=(9,6))

# Long format: one row per (model, metric, fold score)
cv_scores_long = model_results_df.melt('model')

sns.boxplot(
    x='variable',
    y='value',
    hue='model',
    data=cv_scores_long,
    width=.7,
    fliersize=0,
    whis=1.5,
    linewidth=.5,
)

# Tick labels follow the metric column order of model_results_df
metric_labels = ['Accuracy', 'Recall', 'Precision', 'F1', 'ROC AUC']
plt.xticks(ticks=np.arange(0, 5), labels=metric_labels)

plt.ylabel('Score')
plt.legend(bbox_to_anchor=(1.01, 1), loc='upper left')
plt.title('K-Fold Cross Validation Metric Results\n');


# Predict on Test Set

In [None]:

# get predictions from models for test set

test_pred_df = pd.DataFrame()
test_pred_df['response'] = y_test
test_prob_df = pd.DataFrame()
test_prob_df['response'] = y_test

model_names = ['LR_prob', 'KNN_prob', 'RF_prob']

model_list_str = [str(type(m)) for m in model_list]

for prob_column, model_label, model in zip(model_names, model_list_str, models):

    # probability of the positive class, and the class at prob_threshold
    y_prob = model.predict_proba(X_test)[:,1]
    y_pred = y_prob > prob_threshold

    test_pred_df[prob_column] = y_prob

    print(model_label)
    print('test accuracy:  {:.2f}'.format(accuracy_score(y_test, y_pred)))
    print('test recall:    {:.2f}'.format(recall_score(y_test, y_pred)))
    print('test precision: {:.2f}'.format(precision_score(y_test, y_pred)))
    # ROC AUC is a ranking metric and needs the scores themselves; passing
    # thresholded labels collapses the curve to a single operating point.
    print('test roc_auc:   {:.2f}'.format(roc_auc_score(y_test, y_prob)))
    print('test f1:        {:.2f}'.format(f1_score(y_test, y_pred)))

    # labels=[1,0] orders the matrix as [[tp, fn], [fp, tn]]
    tp, fn, fp, tn = confusion_matrix(y_test, y_pred, labels=[1,0]).reshape(-1)
    print('\nOutcome values :',
          '\ntp:', tp,
          '\nfn:', fn,
          '\nfp:', fp,
          '\ntn:', tn)
    print('\n------------\n')
    


In [None]:

# predicted class = 1 where probability > prob_threshold, 0 where <= prob_threshold

for prefix in ('LR', 'KNN', 'RF'):
    prob_column, pred_column = prefix + '_prob', prefix + '_pred'
    test_pred_df.loc[test_pred_df[prob_column] > prob_threshold, pred_column] = 1
    test_pred_df.loc[test_pred_df[prob_column] <= prob_threshold, pred_column] = 0


# compute soft voting ensemble probability and prediction
# (mean of the three model probabilities, thresholded like the single models)
member_prob_columns = ['LR_prob', 'KNN_prob', 'RF_prob']
test_pred_df['prob_avg'] = test_pred_df[member_prob_columns].mean(axis=1)

test_pred_df.loc[test_pred_df['prob_avg'] > prob_threshold, 'soft_vote'] = 1
test_pred_df.loc[test_pred_df['prob_avg'] <= prob_threshold, 'soft_vote'] = 0

# compute hard voting ensemble prediction
# (row-wise most frequent class among the three model predictions)
member_pred_columns = ['LR_pred', 'KNN_pred', 'RF_pred']
test_pred_df['hard_vote'] = test_pred_df[member_pred_columns].mode(axis=1)



In [None]:

# categorize prediction outcomes

# Long format: one row per (test location, model) with that model's class prediction
pred_df = (
    test_pred_df
    .drop(columns=['LR_prob', 'KNN_prob', 'RF_prob', 'prob_avg'])
    .melt('response', var_name='model', value_name='pred')
)

# (predicted class, actual class) -> outcome label
outcome_labels = {
    (1, 1): 'True Positive',
    (0, 1): 'False Negative',
    (1, 0): 'False Positive',
    (0, 0): 'True Negative',
}
for (pred_value, response_value), outcome in outcome_labels.items():
    matches = (pred_df['pred'] == pred_value) & (pred_df['response'] == response_value)
    pred_df.loc[matches, 'pred_result'] = outcome



In [None]:

# sum up prediction results in crosstab table
# (rows: model, columns: count of each prediction outcome)
pred_ct = pd.crosstab(pred_df['model'], pred_df['pred_result'])

outcome_columns = ['True Positive', 'True Negative', 'False Positive', 'False Negative']
pred_ct['total'] = pred_ct[outcome_columns].sum(axis=1)
pred_ct['correct'] = pred_ct[['True Positive', 'True Negative']].sum(axis=1)

# Metrics derived from the outcome counts; F1 reuses Recall and Precision
pred_ct = pred_ct.assign(
    Accuracy=lambda ct: ct['correct'] / ct['total'],
    Recall=lambda ct: ct['True Positive'] / (ct['True Positive'] + ct['False Negative']),
    Precision=lambda ct: ct['True Positive'] / (ct['True Positive'] + ct['False Positive']),
    F1=lambda ct: 2 * (ct['Recall'] * ct['Precision']) / (ct['Recall'] + ct['Precision']),
).reset_index()
pred_ct


In [None]:
# Readable label for every prediction column. The two ensemble labels were
# swapped before: 'soft_vote' is the averaged-probability (soft) vote and
# 'hard_vote' is the majority (hard) vote.
model_display_names = {
    'KNN_pred': 'KNN',
    'LR_pred': 'Logistic Regression',
    'RF_pred': 'Random Forest',
    'soft_vote': 'Ensemble - Soft Vote',
    'hard_vote': 'Ensemble - Hard Vote',
}
# Values without an entry in the mapping become NaN
pred_ct['Model name'] = pred_ct['model'].map(model_display_names)

In [None]:

# plot model evaluation results

# Long format: one row per (model, metric) with its score
metric_columns = ['Accuracy', 'Recall', 'Precision', 'F1']
pred_results = pred_ct[['Model name'] + metric_columns].melt(
    'Model name', var_name='Metric', value_name='Score')

fig = plt.figure(figsize=(8,6))

# Grouped bars: one group per metric, one bar per model
ax = sns.barplot(x='Metric', y='Score', hue='Model name', data=pred_results)
ax.set_ylim(0, 1)

ax.legend(bbox_to_anchor=(1.01, 1), loc='upper left')
ax.set_title('Model test score comparisons\n')


plt.show()





In [None]:

# Print ensemble results

# (heading, prediction column) for the soft and the hard voting ensemble
ensemble_votes = (
    ('\nSoft voting:', 'soft_vote'),
    ('\nHard voting:', 'hard_vote'),
)

for heading, vote_column in ensemble_votes:
    print(heading)
    print(classification_report(test_pred_df['response'], test_pred_df[vote_column], labels=[1,0]))

    # labels=[1,0] orders the flattened matrix as tp, fn, fp, tn
    tp, fn, fp, tn = confusion_matrix(test_pred_df['response'], test_pred_df[vote_column], labels=[1,0]).ravel()
    print('\nOutcome values :',
          '\ntp:', tp,
          '\nfn:', fn,
          '\nfp:', fp,
          '\ntn:', tn)

# Predict on Entire dataset

In [None]:

# Run trained models using best params to predict on the entire dataset.
# X contains the training rows as well as the test rows, so the scores below
# are not held-out estimates; they are labelled "full-data" for that reason.

full_pred_df = Xy_aggr.copy()
full_pred_df = full_pred_df[['places','response']]

model_names = ['LR_prob', 'KNN_prob', 'RF_prob']

for prob_column, model in zip(model_names, models):

    # probability of the positive class, and the class at prob_threshold
    y_prob = model.predict_proba(X)[:,1]
    y_pred = y_prob > prob_threshold

    full_pred_df[prob_column] = y_prob

    print(prob_column)
    print('full-data accuracy:  {:.2f}'.format(accuracy_score(y, y_pred)))
    print('full-data recall:    {:.2f}'.format(recall_score(y, y_pred)))
    print('full-data precision: {:.2f}'.format(precision_score(y, y_pred)))
    # ROC AUC is a ranking metric and needs the scores themselves; passing
    # thresholded labels collapses the curve to a single operating point.
    print('full-data roc_auc:   {:.2f}'.format(roc_auc_score(y, y_prob)))
    print('full-data f1:        {:.2f}'.format(f1_score(y, y_pred)))

    # labels=[1,0] orders the matrix as [[tp, fn], [fp, tn]]
    tp, fn, fp, tn = confusion_matrix(y, y_pred, labels=[1,0]).reshape(-1)
    print('\nOutcome values :',
          '\ntp:', tp,
          '\nfn:', fn,
          '\nfp:', fp,
          '\ntn:', tn)
    print('\n------------\n')
    


In [None]:

# predicted class = 1 where probability > prob_threshold, 0 where <= prob_threshold

for prefix in ('LR', 'KNN', 'RF'):
    prob_column, pred_column = prefix + '_prob', prefix + '_pred'
    full_pred_df.loc[full_pred_df[prob_column] > prob_threshold, pred_column] = 1
    full_pred_df.loc[full_pred_df[prob_column] <= prob_threshold, pred_column] = 0


# compute soft voting ensemble probability and prediction
# (mean of the three model probabilities, thresholded like the single models)
member_prob_columns = ['LR_prob', 'KNN_prob', 'RF_prob']
full_pred_df['prob_avg'] = full_pred_df[member_prob_columns].mean(axis=1)
full_pred_df.loc[full_pred_df['prob_avg'] > prob_threshold, 'soft_vote'] = 1
full_pred_df.loc[full_pred_df['prob_avg'] <= prob_threshold, 'soft_vote'] = 0

# compute hard voting ensemble prediction
# (row-wise most frequent class; the same 0/1 value is also stored as its "probability")
member_pred_columns = ['LR_pred', 'KNN_pred', 'RF_pred']
full_pred_df['hard_vote'] = full_pred_df[member_pred_columns].mode(axis=1)
full_pred_df['hard_vote_prob'] = full_pred_df['hard_vote']


In [None]:

# select best model to use for final predictions
#
# Available (prediction column, probability column) pairs in full_pred_df:
#   ('LR_pred',   'LR_prob')
#   ('KNN_pred',  'KNN_prob')
#   ('RF_pred',   'RF_prob')          <- selected
#   ('soft_vote', 'prob_avg')
#   ('hard_vote', 'hard_vote_prob')
final_model_pred, final_model_prob = 'RF_pred', 'RF_prob'


In [None]:

# categorize prediction outcomes

# Keep the selected model's columns under the generic names 'pred' / 'prob'
pred_df = (
    full_pred_df[['places', 'response', final_model_pred, final_model_prob]]
    .rename(columns={final_model_pred: 'pred', final_model_prob: 'prob'})
)

# (predicted class, actual class) -> outcome label
outcome_labels = {
    (1, 1): 'True Positive',
    (0, 1): 'False Negative',
    (1, 0): 'False Positive',
    (0, 0): 'True Negative',
}
for (pred_value, response_value), outcome in outcome_labels.items():
    matches = (pred_df['pred'] == pred_value) & (pred_df['response'] == response_value)
    pred_df.loc[matches, 'pred_result'] = outcome


In [None]:
# Text report of per-class precision / recall / F1 comparing pred_df['pred']
# with pred_df['response']
print(classification_report(pred_df['response'], pred_df['pred']))

In [None]:

# plot prediction outcome totals

ax = sns.countplot(data=pred_df, x='pred_result')
# Detect Axes.bar_label directly rather than comparing a float version
# number: a float reads matplotlib 3.10 as 3.1, which would skip the labels.
if hasattr(ax, 'bar_label'):
  ax.bar_label(ax.containers[0])
plt.ylim(0,40)
plt.xticks(rotation=45, rotation_mode='anchor', ha='right')
plt.xlabel('')
plt.title('Prediction Outcomes for Full Dataset\n')
plt.show()


In [None]:

# plot prediction results

# Order locations by predicted probability; the new 0..n-1 index is the x axis
pred_df = pred_df.sort_values('prob', ascending=True)
pred_df = pred_df.reset_index(drop=True)

font = {'size'   : 14}
plt.rc('font', **font)

fig = plt.figure(figsize=(8,6))

pal = {'True Positive': 'green',
       'False Positive':'red',
       'True Negative': 'blue',
       'False Negative':'#fd8d3c'}

# x and y are passed by keyword: seaborn 0.12+ no longer accepts them as
# positional arguments.
sns.scatterplot(x=pred_df.index, y=pred_df['prob'], hue=pred_df['pred_result'], palette=pal)

plt.ylim(0,1)
plt.yticks(np.arange(0, 1.1, .1))
plt.grid(which='major', axis='y')

plt.xlabel('Locations - ordered index')
plt.ylabel('Predicted Probability')
plt.title('Model prediction outcomes for all locations')
plt.legend(bbox_to_anchor=(1.01, 1), loc='upper left');




# Model Analysis

In [None]:

# get RF importance rankings for features

# This cell uses its own names for the resampled training data and split
# size. Reusing X_train / y_train / train_size here would overwrite the
# tuning split that other cells still read.
importance_train_size = .6
n_loops = 20

# Seeded so the importance ranking is reproducible between runs
clf = RandomForestClassifier(random_state=seed)
importance_runs = []

for i in range(n_loops):

    # a different, but repeatable, split for every loop
    X_imp_train, X_val, y_imp_train, y_val = train_test_split(X,
                                                              y,
                                                              train_size=importance_train_size,
                                                              random_state=seed + i)

    scaler = StandardScaler()
    clf = clf.fit(scaler.fit_transform(X_imp_train), y_imp_train)
    importance_runs.append(pd.Series(clf.feature_importances_))

# One column per loop, one row per feature (in the column order of X)
importance_df = pd.concat(importance_runs, axis=1)

importance_df['mean_importance'] = importance_df.mean(axis=1)
importance_df['feature'] = X.columns
importance_df["rank"] = importance_df['mean_importance'].rank(ascending = False)
# Rank descending puts the least important feature first, so barh draws the
# most important feature at the top.
importance_df = importance_df[['feature','mean_importance', 'rank']].sort_values('rank', axis=0, ascending=False)
importance_df = importance_df.reset_index(drop=True)

plt.figure(figsize=(8, 16))
plt.barh(importance_df['feature'],
         importance_df['mean_importance'])
plt.title('mean importance value from RF')
plt.show()



# Choropleth Maps


In [None]:

# Read the zipped shapefile into a GeoDataFrame (cbg_gdf).
cbg_shapefile_path = '../../data/tl_2010_25_bg10.zip'
cbg_gdf = gpd.read_file(cbg_shapefile_path)


In [None]:

# Build the "places"-level geometry: the place key is the first 7 characters of
# GEOID10, and all rows sharing a key are dissolved into one row.
place_keys = cbg_gdf['GEOID10'].str.slice(stop=7)
cbg_gdf['places'] = place_keys

places_gdf = cbg_gdf.dissolve(by='places')
# dissolve() moves 'places' into the index; bring it back as a regular column
places_gdf = places_gdf.reset_index()


In [None]:

# add location data to predictions

# explicit copy, so the new 'train_test' column is written to this frame and not
# to a slice of Xy_aggr (avoids chained-assignment / SettingWithCopy problems)
model_locations = Xy_aggr[['places']].copy()

# label each location with the split it belongs to; rows whose index is in
# neither X_train nor X_test keep NaN
model_locations.loc[X_train.index, 'train_test'] = 'Training Set'
model_locations.loc[X_test.index, 'train_test'] = 'Test Set'

# the merge key is compared as a string on both sides
model_locations['places'] = model_locations['places'].astype(str)
pred_df['places'] = pred_df['places'].astype(str)

# Drop any 'train_test*' columns already present in pred_df before merging, so the
# merged frame always ends up with exactly one 'train_test' column (the one from
# model_locations) and the cell can be re-run without suffix collisions.
pred_df = pred_df.drop(columns=['train_test', 'train_test_x', 'train_test_y'],
                       errors='ignore')
pred_df = pred_df.merge(model_locations, on='places')


In [None]:

# categorize rows that contain a known PFAS release
# (True Positive / False Negative rows are the ones where a release is documented)
pred_df.loc[pred_df['pred_result'].isin(['True Positive', 'False Negative']), 'Release'] = True
pred_df.loc[pred_df['pred_result'].isin(['True Negative', 'False Positive']), 'Release'] = False

# human-readable outcome label for each confusion-matrix category
pred_df.loc[pred_df['pred_result'] == 'True Positive', 'Prediction Outcome'] = 'Release(s) Occurred:  Correctly Predicted'
pred_df.loc[pred_df['pred_result'] == 'False Positive', 'Prediction Outcome'] = 'No Documented Release:  Higher Risk Predicted'
pred_df.loc[pred_df['pred_result'] == 'True Negative', 'Prediction Outcome'] = 'No Documented Release:  Lower Risk Predicted'
pred_df.loc[pred_df['pred_result'] == 'False Negative', 'Prediction Outcome'] = 'Release(s) Occurred:  Model Failed to Predict'

# combined split + release label
# (fix: the two "No Documented Release" labels were previously swapped, tagging
#  Training Set rows as "Test Locations" and Test Set rows as "Training Locations")
pred_df.loc[(pred_df['Release'] == True) & (pred_df['train_test'] == 'Training Set'), 'train_test_release'] = 'Training Locations: Release(s) Occurred'
pred_df.loc[(pred_df['Release'] == True) & (pred_df['train_test'] == 'Test Set'), 'train_test_release'] = 'Test Locations: Release(s) Occurred'
pred_df.loc[(pred_df['Release'] == False) & (pred_df['train_test'] == 'Training Set'), 'train_test_release'] = 'Training Locations: No Documented Release'
pred_df.loc[(pred_df['Release'] == False) & (pred_df['train_test'] == 'Test Set'), 'train_test_release'] = 'Test Locations: No Documented Release'


# merge prediction data with gdf
# NOTE(review): places_gdf is overwritten with the merged frame, so running this
# cell a second time merges the prediction columns in again.
places_gdf = places_gdf.merge(pred_df, on='places')



In [None]:

# Write the merged places GeoDataFrame (as a shapefile) and the prediction table
# (as CSV) to the current working directory.
places_shapefile_out = 'Release risk model area places geodataframe.shp'
predictions_csv_out = 'Release Risk model predictions data.csv'

places_gdf.to_file(places_shapefile_out)
pred_df.to_csv(predictions_csv_out)


In [None]:

# Load the geocoded locations of documented PFAS releases.
pfas_sites_path = '../../data/disposal_sites/PFAS_Sites_2021-11-07_geocoded.parquet'
ds_locations = pd.read_parquet(pfas_sites_path)

# The row with index label 16 is removed; per the original note this is because of
# a mismatch between ds_locations and places_gdf.
ds_locations = ds_locations.drop(index=16)


In [None]:

# Point map of the documented PFAS release locations (one marker per row of
# ds_locations), with site details available on hover.

site_hover_columns = ['RTN','Town','Site_Name','Address','Notif_Date','Disposition','Chemical']

fig = px.scatter_mapbox(
    ds_locations,
    lat='lat',
    lon='lon',
    hover_data=site_hover_columns,
    mapbox_style='carto-positron',
    zoom=7.4,
    center=dict(lat=42.05, lon=-71.6),
    opacity=1,
    title='Locations of Documented PFAS Release',
)

fig.update_traces(marker=dict(size=8, color='#b2182b'))

# fixed-size figure with room at the top for the title
fig.update_layout(
    autosize=False,
    margin=dict(r=0, t=30, l=0, b=0),
    height=700,
    width=1000,
)

# staticPlot=False keeps the interactive features (hover, scroll, zoom) enabled
config = dict(staticPlot=False)
fig.show(config=config)

# the html copy is written only when the figure is shown interactively
if not config['staticPlot']:
    fig.write_html("PFAS release locations scatterplot map.html")



In [None]:

# Choropleth of all modeled areas colored by the 'Release' flag, with the
# documented release sites drawn on top as black points.

release_colors = {True: '#b2182b',    # dark red
                  False: 'lightgray'}

# shared basemap and viewport (alternative basemap: mapbox_style='white-bg')
map_view = dict(mapbox_style='carto-positron',
                zoom=7.4,
                center=dict(lat=42.05, lon=-71.6))

fig = px.choropleth_mapbox(
    places_gdf,
    geojson=places_gdf['geometry'],
    locations=places_gdf.index,
    color="Release",
    color_discrete_map=release_colors,
    opacity=0.7,
    title='Areas Where PFAS Releases Have Occurred',
    **map_view,
)

# point layer: built as its own figure, then its single trace is added to the map
fig2 = px.scatter_mapbox(
    ds_locations,
    lat='lat',
    lon='lon',
    hover_data=[ds_locations.index, 'RTN', 'Site_Name', 'Town'],
    mapbox_style='carto-positron',
).update_traces(marker=dict(size=8, color='black'))
fig.add_trace(fig2.data[0])

# figure size and margins
fig.update_layout(
    autosize=False,
    margin=dict(r=0, t=30, l=0, b=0),
    height=700,
    width=1000,
)

# legend title text, 18pt fonts, horizontal legend
fig.update_layout(
    {'legend_title_text': 'Release Occurrence'},
    title={'font': {'size': 18}},
    legend={'font': {'size': 18}, 'orientation': 'h'},
    legend_title={'font': {'size': 18}},
)

# set to False to enable interactive features: hover, scroll, zoom.
config = dict(staticPlot=True)
fig.show(config=config)


In [None]:

# Map of the areas labelled 'Training Set', colored by the 'Release' flag, with
# the documented release sites overlaid as black points.

is_training = places_gdf['train_test'].eq('Training Set')
temp_gdf = places_gdf.loc[is_training]

release_colors = {True: '#b2182b', False: 'lightgray'}

# shared basemap and viewport (alternative basemap: mapbox_style='white-bg')
map_view = dict(mapbox_style='carto-positron',
                zoom=7.4,
                center=dict(lat=42.05, lon=-71.6))

fig = px.choropleth_mapbox(
    temp_gdf,
    geojson=temp_gdf['geometry'],
    locations=temp_gdf.index,
    color="Release",
    color_discrete_map=release_colors,
    opacity=0.7,
    title='Model Training/Validation Regions',
    **map_view,
)

# point layer: built as its own figure, then its single trace is added to the map
fig2 = px.scatter_mapbox(
    ds_locations,
    lat='lat',
    lon='lon',
    mapbox_style='carto-positron',
).update_traces(marker=dict(size=8, color='black'))
fig.add_trace(fig2.data[0])

# figure size and margins
fig.update_layout(
    autosize=False,
    margin=dict(r=0, t=30, l=0, b=0),
    height=700,
    width=1000,
)

# legend title text, 18pt fonts, horizontal legend
fig.update_layout(
    {'legend_title_text': 'Documented PFAS Release Occurred'},
    title={'font': {'size': 18}},
    legend={'font': {'size': 18}, 'orientation': 'h'},
    legend_title={'font': {'size': 18}},
)

# set to False to enable interactive features: hover, scroll, zoom.
config = dict(staticPlot=True)
fig.show(config=config)


In [None]:

# Map of the areas labelled 'Test Set', colored by the 'Release' flag, with the
# documented release sites overlaid as black points.

is_test = places_gdf['train_test'].eq('Test Set')
temp_gdf = places_gdf.loc[is_test]

release_colors = {True: '#b2182b', False: 'lightgray'}

# shared basemap and viewport (alternative basemap: mapbox_style='white-bg')
map_view = dict(mapbox_style='carto-positron',
                zoom=7.4,
                center=dict(lat=42.05, lon=-71.6))

fig = px.choropleth_mapbox(
    temp_gdf,
    geojson=temp_gdf['geometry'],
    locations=temp_gdf.index,
    color="Release",
    color_discrete_map=release_colors,
    opacity=0.7,
    title='Model Testing Regions',
    **map_view,
)

# point layer: built as its own figure, then its single trace is added to the map
fig2 = px.scatter_mapbox(
    ds_locations,
    lat='lat',
    lon='lon',
    mapbox_style='carto-positron',
).update_traces(marker=dict(size=6, color='black'))
fig.add_trace(fig2.data[0])

# figure size and margins
fig.update_layout(
    autosize=False,
    margin=dict(r=0, t=30, l=0, b=0),
    height=700,
    width=1000,
)

# legend title text (with an explicit line break), 18pt fonts, horizontal legend
fig.update_layout(
    {'legend_title_text': 'Documented PFAS<br>Release Occurred'},
    title={'font': {'size': 18}},
    legend={'font': {'size': 18}, 'orientation': 'h'},
    legend_title={'font': {'size': 18}},
)

# set to False to enable interactive features: hover, scroll, zoom.
config = dict(staticPlot=True)
fig.show(config=config)


In [None]:

# Prediction outcomes for the areas where 'Release' is True, colored by the
# 'Prediction Outcome' label, with the documented release sites overlaid.

has_release = places_gdf['Release'].eq(True)
temp_gdf = places_gdf.loc[has_release]

outcome_colors = {
    'Release(s) Occurred:  Correctly Predicted': '#b2182b',
    'No Documented Release:  Higher Risk Predicted': '#b2182b',
    'No Documented Release:  Lower Risk Predicted': '#bababa',
    'Release(s) Occurred:  Model Failed to Predict': '#404040',
}

# shared basemap and viewport (alternative basemap: mapbox_style='white-bg')
map_view = dict(mapbox_style='carto-positron',
                zoom=7.4,
                center=dict(lat=42.05, lon=-71.6))

fig = px.choropleth_mapbox(
    temp_gdf,
    geojson=temp_gdf['geometry'],
    locations=temp_gdf.index,
    color="Prediction Outcome",
    labels={'Prediction Outcome': 'Model Prediction Outcome'},
    color_discrete_map=outcome_colors,
    opacity=0.7,
    title='Model Prediction Outcomes:  Areas of Documented PFAS Release',
    **map_view,
)

# point layer: built as its own figure, then its single trace is added to the map
fig2 = px.scatter_mapbox(
    ds_locations,
    lat='lat',
    lon='lon',
    mapbox_style='carto-positron',
).update_traces(marker=dict(size=6, color='black'))
fig.add_trace(fig2.data[0])

# figure size and margins
fig.update_layout(
    autosize=False,
    margin=dict(r=0, t=30, l=0, b=0),
    height=700,
    width=1000,
)

# empty legend title, 18pt fonts, horizontal legend
fig.update_layout(
    {'legend_title_text': ''},
    title={'font': {'size': 18}},
    legend={'font': {'size': 18}, 'orientation': 'h'},
    legend_title={'font': {'size': 18}},
)

# set to False to enable interactive features: hover, scroll, zoom.
config = dict(staticPlot=True)
fig.show(config=config)


In [None]:

# Prediction outcomes for the areas where 'Release' is False, colored by the
# 'Prediction Outcome' label.
# Note: the release-site point overlay (scatter_mapbox of ds_locations) is not
# drawn on this map.

no_release = places_gdf['Release'].eq(False)
temp_gdf = places_gdf.loc[no_release]

outcome_colors = {
    'Release(s) Occurred:  Correctly Predicted': '#ca0020',
    'No Documented Release:  Higher Risk Predicted': '#feb24c',
    'No Documented Release:  Lower Risk Predicted': '#bababa',
    'Release(s) Occurred:  Model Failed to Predict': '#404040',
}

# shared basemap and viewport (alternative basemap: mapbox_style='white-bg')
map_view = dict(mapbox_style='carto-positron',
                zoom=7.4,
                center=dict(lat=42.05, lon=-71.6))

fig = px.choropleth_mapbox(
    temp_gdf,
    geojson=temp_gdf['geometry'],
    locations=temp_gdf.index,
    color="Prediction Outcome",
    labels={'Prediction Outcome': 'Model Prediction Outcome'},
    color_discrete_map=outcome_colors,
    opacity=0.9,
    title='Model Prediction Outcomes:  Areas Without Documented PFAS Release',
    **map_view,
)

# figure size and margins
fig.update_layout(
    autosize=False,
    margin=dict(r=0, t=30, l=0, b=0),
    height=700,
    width=1000,
)

# empty legend title, 18pt fonts, horizontal legend
fig.update_layout(
    {'legend_title_text': ''},
    title={'font': {'size': 18}},
    legend={'font': {'size': 18}, 'orientation': 'h'},
    legend_title={'font': {'size': 18}},
)

# set to False to enable interactive features: hover, scroll, zoom.
config = dict(staticPlot=True)
fig.show(config=config)


In [None]:

# plot choropleth of predicted probability of release
#
# Every area in places_gdf is shaded by its predicted probability ('prob'), with
# the documented release sites overlaid as black points. The figure is also
# written to an html file at the end of the cell.
# (The former `temp_gdf = ...` assignment was never used in this cell and has
# been removed.)

# round in place so the map and hover text show two decimals
places_gdf['prob'] = places_gdf['prob'].round(2)

fig = px.choropleth_mapbox(
                  places_gdf,
                  geojson=places_gdf['geometry'],
                  locations=places_gdf.index,
                  color="prob",
                  hover_data=['Release','Prediction Outcome', 'prob'],
                  color_continuous_scale="reds",
                  mapbox_style='carto-positron',
                  # mapbox_style='white-bg',
                  zoom=7.4,
                  center={'lat': 42.05, 'lon': -71.6},
                  opacity=0.7,
                  title='PFAS Release Risk: Predicted Probability'
              )

# point layer: built as its own figure, then its single trace is added to the map
fig2 = px.scatter_mapbox(
                  ds_locations,
                  lat='lat', lon='lon',
                  mapbox_style='carto-positron',
              )
fig2.update_traces(marker={'size': 6, 'color':'black'})
fig.add_trace(fig2.data[0])

# 18pt fonts, horizontal legend
fig.update_layout(
    title=dict(font = dict(size = 18)),
    legend=dict(font = dict(size = 18),
                      orientation='h'),
    legend_title=dict(font = dict(size = 18)))

# figure size, margins and colorbar title
fig.update_layout(
    autosize=False,
    margin={"r":0,"t":30,"l":0,"b":0},
    height=700,
    width=1100,
    coloraxis_colorbar={
        'title':'Predicted<br>Probability'})



# set to False to enable interactive features: hover, scroll, zoom.
config = {'staticPlot': True}
fig.show(config=config)


# the html copy is written regardless of the staticPlot setting above
fig.write_html("Release Risk Model Risk Probability Map.html")



# Visualizations for presenting

In [None]:

# plot prediction results (presentation version): predicted probability of every
# location, ordered from lowest to highest and colored by prediction outcome label

# sorted copy of pred_df; after reset_index the 0..n-1 index is the x position
plot_df = pred_df.sort_values('prob', ascending=True)
plot_df = plot_df.reset_index(drop=True)

font = {'size'   : 18}
plt.rc('font', **font)

fig = plt.figure(figsize=(8,6))

pal = {'Release(s) Occurred:  Correctly Predicted': 'green',
       'No Documented Release:  Higher Risk Predicted':'red',
       'No Documented Release:  Lower Risk Predicted': '#bababa',
       'Release(s) Occurred:  Model Failed to Predict':'black'}

# dashed reference line at probability 0.5
plt.axhline(.5, linestyle='--', linewidth=1, color='green', alpha=.6)

# x and y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally,
# so the former positional call raises TypeError on an upgraded seaborn
sns.scatterplot(x=plot_df.index, y=plot_df['prob'], hue=plot_df['Prediction Outcome'], palette=pal, s=60, alpha=.8)

plt.ylim(0, 1)
plt.yticks(np.arange(0, 1.1, .1))
plt.grid(which='major', axis='y', linewidth=.3)

plt.xlabel('Locations - ordered index')
plt.ylabel('Predicted Probability')
plt.title('Model Prediction Outcomes')
# plt.legend(bbox_to_anchor=(1.01, 1), loc='upper left', markerscale=1.5);  # right side
plt.legend(bbox_to_anchor=(-0.03, -.2), loc='upper left', markerscale=1.5); # below chart



In [None]:


# Horizontal bar chart with the number of rows of pred_df in each
# 'Prediction Outcome' category, drawn in a fixed category order.

outcome_order = ['Release(s) Occurred:  Correctly Predicted',
                 'No Documented Release:  Higher Risk Predicted',
                 'No Documented Release:  Lower Risk Predicted',
                 'Release(s) Occurred:  Model Failed to Predict']

# one color per category, in the same order as outcome_order
# ('#bababa' is a light gray)
outcome_bar_colors = ['green', 'red', '#bababa', 'black']
pal = dict(zip(outcome_order, outcome_bar_colors))

# plot prediction outcome totals
ax = sns.countplot(data=pred_df,
                   y='Prediction Outcome',
                   order=outcome_order,
                   palette=pal)

# annotate the bars of the first container with their counts
ax.bar_label(ax.containers[0])
ax.set_xlim(0, 30)
ax.set_ylabel('')
ax.set_title('')
plt.show()

