In [None]:
# Import necessary libraries

#! pip install "pycaret[full]"
import pycaret
#! pip install pandas
from pycaret.regression import *
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the raw database and keep only the parameters analysed in this notebook
selected_columns = [
    'Micro Pollutant',
    'MP Category',
    'Initial Concentration of MP (mg/L)',
    'Compound MW (g/mol)',
    'Compound Charge',
    'Initial FS pH',
    'Initial DS pH',
    'Type of MB',
    'MB Contact Angle (°)',
    'MB Zeta Potential (mV)',
    'Draw Solution',
    'DS MW (g mol-1)',
    'DS Concentration (M)',
    'Operating Time (h)',
    'Cross Flow Velocity (cm/s)',
    'Water Flux, Jw (LMH)',
    'Reverse Salt Flux, Js (gMH)',
    'Temperature (⁰C)',
    'Removal Rate (%)',
]
df = pd.read_csv("Database.csv")[selected_columns]
# Bare expression so the notebook renders the selected frame
df

In [None]:
# Exploratory Data Analysis (EDA)
# Summary statistics of the selected dataset, rendered as the cell output
df.describe()

In [None]:
# EDA: number of records per micro-pollutant category, most frequent first
sns.set(font_scale=3)
category_order = df['MP Category'].value_counts().sort_values(ascending=False).index

# catplot is a figure-level function: it creates its own figure (sized through
# height/aspect) and returns a FacetGrid rather than an Axes.
grid = sns.catplot(x='MP Category', kind="count", data=df, order=category_order, height=10, aspect=2)

plt.xticks(rotation=90)
plt.yticks(rotation=90)
# No plt.figure(...) call here: after the plot has been drawn it would only open
# a second, empty 25x25 figure below the chart.
plt.show()


In [None]:
# Initialise the PyCaret regression experiment with water flux (Jw) as the target.
# session_id is fixed so repeated runs use the same seed.
setup(data=df, target='Water Flux, Jw (LMH)', session_id=123)

In [None]:
# Plotting correlation between parameters in dataset
# Correlate only the numeric/boolean columns. The frame also carries categorical
# columns (presumably text, e.g. 'MP Category'); pandas >= 2.0 no longer drops such
# columns inside DataFrame.corr() and raises instead, so they are filtered out here.
fig, ax = plt.subplots(figsize=(25, 25))
my_correlation = df.select_dtypes(include=['number', 'bool']).corr()
sns.heatmap(my_correlation, annot=True, cmap='coolwarm', annot_kws={"size": 16}, ax=ax)
ax.tick_params(axis='x', labelsize=20, labelrotation=90)
ax.tick_params(axis='y', labelsize=20, labelrotation=0)
# The colorbar drawn by the heatmap is the last axes added to the figure
cbar = fig.axes[-1]
cbar.tick_params(labelsize=16)


In [None]:
# Refining the dataset: removing inputs with very high correlation
# ('Initial DS pH' and 'MB Zeta Potential (mV)' are left out of the selection)
df = df[[
    'Micro Pollutant',
    'MP Category',
    'Initial Concentration of MP (mg/L)',
    'Compound MW (g/mol)',
    'Compound Charge',
    'Initial FS pH',
    'Type of MB',
    'MB Contact Angle (°)',
    'Draw Solution',
    'DS MW (g mol-1)',
    'DS Concentration (M)',
    'Operating Time (h)',
    'Cross Flow Velocity (cm/s)',
    'Water Flux, Jw (LMH)',
    'Reverse Salt Flux, Js (gMH)',
    'Temperature (⁰C)',
    'Removal Rate (%)',
]]

# Re-initialise the experiment on the refined frame. The setup() call earlier in the
# notebook was given the unrefined frame, so without this the Jw models compared
# below would still be trained with the two removed inputs.
setup(data=df, target='Water Flux, Jw (LMH)', session_id=123)



In [None]:
# Plotting correlation between parameters in modified dataset
# Correlate only the numeric/boolean columns. The frame also carries categorical
# columns (presumably text, e.g. 'MP Category'); pandas >= 2.0 no longer drops such
# columns inside DataFrame.corr() and raises instead, so they are filtered out here.
fig, ax = plt.subplots(figsize=(25, 25))
my_correlation = df.select_dtypes(include=['number', 'bool']).corr()
sns.heatmap(my_correlation, annot=True, cmap='coolwarm', annot_kws={"size": 16}, ax=ax)
ax.tick_params(axis='x', labelsize=20, labelrotation=90)
ax.tick_params(axis='y', labelsize=20, labelrotation=0)
# The colorbar drawn by the heatmap is the last axes added to the figure
cbar = fig.axes[-1]
cbar.tick_params(labelsize=16)

In [None]:
# One-hot encoding categorical parameters for ML
# Not needed as PyCaret takes care of it
#MP=pd.get_dummies(df['Micro Pollutant'])
#MP_Cat=pd.get_dummies(df['MP Category'])
#MB=pd.get_dummies(df['Type of MB'])
#DS=pd.get_dummies(df['Draw Solution'])
#df = pd.concat([df,MP,MP_Cat, MB,DS],axis=1)
#df.to_csv('modified_dataset.csv')

In [None]:
# AutoML step: let PyCaret train and rank every available regression model
# on the current experiment; the call's return value is kept in `best`.
best = compare_models()

In [None]:
# Save the table to an Excel file
#table = pull()  # compare_models() returns an estimator; pull() returns the displayed score grid as a DataFrame
#table.to_excel('model_performance.xlsx', index=False)

In [None]:
# Creating the ML model based on best models compared above: Gradient Boosting Regressor
# 'gbr' is the PyCaret model ID; the fitted estimator is reused by the tuning and plotting cells
gbr = create_model(estimator='gbr')

In [None]:
# Tune the GBR hyper-parameters over 50 iterations.
# choose_better=True is passed so the untuned model is returned if tuning does not improve it.
tuned_gbr = tune_model(estimator=gbr, n_iter=50, choose_better=True)

In [None]:
# Saving the GBR model to use in the app.
# tuned_gbr is persisted rather than gbr: it comes from tune_model(..., choose_better=True),
# so it is the tuned model when tuning helped and the original model otherwise.
# Saving gbr would discard the tuning step. The file name is unchanged.
save_model(tuned_gbr, model_name='GradientBoostingRegressor')

In [None]:
# Prediction accuracy of GBR: residuals plot (PyCaret's default plot type, spelled out)
plot_model(estimator=gbr, plot='residual')

In [None]:
# Prediction error plot of GBR
plot_model(estimator=gbr, plot='error')

In [None]:
 # Creating the ML model based on best models compared above: Extreme Gradient Boosting 

# 'xgboost' is the PyCaret model ID; the fitted estimator is kept in `xgb`
xgb = create_model(estimator='xgboost')

In [None]:
# Prediction accuracy of XGB: residuals plot first, then the prediction error plot
for plot_kind in ('residual', 'error'):
    plot_model(estimator=xgb, plot=plot_kind)

In [None]:
# Model interpretation for XGB (summary plot, PyCaret's default, spelled out)
interpret_model(estimator=xgb, plot='summary')

In [None]:
# Random Forest, chosen from the models compared above: fit it ...
rf = create_model(estimator='rf')

# ... then plot its prediction accuracy (residuals, followed by prediction error)
for plot_kind in ('residual', 'error'):
    plot_model(estimator=rf, plot=plot_kind)

In [None]:
# Feature importance as seen by the GBR model
plot_model(estimator=gbr, plot='feature')


In [None]:
# Random Forest: interpretation summary plot, then its feature importance plot
interpret_model(estimator=rf, plot='summary')
plot_model(estimator=rf, plot='feature')

In [None]:
# SHAP values (impact on model output) for the XGB model
interpret_model(estimator=xgb, plot='summary')

In [None]:
# Start a second experiment with the removal rate as the prediction target
setup(data=df, target='Removal Rate (%)', session_id=123)

# Optional arguments that can be passed to setup():
#   remove_outliers = True
#   feature_selection = True

In [None]:
# Plotting correlation between parameters in modified dataset
# Correlate only the numeric/boolean columns. The frame also carries categorical
# columns (presumably text, e.g. 'MP Category'); pandas >= 2.0 no longer drops such
# columns inside DataFrame.corr() and raises instead, so they are filtered out here.
fig, ax = plt.subplots(figsize=(25, 25))
my_correlation = df.select_dtypes(include=['number', 'bool']).corr()
sns.heatmap(my_correlation, annot=True, cmap='coolwarm', annot_kws={"size": 16}, ax=ax)
ax.tick_params(axis='x', labelsize=20, labelrotation=90)
ax.tick_params(axis='y', labelsize=20, labelrotation=0)
# The colorbar drawn by the heatmap is the last axes added to the figure
cbar = fig.axes[-1]
cbar.tick_params(labelsize=16)

In [None]:
# AutoML step for the removal-rate experiment: train and rank every available
# regression model; the call's return value is kept in `best`.
best = compare_models()

In [None]:
# Creating the ML model based on best models compared above: Gradient Boosting Regressor
# 'gbr' is the PyCaret model ID; this rebinds `gbr` to the model fitted in the current experiment
gbr = create_model(estimator='gbr')

In [None]:
# Tune the GBR hyper-parameters over 50 iterations.
# choose_better=True is passed so the untuned model is returned if tuning does not improve it.
tuned_gbr = tune_model(estimator=gbr, n_iter=50, choose_better=True)

In [None]:
# Prediction accuracy of the tuned GBR: residuals plot (PyCaret's default plot type, spelled out)
plot_model(estimator=tuned_gbr, plot='residual')

In [None]:
# Prediction error plot of the tuned GBR
plot_model(estimator=tuned_gbr, plot='error')

In [None]:
# Disabled: this cell cannot run as written.
#  - 'auc' is a classification plot; pycaret.regression.plot_model does not offer it.
#  - tuned_xgb is only created further down (by tune_model(xgb, ...)), so on a fresh
#    kernel the name is still undefined at this point.
# The residual and prediction-error plots of the tuned XGB model are produced in the
# cells that follow its tuning.
# plot_model(tuned_xgb, plot = 'auc')

In [None]:
 # Creating the ML model based on best models compared above: Extreme Gradient Boosting 

# 'xgboost' is the PyCaret model ID; this rebinds `xgb` to the model fitted in the current experiment
xgb = create_model(estimator='xgboost')

In [None]:
# Tune the XGB hyper-parameters over 50 iterations.
# choose_better=True is passed so the untuned model is returned if tuning does not improve it.
tuned_xgb = tune_model(estimator=xgb, n_iter=50, choose_better=True)

In [None]:
# Prediction accuracy of the tuned XGB: residuals plot (PyCaret's default plot type, spelled out)
plot_model(estimator=tuned_xgb, plot='residual')

In [None]:
# Prediction error plot of the tuned XGB model
plot_model(estimator=tuned_xgb, plot='error')

In [None]:
# SHAP values (impact on model output) for XGB; note this uses the untuned `xgb` model
interpret_model(estimator=xgb, plot='summary')

In [None]:
# Random Forest, chosen from the models compared above ('rf' is the PyCaret model ID)
rf = create_model(estimator='rf')

In [None]:
# Prediction accuracy of the untuned XGB: residuals plot (PyCaret's default plot type, spelled out)
plot_model(estimator=xgb, plot='residual')

In [None]:
# Prediction accuracy of RF: residuals plot (PyCaret's default plot type, spelled out)
plot_model(estimator=rf, plot='residual')

In [None]:
# Prediction error plot of the RF model
plot_model(estimator=rf, plot='error')

In [None]:
# SHAP values (impact on model output) for the RF model
interpret_model(estimator=rf, plot='summary')

In [None]:
# Support vector model: requested through the PyCaret model ID 'svm', kept as `svr`
svr = create_model(estimator='svm')

In [None]:
# Prediction accuracy of SVR: residuals plot (PyCaret's default plot type, spelled out)
plot_model(estimator=svr, plot='residual')

In [None]:
# Prediction error plot of the SVR model
plot_model(estimator=svr, plot='error')

In [None]:
# Feature importance as seen by the (untuned) GBR model
plot_model(estimator=gbr, plot='feature')

In [None]:
# Feature importance as seen by the RF model
plot_model(estimator=rf, plot='feature')

In [None]:
#evaluate_model(xgb)
