In [19]:
import pandas as pd

import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer


df = pd.read_excel('PFAS_Data.xlsx')
t1 = []

labels = []
sources =  []


features = ['Well depth', 'Farms', 'Farmland', 'PP_1km', 'PP_4km', 'PP_12km','AFFF', 'KP_50km', 'Nearest_KP', 'AFF_Acc', 'Basin', 'KP_Name', 'AFFF_Con']
for i in range(0, 130):



    labels.append(int(df.loc[i]['Split']))
    sources.append(df.loc[i]['County'])


    numeric_row = pd.to_numeric(df.loc[i].tolist()[2:15], errors='coerce')

    t1.append(numeric_row.tolist())

print(df['Split'].value_counts())
print(len(t1))

Split
1    65
0    65
Name: count, dtype: int64
130


In [2]:
from sklearn .metrics import roc_auc_score, f1_score, recall_score, precision_score

In [7]:

X_train, X_test, y_train, y_test = train_test_split(t1, labels, test_size=0.25, random_state=42)



rf = RandomForestClassifier(n_estimators=100, random_state=42)

rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)
auc = np.round(roc_auc_score(y_test, y_pred), 10)
print(f'Auroc: {auc}')
f1 = np.round(f1_score(y_test, y_pred), 10)
print(f'f1: {f1}')
recall = np.round(recall_score(y_test, y_pred), 10)
print(f'recall: {recall}')
precision = np.round(precision_score(y_test, y_pred, zero_division=1), 10)
print(f'precision: {precision}')
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')



Auroc: 0.9116541353
f1: 0.9189189189
recall: 0.8947368421
precision: 0.9444444444
Accuracy: 90.91%


In [8]:

importances = rf.feature_importances_
std = list(np.std([tree.feature_importances_ for tree in rf.estimators_], axis=0))
e = dict((n, l) for n, l in zip(features, std))
print(f'{len(std)} features')
sorted_dict = {k: round((v *100), 0)  for k, v in reversed(sorted(e.items(), key=lambda item: item[1]))}
print('Ranked importances', sorted_dict)

11 features
Ranked importances {'Well depth': 15.0, 'PP_4km': 14.0, 'AFF_Acc': 13.0, 'KP_50km': 9.0, 'Nearest_KP': 9.0, 'PP_12km': 8.0, 'AFFF': 8.0, 'Farms': 7.0, 'Basin': 5.0, 'PP_1km': 5.0, 'Farmland': 2.0}


In [10]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
X_train, X_test, y_train, y_test = train_test_split(t1, labels, test_size=0.3, random_state=42)
model = xgb.XGBClassifier(n_estimators=100, eval_metric='logloss')
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

auc = np.round(roc_auc_score(y_test, y_pred), 10)
print(f'Auroc: {auc}')
f1 = np.round(f1_score(y_test, y_pred), 10)
print(f'f1: {f1}')
recall = np.round(recall_score(y_test, y_pred), 10)
print(f'recall: {recall}')
precision = np.round(precision_score(y_test, y_pred, zero_division=1), 10)
print(f'precision: {precision}')
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')


Auroc: 0.8650793651
f1: 0.8888888889
recall: 0.9523809524
precision: 0.8333333333
Accuracy: 87.18%


In [18]:
from sklearn.naive_bayes import GaussianNB


model = GaussianNB()
model.fit(X_train, y_train)
predictions = model.predict(X_test)

auc = np.round(roc_auc_score(y_test, y_pred), 10)
print(f'Auroc: {auc}')
f1 = np.round(f1_score(y_test, y_pred), 10)
print(f'f1: {f1}')
recall = np.round(recall_score(y_test, y_pred), 10)
print(f'recall: {recall}')
precision = np.round(precision_score(y_test, y_pred, zero_division=1), 10)
print(f'precision: {precision}')
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

Auroc: 0.7261904762
f1: 0.7027027027
recall: 0.619047619
precision: 0.8125
Accuracy: 71.79%


In [12]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter=100)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

auc = np.round(roc_auc_score(y_test, y_pred), 10)
print(f'Auroc: {auc}')
f1 = np.round(f1_score(y_test, y_pred), 10)
print(f'f1:{f1}')
recall = np.round(recall_score(y_test, y_pred), 10)
print(f'recall: {recall}')
precision = np.round(precision_score(y_test, y_pred, zero_division=1), 10)
print(f'precision: {precision}')
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

Auroc: 0.8650793651
f1:0.8888888889
recall: 0.9523809524
precision: 0.8333333333
Accuracy: 87.18%


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [13]:
!pip3 install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [14]:
from catboost import CatBoostClassifier
model = CatBoostClassifier(loss_function='Logloss')
model.fit(X_train, y_train, verbose=100)
y_pred = model.predict(X_test)

auc = np.round(roc_auc_score(y_test, y_pred), 10)
print(f'Auroc: {auc}')
f1 = np.round(f1_score(y_test, y_pred), 10)
print(f'f1:{f1}')
recall = np.round(recall_score(y_test, y_pred), 10)
print(f'recall: {recall}')
precision = np.round(precision_score(y_test, y_pred, zero_division=1), 10)
print(f'precision: {precision}')
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

Learning rate set to 0.003702
0:	learn: 0.6902654	total: 50.1ms	remaining: 50.1s
100:	learn: 0.4953176	total: 285ms	remaining: 2.53s
200:	learn: 0.3762995	total: 740ms	remaining: 2.94s
300:	learn: 0.3020520	total: 1.14s	remaining: 2.65s
400:	learn: 0.2497505	total: 1.42s	remaining: 2.12s
500:	learn: 0.2104630	total: 1.88s	remaining: 1.87s
600:	learn: 0.1815245	total: 2.12s	remaining: 1.41s
700:	learn: 0.1590268	total: 2.51s	remaining: 1.07s
800:	learn: 0.1392618	total: 2.96s	remaining: 735ms
900:	learn: 0.1233731	total: 3.43s	remaining: 377ms
999:	learn: 0.1099308	total: 3.8s	remaining: 0us
Auroc: 0.9246031746
f1:0.9268292683
recall: 0.9047619048
precision: 0.95
Accuracy: 92.31%


In [15]:
!pip install dask-expr

Collecting dask-expr
  Downloading dask_expr-1.1.13-py3-none-any.whl.metadata (2.5 kB)
Collecting dask==2024.8.2 (from dask-expr)
  Downloading dask-2024.8.2-py3-none-any.whl.metadata (3.7 kB)
Collecting cloudpickle>=3.0.0 (from dask==2024.8.2->dask-expr)
  Downloading cloudpickle-3.0.0-py3-none-any.whl.metadata (7.0 kB)
Downloading dask_expr-1.1.13-py3-none-any.whl (242 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m242.6/242.6 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dask-2024.8.2-py3-none-any.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading cloudpickle-3.0.0-py3-none-any.whl (20 kB)
Installing collected packages: cloudpickle, dask, dask-expr
  Attempting uninstall: cloudpickle
    Found existing installation: cloudpickle 2.2.1
    Uninstalling cloudpickle-2.2.1:
      Successfully uninstalled cloudpickle-2.2.1
  Attempting uninstall: dask
    F

In [16]:
import lightgbm as lgb

model = lgb.LGBMClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

auc = np.round(roc_auc_score(y_test, y_pred), 10)
print(f'Auroc: {auc}')
f1 = np.round(f1_score(y_test, y_pred), 10)
print(f'f1:{f1}')
recall = np.round(recall_score(y_test, y_pred), 10)
print(f'recall: {recall}')
precision = np.round(precision_score(y_test, y_pred, zero_division=1), 10)
print(f'precision: {precision}')
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

[LightGBM] [Info] Number of positive: 44, number of negative: 47
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000316 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 157
[LightGBM] [Info] Number of data points in the train set: 91, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.483516 -> initscore=-0.065958
[LightGBM] [Info] Start training from score -0.065958
Auroc: 0.9484126984
f1:0.9523809524
recall: 0.9523809524
precision: 0.9523809524
Accuracy: 94.87%


In [17]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
auc = np.round(roc_auc_score(y_test, y_pred), 10)
print(auc)
f1 = np.round(f1_score(y_test, y_pred), 10)
print(f1)
recall = np.round(recall_score(y_test, y_pred), 10)
print(recall)
precision = np.round(precision_score(y_test, y_pred), 10)
print(precision)


Accuracy: 0.717948717948718
0.7261904762
0.7027027027
0.619047619
0.8125
