# Water Quality: Drinkability Classification

## Data Cleaning

In [None]:
import pandas as pd

In [None]:
# Location of the raw water-quality CSV.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run elsewhere after pointing DATA_PATH at a local copy of the file.
DATA_PATH = "/Users/crystalcaves792/Desktop/all desktop/water ml/BKB_WaterQualityData_2020084.csv"
df = pd.read_csv(DATA_PATH)

In [None]:
# Column names, non-null counts and dtypes of the raw frame.
df.info()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Preview the first 20 rows.
df.head(20)

In [None]:
# Distinct values that occur in the Site_Id column.
df['Site_Id'].unique()

In [None]:
# Remove the columns that are not used in the rest of the analysis.
# Reassigning (rather than inplace=True) together with errors='ignore' keeps
# this cell re-runnable: running it a second time no longer raises KeyError
# because the columns are already gone.
df = df.drop(
    columns=['Unit_Id', 'Field_Tech', 'DateVerified', 'WhoVerified', 'Air Temp-Celsius'],
    errors='ignore',
)
# Missing-value counts for the columns that remain.
df.isna().sum()

## Correlation Matrix

In [None]:
# Numeric columns only, with Year removed so it does not show up in the
# correlation matrix built from df_corr.
df_corr = df.select_dtypes(include='number').drop(columns='Year')
df_corr.head()

In [None]:
import matplotlib.pyplot as plt

# Pairwise correlations between the numeric columns, drawn as a colour grid.
corr_matrix = df_corr.corr()
plt.matshow(corr_matrix)

# Label every row/column with its variable name and add a colour scale;
# without them the cells cannot be mapped back to columns or values.
tick_positions = range(len(corr_matrix.columns))
plt.xticks(tick_positions, corr_matrix.columns, rotation=90)
plt.yticks(tick_positions, corr_matrix.columns)
plt.colorbar()
plt.show()

In [None]:
import numpy as np

# Same correlation matrix, rendered as a colour-coded table.
corr = df_corr.corr()
corr_styler = corr.style
corr_styler.background_gradient(cmap='coolwarm')

## Creating a water quality dummy

Keep in mind that this water quality variable is based on whether a certain body of water is drinkable, since various use cases may have different standards. The drinkability of water is measured in pH (according to my search), while the other variables determine living conditions for aquatic organisms and water clarity. Furthermore, the dataset was taken from the U.S. Department of the Interior, which means that using the U.S. standard for salinity is the most sensible choice in this case.

In [None]:
# Number of readings with salinity strictly below 0.5 ppt. The comparison is
# a strict `<` so the count agrees with the "< 0.5 ppt" drinking standard used
# in this section; missing readings compare as False and are not counted.
(df_corr['Salinity (ppt)'] < 0.5).sum()

In [None]:
# Preview the first 10 rows of the cleaned frame.
df.head(10)

Here is the U.S. standard that makes water safe to drink:
- Salinity < 0.5 ppt [Source](https://www.horiba.com/int/water-quality/applications/water-wastewater/measuring-salinity-of-water/)

In [None]:
# Binary label: 1 = drinkable (salinity strictly below 0.5 ppt), 0 = non-drinkable.
# NOTE(review): a missing salinity compares as False, so such rows get label 0 here.
is_low_salinity = df['Salinity (ppt)'].lt(0.5)
df['Drinkability'] = is_low_salinity.astype(int)
df['Drinkability'].value_counts()

## KNN for Water Quality Prediction

In [None]:
# Keep only complete rows, then build the modelling frame from the numeric
# columns with Year removed.
df = df.dropna()
df_ml = df.select_dtypes(include='number').drop(columns='Year')

In [None]:
# Features and target for the KNN model.
# 'Salinity (ppt)' is excluded from the features because Drinkability is
# computed directly from it; leaving it in leaks the label into X and makes
# the reported accuracy meaningless.
X = df_ml.drop(columns=['Drinkability', 'Salinity (ppt)'])
y = df_ml['Drinkability']

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# 5-nearest-neighbour classifier; Minkowski distance with p=2 is the Euclidean distance.
knn_params = {'n_neighbors': 5, 'metric': 'minkowski', 'p': 2}
classifier = KNeighborsClassifier(**knn_params)
classifier.fit(X_train, y_train)

In [None]:
# Predicted labels for the held-out rows.
y_pred = classifier.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix,accuracy_score
# Confusion matrix and overall accuracy of the KNN predictions on the test split.
cm = confusion_matrix(y_test, y_pred)
ac = accuracy_score(y_test, y_pred)
for metric_value in (cm, ac):
    print(metric_value)

### K Fold

In [None]:
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score

In [None]:
# 3-nearest-neighbour classifier evaluated with 5-fold cross-validation on X, y.
knn_cv = KNeighborsClassifier(n_neighbors=3)
cv_scores = cross_val_score(knn_cv, X, y, cv=5)

# Report the accuracy of each fold and their average (the average was
# announced in the original comment but never computed).
print(cv_scores)
print("Mean CV accuracy:", cv_scores.mean())

### Hyperparameter tuning

In [None]:
from sklearn import metrics
# Fit one KNN model for every k from 1 to 20 and record its test accuracy;
# mean_acc[k - 1] holds the score for k neighbours.
# NOTE(review): k is scored on X_test/y_test, so the best value found here is
# selected on the same data that is used to report accuracy.
neighbor_range = range(1, 21)
mean_acc = np.zeros(len(neighbor_range))
for i in neighbor_range:
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train, y_train)
    yhat = knn.predict(X_test)
    mean_acc[i - 1] = metrics.accuracy_score(y_test, yhat)

mean_acc

In [None]:
# Test accuracy as a function of the number of neighbours, one tick per k.
loc = np.arange(1, 21, step=1.0)
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(range(1, 21), mean_acc)
ax.set_xticks(loc)
ax.set_xlabel('Number of Neighbors ')
ax.set_ylabel('Accuracy')
plt.show()

## Decision Tree

In [None]:
from sklearn import tree

In [None]:
# Target first, then the features. Salinity is left out of the features to
# avoid leakage (accuracy = 1 when it is included).
y2 = df_ml['Drinkability']
X2 = df_ml.drop(['Drinkability', 'Salinity (ppt)'], axis=1)

In [None]:
# 70/30 train/test split for the decision tree, seeded for repeatability.
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    X2,
    y2,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Decision tree with default hyperparameters. random_state is fixed because
# the tree builder is randomised; without a seed the fitted tree, and so the
# reported accuracy, can differ between runs.
clf = tree.DecisionTreeClassifier(random_state=42)
clf = clf.fit(X_train2, y_train2)

In [None]:
# Score the decision tree on its held-out split.
y_pred2 = clf.predict(X_test2)
accuracy2 = metrics.accuracy_score(y_test2, y_pred2)
print("Accuracy:", accuracy2)

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random-forest inputs: every numeric column except the label and the
# salinity column the label is derived from.
y3 = df_ml['Drinkability']
X3 = df_ml.drop(['Drinkability', 'Salinity (ppt)'], axis=1)

In [None]:
# 80/20 train/test split for the random forest. The seed makes the split, and
# every score computed from it, reproducible across kernel restarts.
X_train3, X_test3, y_train3, y_test3 = train_test_split(X3, y3, test_size=0.2, random_state=42)

In [None]:
# Baseline random forest with default hyperparameters. The forest is seeded
# so that bootstrap sampling and feature selection, and therefore the
# accuracy printed below, are the same on every run.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train3, y_train3)

# Accuracy on the held-out split.
y_pred3 = rf.predict(X_test3)
accuracy3 = accuracy_score(y_test3, y_pred3)
print("Accuracy:", accuracy3)

### Hyperparameter tuning

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
# Search space: number of trees drawn from 50..499 and maximum depth from
# 1..19 (scipy's randint excludes the upper bound).
param_dist = {'n_estimators': randint(50,500),
              'max_depth': randint(1,20)}

# Random forest to be tuned; seeded so each candidate is fitted deterministically.
rf = RandomForestClassifier(random_state=42)

# Randomised search: 5 sampled parameter combinations, each scored with
# 5-fold cross-validation. random_state fixes which combinations are drawn,
# so the selected "best" model does not change between runs.
rand_search = RandomizedSearchCV(rf,
                                 param_distributions = param_dist,
                                 n_iter=5,
                                 cv=5,
                                 random_state=42)

# Run the search on the training split only.
rand_search.fit(X_train3, y_train3)

In [None]:
# Take the best forest found by the search and score it on the held-out split.
best_rf = rand_search.best_estimator_
y_pred3_5 = best_rf.predict(X_test3)
accuracy3_5 = accuracy_score(y_test3, y_pred3_5)
print("Accuracy:", accuracy3_5)

## Logistic regression

In [None]:
# Logistic-regression inputs: the label and the salinity column it is derived
# from are both removed from the features.
X4 = df_ml.drop(columns=['Drinkability', 'Salinity (ppt)'])
y4 = df_ml['Drinkability']

# 75/25 train/test split, seeded so the confusion matrix, report and ROC curve
# computed from it are reproducible.
X_train4, X_test4, y_train4, y_test4 = train_test_split(X4, y4, test_size=0.25, random_state=42)

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression on the training split and show the confusion
# matrix of its predictions on the test split.
logreg = LogisticRegression(random_state=16).fit(X_train4, y_train4)
y_pred4 = logreg.predict(X_test4)
metrics.confusion_matrix(y_test4, y_pred4)

In [None]:
from sklearn.metrics import classification_report
# Per-class precision, recall and F1 for the logistic regression.
report4 = classification_report(y_test4, y_pred4)
print(report4)

In [None]:
# Column 1 of predict_proba is the probability of the second class in
# logreg.classes_ (label 1, assuming both labels occur in the training split).
y_pred_proba = logreg.predict_proba(X_test4)[:, 1]

# ROC curve and the area under it for the logistic regression.
fpr, tpr, _ = metrics.roc_curve(y_test4, y_pred_proba)
auc = metrics.roc_auc_score(y_test4, y_pred_proba)

plt.plot(fpr, tpr, label="data 1, auc="+str(auc))
# Axis labels and a title so the figure can be read on its own.
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC curve - logistic regression')
plt.legend(loc=4)  # loc=4 places the legend in the lower-right corner
plt.show()