## Water Quality Prediction

### Participants
#### Abhinav Rakesh Kumar Shukla (20015525)
#### Rohit Saji ()
#### Sidharth Suhas Dalvi ()
#### Venu Prasath Mohan (20015850)

## Importing required libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import AdaBoostClassifier
from imblearn.over_sampling import SMOTE
import seaborn as sn
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
import warnings
warnings.filterwarnings("ignore")


## Exploratory Data Analysis

In [None]:
water_quality_data = pd.read_csv('waterQuality1.csv', na_values='#NUM!')

In [None]:
water_quality_data

In [None]:
print('Total number of samples: ', len(water_quality_data))

In [None]:
print('Number of features: ', len(water_quality_data.columns)-1)

In [None]:
water_quality_data.isna().any()

In [None]:
sn.heatmap(water_quality_data.isna(), cmap='coolwarm')

In [None]:
water_quality_data.dropna(inplace=True)

In [None]:
print('Total number of samples after dropping NaN values: ', len(water_quality_data))


In [None]:
water_quality_data.describe()

### Distribution of features

In [None]:
# Draw one box plot per feature column. Features are every column except the
# last one (presumably the 'is_safe' label used by the rest of the notebook).
plt.figure(figsize=(15,15))
columns = list(water_quality_data.columns[:-1])
# Subplot positions are 1-based, hence start=1.
for position, feature in enumerate(columns, start=1):
  plt.subplot(5,4,position)  # 5x4 grid: room for at most 20 features
  plt.boxplot(water_quality_data[feature])
  plt.title(feature)
plt.tight_layout()  # keep subplot titles from overlapping neighbouring axes
plt.show()

### Distribution of Labels

In [None]:
# Cast the label column to integers, then count the rows in each class.
water_quality_data['is_safe'] = water_quality_data['is_safe'].astype('int64')
unique_labels = water_quality_data['is_safe'].value_counts()

# Bar chart: one bar per class label, height = number of samples.
class_ids = unique_labels.index.tolist()
class_counts = unique_labels.to_numpy()
plt.bar(class_ids, class_counts)
plt.title('Distribution of labels')
plt.show()

In [None]:
sn.pairplot(data=water_quality_data, vars=columns[:3], hue='is_safe')
plt.show()

In [None]:
unique_labels.index

### Normalization

1. MinMax Normalization
2. Z-Score

In [None]:
# Min-Max normalisation of every feature column (MinMaxScaler defaults),
# applied to a copy so the raw DataFrame stays untouched.
# NOTE(review): the scaler is fitted on the full dataset, before split_data()
# separates the test rows, so test-set statistics influence the scaling.
water_quality_data_mm = water_quality_data.copy()
min_max = MinMaxScaler()
min_max.fit(water_quality_data_mm[columns])
water_quality_data_mm[columns] = min_max.transform(water_quality_data_mm[columns])

In [None]:
# Overlay the kernel density estimate of every Min-Max normalised feature.
plt.figure(figsize=(8,8))
for cols in water_quality_data_mm.columns[:-1]:
  # Label each curve with its own column name so the legend cannot drift out
  # of step with the plotted lines.
  sn.kdeplot(water_quality_data_mm[cols], label=cols)
# Axis label and legend are loop-invariant: set them once, after plotting.
plt.xlabel('Values')
plt.legend()
plt.show()

In [None]:
# Z-Score normalisation (StandardScaler) of every feature column, applied to
# a copy so the raw DataFrame stays untouched.
# NOTE(review): the scaler is fitted on the full dataset, before split_data()
# separates the test rows, so test-set statistics influence the scaling.
water_quality_data_z = water_quality_data.copy()
z_score = StandardScaler()
z_score.fit(water_quality_data_z[columns])
water_quality_data_z[columns] = z_score.transform(water_quality_data_z[columns])

In [None]:
# Overlay the kernel density estimate of every Z-score normalised feature.
plt.figure(figsize=(8,8))
for cols in water_quality_data_z.columns[:-1]:
  # Label each curve with its own column name so the legend cannot drift out
  # of step with the plotted lines.
  sn.kdeplot(water_quality_data_z[cols], label=cols)
# Axis label and legend are loop-invariant: set them once, after plotting.
plt.xlabel('Values')
plt.legend()
plt.show()

# Model

1. SVM
      - SVM with MinMax Norm
      - SVM with Z-Score
      - SVM with Grid Search CV

2. Adaboost
      - Adaboost with MinMax Norm
      - Adaboost with Z-Score

3. KNN
      - KNN with MinMax Norm
      - KNN with Z-Score

4. Random forest with gridsearch CV
      - Random forest with gridsearch CV with MinMax Norm
      - Random forest with gridsearch CV with Z-Score
5. Decision Tree
    - Decision Tree with MinMax Norm
    - Decision Tree with Z-Score
6. Bagging along with Random Forest
    - Bagging along with Random Forest with MinMax Norm
    - Bagging along with Random Forest with Z-Score
7. Logistic Regression
    - Logistic Regression with MinMax Norm
    - Logistic Regression with Z-Score
    - Logistic Regression with Grid Search CV
    - Weighted Logistic Regression
8. Gradient Boosting
    - Gradient Boosting with MinMax Norm
    - Gradient Boosting with Z-Score
    - Gradient Boosting with Grid Search CV


### Defining Metrics Calculation / Splitting Data

In [None]:
# Function to split data

def split_data(data, columns, label_col='is_safe', test_size=0.3, random_state=42, stratify=False):
  """Split a DataFrame into train/test features and labels.

  Parameters
  ----------
  data : DataFrame containing both the feature columns and the label column.
  columns : list of feature column names selected from ``data``.
  label_col : name of the label column (default ``'is_safe'``).
  test_size : fraction of rows held out for testing (default 0.3).
  random_state : seed forwarded to ``train_test_split`` (default 42), so
    repeated calls on the same data give the same partition.
  stratify : when True, split so that both partitions keep the class
    proportions of ``label_col``. The default (False) keeps the plain,
    unstratified split.

  Returns
  -------
  X_train, X_test, y_train, y_test
  """
  features = data[columns]
  labels = data[label_col]
  X_train, X_test, y_train, y_test = train_test_split(
      features,
      labels,
      random_state=random_state,
      test_size=test_size,
      stratify=labels if stratify else None,
  )
  return X_train, X_test, y_train, y_test

In [None]:
def print_metrics(model_name, scaler, y_test, y_pred):
    """Print an evaluation summary for one set of predictions.

    Prints the accuracy and the error rate, then the classification report,
    and finally delegates to print_confusion_matrix() and roc_cur() for the
    plots. ``model_name`` and ``scaler`` are only used to word the accuracy
    line.
    """
    accuracy = accuracy_score(y_test, y_pred)
    error_rate = 1-accuracy

    # Each summary line is followed by one blank line.
    headline = "Accuracy of {} with {}: {} ({:.2%})".format(model_name, scaler, accuracy, accuracy)
    print(headline, end="\n\n")
    print("Error rate: {} ({:.2%})".format(error_rate, error_rate), end="\n\n")

    print(classification_report(y_test, y_pred))
    print_confusion_matrix(y_test, y_pred)
    roc_cur(y_test, y_pred)

In [None]:
def print_confusion_matrix(y_test, y_pred):
    """Show the confusion matrix of ``y_pred`` against ``y_test`` as a heatmap.

    Rows are the actual classes and columns the predicted classes; every
    cell is annotated with its count.
    """
    cm = metrics.confusion_matrix(y_test, y_pred)
    # heatmap() draws on the current axes and hands that axes object back.
    ax = sn.heatmap(cm, annot=True, cmap='Blues', fmt='g')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    ax.set_title('Confusion Matrix')
    plt.show()

In [None]:
def roc_cur(y_test, y_pred):
  """Plot the ROC curve of ``y_pred`` against ``y_test`` and report its AUC.

  Parameters
  ----------
  y_test : true binary labels.
  y_pred : predicted labels or scores, passed unchanged to
    ``metrics.roc_curve``.

  The area under the curve is shown in the legend. A dashed diagonal from
  (0, 0) to (1, 1) marks the performance of a random classifier.
  """
  fp, tp, _ = metrics.roc_curve(y_test, y_pred)
  auc_score = metrics.auc(fp, tp)
  plt.plot(fp, tp, label='ROC curve (AUC = {:.3f})'.format(auc_score))
  # Chance line: true-positive rate equals false-positive rate.
  plt.plot([0,1], [0,1], 'k--', label='Chance')
  plt.xlabel('False Positive Rate')
  plt.ylabel('True Positive Rate')
  plt.title('ROC Curve')
  plt.legend(loc='lower right')
  # Render now, as print_confusion_matrix() does, so that later plots do not
  # land on this figure.
  plt.show()

## Support Vector Machine (SVM)

### SVM with MinMax Norm

In [None]:
# SVM Classifier with MinMax 
svm = SVC(kernel='rbf')

In [None]:
X_train, X_test, y_train, y_test = split_data(water_quality_data_mm, columns)

In [None]:
svm.fit(X_train, y_train)

In [None]:
y_pred = svm.predict(X_test)

In [None]:
print_metrics("SVM", "MinMax normalization", y_test, y_pred)

### SVM with Z-Score

In [None]:
# SVM Classifier with Z-score
svm = SVC(kernel='rbf')

In [None]:
X_train, X_test, y_train, y_test = split_data(water_quality_data_z, columns)

In [None]:
svm.fit(X_train, y_train)

In [None]:
y_pred_2 = svm.predict(X_test)

In [None]:
print_metrics("SVM", "Z-Score normalization", y_test, y_pred_2)

### SVM with Grid Search CV

In [None]:
parameters = {'C': [0.1, 1, 10], 'gamma': ['scale', 'auto'], 'kernel': ['linear', 'rbf', 'sigmoid']}

svm = SVC(kernel='rbf')
grid_svm = GridSearchCV(svm, param_grid=parameters)
grid_svm.fit(X_train, y_train)

In [None]:
print('Best Parameters: ', grid_svm.best_params_)
best_param = grid_svm.best_params_

In [None]:
# Predicting on testing data with new hyperparams
svm = SVC(kernel=best_param['kernel'], C=best_param['C'], gamma=best_param['gamma'])
svm.fit(X_train, y_train)
y_pred_5 = svm.predict(X_test)
print_metrics("SVM", "GridSearchCV", y_test, y_pred_5)

## Adaboost

### Adaboost with MinMax Norm

In [None]:
adaboost = AdaBoostClassifier(n_estimators=150, random_state=42)

In [None]:
X_train, X_test, y_train, y_test = split_data(water_quality_data_mm, columns)

In [None]:
adaboost.fit(X_train, y_train)

In [None]:
y_pred_3 = adaboost.predict(X_test)

In [None]:
print_metrics("Adaboost", "MinMax normalization", y_test, y_pred_3)

### Adaboost with Z-Score

In [None]:
adaboost = AdaBoostClassifier(n_estimators=150, random_state=42)

In [None]:
X_train, X_test, y_train, y_test = split_data(water_quality_data_z, columns)

In [None]:
adaboost.fit(X_train, y_train)

In [None]:
y_pred_4 = adaboost.predict(X_test)

In [None]:
print_metrics("Adaboost", "Z-Score normalization", y_test, y_pred_4)

## K-Nearest Neighbours (KNN)

### KNN with Minmax norm

In [None]:
# 3-nearest-neighbours classifier on the Min-Max normalised data.
knn = KNeighborsClassifier(n_neighbors=3)
X_train, X_test, y_train, y_test = split_data(water_quality_data_mm, columns)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
# print_metrics() computes and reports the accuracy itself, so no separate
# accuracy variable is kept here.
print_metrics("KNN", "MinMax normalization", y_test, y_pred)

### KNN with Z-score

In [None]:
# Refit the same KNN estimator on the Z-score normalised data; fit() replaces
# whatever it learned from the previous split.
X_train, X_test, y_train, y_test = split_data(water_quality_data_z, columns)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
# print_metrics() computes and reports the accuracy itself, so no separate
# accuracy variable is kept here.
print_metrics("KNN", "Z-Score normalization", y_test, y_pred)

## Random forest with grid search CV

### with minmax norm

In [None]:
# Baseline random forest on the Min-Max normalised data.
X_train, X_test, y_train, y_test = split_data(water_quality_data_mm, columns)
# Seeded so the forest (and the grid search built on it) gives the same
# result on every run, like the other seeded models in this notebook.
rf_model=RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

In [None]:
# Hyper-parameter grid for the random-forest search.
n_estimators=[10, 17, 25, 33, 41, 48, 56, 64, 72, 80]
# 'auto' was only an alias of 'sqrt' for classifiers and is rejected by recent
# scikit-learn releases, so the two candidates are now 'sqrt' and 'log2'.
max_features=['sqrt','log2']
max_depth=[2,4,6]
param_grid={
    'n_estimators':n_estimators,
    'max_features':max_features,
    'max_depth':max_depth,
}
print(param_grid)
# 3-fold cross-validated exhaustive search, run on 4 worker processes.
rf_Grid=GridSearchCV(estimator= rf_model, param_grid=param_grid,cv=3,verbose=2,n_jobs=4)

In [None]:
rf_Grid.fit(X_train,y_train)

In [None]:
print('Best Parameters')
print(rf_Grid.best_params_)

In [None]:
# Predict with the best estimator found by the grid search and report metrics.
y_pred_rf = rf_Grid.predict(X_test)
print_metrics("Random Forest with Grid Search CV", "MinMax normalization", y_test, y_pred_rf)

### with z-score

In [None]:
# Baseline random forest on the Z-score normalised data.
X_train, X_test, y_train, y_test = split_data(water_quality_data_z, columns)
# Seeded so the forest (and the grid search built on it) gives the same
# result on every run, like the other seeded models in this notebook.
rf_model=RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

In [None]:
# Hyper-parameter grid for the random-forest search.
n_estimators=[10, 17, 25, 33, 41, 48, 56, 64, 72, 80]
# 'auto' was only an alias of 'sqrt' for classifiers and is rejected by recent
# scikit-learn releases, so the two candidates are now 'sqrt' and 'log2'.
max_features=['sqrt','log2']
max_depth=[2,4,6]
param_grid={
    'n_estimators':n_estimators,
    'max_features':max_features,
    'max_depth':max_depth,
}
print(param_grid)
# 3-fold cross-validated exhaustive search, run on 4 worker processes.
rf_Grid=GridSearchCV(estimator= rf_model, param_grid=param_grid,cv=3,verbose=2,n_jobs=4)

In [None]:
rf_Grid.fit(X_train,y_train)

In [None]:
print('Best Parameters')
print(rf_Grid.best_params_)

In [None]:
# Predict with the best estimator found by the grid search and report metrics.
y_pred_rf = rf_Grid.predict(X_test)
print_metrics("Random Forest with Grid Search CV", "Z-Score normalization", y_test, y_pred_rf)

### Decision tree with MinMax Norm

In [None]:
# Decision tree trained and evaluated on the Min-Max normalised data.
X_train, X_test, y_train, y_test = split_data(water_quality_data_mm, columns)
decisionTreeModel = tree.DecisionTreeClassifier()
decisionTreeModel.fit(X_train, y_train)
decisionTree_y_pred = decisionTreeModel.predict(X_test)
# Test accuracy expressed as a percentage.
accuracyDecisionTree = 100*metrics.accuracy_score(y_test, decisionTree_y_pred)
print_metrics("Decision Tree", "MinMax normalization", y_test, decisionTree_y_pred)

### Decision tree with Z-score

In [None]:
# Decision tree trained and evaluated on the Z-score normalised data.
X_train, X_test, y_train, y_test = split_data(water_quality_data_z, columns)
decisionTreeModel_Z_Score = tree.DecisionTreeClassifier()
decisionTreeModel_Z_Score.fit(X_train, y_train)
decisionTree_Z_Score_y_pred = decisionTreeModel_Z_Score.predict(X_test)
# Test accuracy expressed as a percentage.
accuracyDecisionTree_Z_Score = 100*metrics.accuracy_score(y_test, decisionTree_Z_Score_y_pred)
print_metrics("Decision Tree", "Z-Score normalization", y_test, decisionTree_Z_Score_y_pred)

## Bagging Along with Random Forest

### Bagging Along with Random Forest Classifier using MinMax Norm

In [None]:
# Bagging ensemble of 10 depth-limited random forests, Min-Max normalised data.
randomForestClassifier = RandomForestClassifier(max_depth=5, random_state=42)
# The base learner is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` across scikit-learn releases, whereas the
# first positional parameter is the base learner under either name.
baggingClassifier = BaggingClassifier(randomForestClassifier, n_estimators = 10, random_state = 42)
X_train, X_test, y_train, y_test = split_data(water_quality_data_mm, columns)
baggingClassifier = baggingClassifier.fit(X_train, y_train)
Bagging_y_pred = baggingClassifier.predict(X_test)
# Test accuracy expressed as a percentage.
accuracyBagging_MinMax = metrics.accuracy_score(y_test, Bagging_y_pred)*100
# Label the report as the bagging ensemble, not a plain random forest.
print_metrics("Bagging with Random Forest", "MinMax normalization", y_test, Bagging_y_pred)

### Bagging along with Random Forest Classifier using Z Score

In [None]:
# Refit the bagging ensemble on the Z-score normalised data.
X_train, X_test, y_train, y_test = split_data(water_quality_data_z, columns)
# fit() hands back the estimator it was called on, so baggingClassifier_Z is
# the refitted ensemble; predict through that name to make this explicit.
baggingClassifier_Z = baggingClassifier.fit(X_train, y_train)
Bagging_y_pred_Z = baggingClassifier_Z.predict(X_test)
# Test accuracy expressed as a percentage.
accuracyBagging_Z = metrics.accuracy_score(y_test, Bagging_y_pred_Z)*100
# Label the report as the bagging ensemble, not a plain random forest.
print_metrics("Bagging with Random Forest", "Z-Score normalization", y_test, Bagging_y_pred_Z)

## Logistic Regression

### Logistic Regression with Minmax norm

In [None]:
# Logistic regression on the Min-Max normalised data.
X_train, X_test, y_train, y_test = split_data(water_quality_data_mm, columns)
logRegModel = LogisticRegression()
logRegModel.fit(X_train, y_train)
# Predict with the model fitted above. Without this line the metrics would be
# computed on whatever `y_pred` an earlier cell left behind.
y_pred = logRegModel.predict(X_test)
print_metrics("Logistic Regression", "MinMax normalization", y_test, y_pred)

### Logistic Regression with z-score

In [None]:
# Logistic regression on the Z-score normalised data.
X_train, X_test, y_train, y_test = split_data(water_quality_data_z, columns)
logRegModel = LogisticRegression()
logRegModel.fit(X_train, y_train)
# Predict with the model fitted above. Without this line the metrics would be
# computed on whatever `y_pred` an earlier cell left behind.
y_pred = logRegModel.predict(X_test)
print_metrics("Logistic Regression", "Z-Score normalization", y_test, y_pred)

### Grid Search CV for Logistic Regression

In [None]:
# Define the hyperparameters to search
# Values shared by both sub-grids below.
tolerances = [0.01, 0.001, 0.1]
iteration_limits = [100, 1000,2500, 5000]

# The grid is split in two because the 'l1' penalty is not supported by the
# estimator's default 'lbfgs' solver: searched in a single grid, every 'l1'
# candidate fails to fit. 'l1' is therefore paired with 'liblinear'.
# NOTE(review): newer scikit-learn releases spell the no-penalty option as
# None rather than 'none' - adjust to the installed version.
hyperparameters = [
    {
        'tol': tolerances,
        'penalty': ['l2', 'none'],
        'max_iter' : iteration_limits,
    },
    {
        'tol': tolerances,
        'penalty': ['l1'],
        'solver': ['liblinear'],
        'max_iter' : iteration_limits,
    },
]

In [None]:
grid_search = GridSearchCV(logRegModel, hyperparameters, cv=5, verbose=True, n_jobs=-1)
a = grid_search.fit(X_train, y_train)

In [None]:
best_logreg = grid_search.best_estimator_
best_logreg

In [None]:
y_pred = best_logreg.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print_metrics("Logistic Regression", "with GridSearch CV", y_test, y_pred)

### Weighted Logistic Regression

In [None]:
print("Ratio of Discrete labels")
water_quality_data["is_safe"].value_counts()/water_quality_data["is_safe"].shape[0]

In [None]:
# Class-weighted logistic regression on the Z-score normalised data.
X_train, X_test, y_train, y_test = split_data(water_quality_data_z, columns)
# NOTE(review): hard-coded weights; presumably the inverse of the class ratio
# printed in the previous cell - confirm against that output.
weights = {0: 14, 1: 86}
logmodel = LogisticRegression(class_weight=weights)
logmodel.fit(X_train, y_train)
predictions = logmodel.predict(X_test)
# Test accuracy expressed as a percentage.
accuracy = 100*metrics.accuracy_score(y_test, predictions)
print_metrics("Weighted Logistic Regression", "Z-Score normalization", y_test, predictions)

* As you can see, after using weighted logistic regression the accuracy drops from 90% to 86%.
* However, the model is now trained more evenly on both the True and False labels.

## Gradient Boosting

### Gradient Boosting with Min Max Normalized Data

In [None]:
# Gradient boosting on the Min-Max normalised data.
X_train, X_test, y_train, y_test = split_data(water_quality_data_mm, columns)
# `loss` is left at its default: the explicit name 'deviance' is rejected by
# recent scikit-learn releases, while the default selects the same loss under
# whichever name the installed version uses.
model1 = GradientBoostingClassifier(learning_rate=0.1, n_estimators=100, max_depth=3, random_state=42,
                                        min_samples_split=2, min_samples_leaf=1, subsample=1.0, max_features=None,
                                        validation_fraction=0.1, n_iter_no_change=None, tol=0.0001)
model1.fit(X_train, y_train)

In [None]:
y_pred = model1.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print_metrics("Gradient Boosting", "MinMax normalization", y_test, y_pred)

### Gradient Boosting with Z-Score Normalized Data

In [None]:
X_train, X_test, y_train, y_test = split_data(water_quality_data_z, columns)
# Gradient boosting model for the Z-score normalised data.
# `loss` is left at its default: the explicit name 'deviance' is rejected by
# recent scikit-learn releases, while the default selects the same loss under
# whichever name the installed version uses.
model1 = GradientBoostingClassifier(learning_rate=0.1, n_estimators=100, max_depth=3, random_state=42,
                                        min_samples_split=2, min_samples_leaf=1, subsample=1.0, max_features=None,
                                        validation_fraction=0.1, n_iter_no_change=None, tol=0.0001)
model1.fit(X_train, y_train)

In [None]:
y_pred = model1.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print_metrics("Gradient Boosting", "Z-Score normalization", y_test, y_pred)

### Hyperparameter tuning

In [None]:
param_grid = {
    'learning_rate': [0.01, 0.1],
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7]
}

In [None]:
grid_search = GridSearchCV(model1, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

In [None]:
print("Best hyperparameters:", grid_search.best_params_)

best_param_gb = grid_search.best_params_

y_pred = grid_search.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print_metrics("Gradient Boosting", "Grid Search CV", y_test, y_pred)

## Tricks to tackle biased data

1. SMOTE with SVM
2. SMOTE with ANN
3. SMOTE with Gradient Boosting


### SVM

In [None]:

smote = SMOTE(random_state=45)

X_train, X_test, y_train, y_test = split_data(water_quality_data_z, columns)
new_X_train, new_y_train = smote.fit_resample(X_train, y_train)

In [None]:
svm = SVC(kernel='rbf')

In [None]:
svm.fit(new_X_train, new_y_train)
y_pred_6 = svm.predict(X_test)

print_metrics("SVM with SMOTE", "Z-Score normalization", y_test, y_pred_6)

### Artificial Neural Networks

In [None]:
# Multi-layer perceptron with a single hidden layer of 100 units.
# `(100,)` is written as a real one-element tuple; `(100)` is just the int 100.
clf = MLPClassifier(random_state=42, max_iter=950, hidden_layer_sizes=(100,), learning_rate='adaptive', learning_rate_init=0.0001)

# On smote data: oversample the training split only; the test split keeps its
# original class distribution.

X_train, X_test, y_train, y_test = split_data(water_quality_data_z, columns)
new_X_train, new_y_train = smote.fit_resample(X_train, y_train)

In [None]:
clf.fit(new_X_train, new_y_train)

In [None]:
# Evaluate the SMOTE-trained network on the untouched test split.
y_pred_clf = clf.predict(X_test)
# Score this model's own predictions, not a `y_pred` left over from an
# earlier cell.
accuracy = accuracy_score(y_test, y_pred_clf)
print_metrics("ANN with SMOTE ", "Z-Score normalization", y_test, y_pred_clf)

### Gradient Boosting


In [None]:
# Gradient boosting configured with the hyper-parameters chosen by the grid
# search. `loss` is left at its default: the explicit name 'deviance' is
# rejected by recent scikit-learn releases, while the default selects the
# same loss under whichever name the installed version uses.
model_gb = GradientBoostingClassifier(learning_rate=best_param_gb['learning_rate'], n_estimators=best_param_gb['n_estimators'], max_depth=best_param_gb['max_depth'], random_state=42,
                                        min_samples_split=2, min_samples_leaf=1, subsample=1.0, max_features=None,
                                        validation_fraction=0.1, n_iter_no_change=None, tol=0.0001)

# On smote data: oversample the training split only; the test split keeps its
# original class distribution.
X_train, X_test, y_train, y_test = split_data(water_quality_data_z, columns)
new_X_train, new_y_train = smote.fit_resample(X_train, y_train)

In [None]:
model_gb.fit(new_X_train, new_y_train)


In [None]:
# Evaluate the SMOTE-trained gradient boosting model on the untouched test split.
y_pred_gb = model_gb.predict(X_test)
accuracy = accuracy_score(y_test, y_pred_gb)
# Report under the gradient boosting name, not the ANN label used for the
# previous model.
print_metrics("Gradient Boosting with SMOTE", "Z-Score normalization", y_test, y_pred_gb)