In [1]:
import os
# Move the working directory one level up so that the relative paths used
# later ('data/dataset.csv', 'model/...') resolve from the parent folder
# (presumably the project root - confirm against the repository layout).
# NOTE: this is not idempotent - re-running this cell moves up one more level.
os.chdir('..')

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Enable the Intel(R) Extension for Scikit-learn before the sklearn imports
# below. NOTE(review): the sklearnex docs ask for patching to happen before
# sklearn is imported - keep this ordering when moving imports around.
from sklearnex import patch_sklearn
patch_sklearn()

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split

# Display every column when rendering DataFrames instead of truncating wide frames.
pd.set_option('display.max_columns', None)

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [31]:
# Read the raw CSV once and keep it untouched in `original_data`; every
# preprocessing step below works on the independent deep copy `df`.
original_data = pd.read_csv('data/dataset.csv')
df = original_data.copy(deep=True)

# Preview the first five rows.
df.head(5)

Unnamed: 0,Index,pH,Iron,Nitrate,Chloride,Lead,Zinc,Color,Turbidity,Fluoride,Copper,Odor,Sulfate,Conductivity,Chlorine,Manganese,Total Dissolved Solids,Source,Water Temperature,Air Temperature,Month,Day,Time of Day,Target
0,0,8.33,0.0,8.61,122.8,0.0,3.43,Colorless,0.02,0.61,0.14,1.63,87.27,471.68,3.71,0.0,332.12,,,43.49,January,29.0,4.0,0
1,1,6.92,0.0,3.73,227.03,0.0,1.25,Faint Yellow,0.02,0.62,0.44,1.69,144.01,432.84,3.29,0.0,284.64,Lake,15.35,71.22,November,26.0,16.0,0
2,2,5.44,0.02,3.82,231.0,0.0,0.53,Light Yellow,0.32,0.42,0.43,3.41,275.7,990.2,3.56,0.07,570.05,River,11.64,44.89,January,31.0,8.0,0
3,3,7.96,0.14,8.22,178.13,0.0,4.03,Near Colorless,0.17,0.21,0.24,,147.16,237.03,3.52,0.02,100.04,Ground,10.09,60.84,April,1.0,21.0,0
4,4,8.09,0.0,9.93,186.54,0.0,3.81,Light Yellow,0.0,0.22,0.62,0.8,175.28,385.03,3.18,0.0,168.08,Spring,15.25,69.34,June,29.0,7.0,0


In [32]:
# (rows, columns) of the freshly loaded frame.
df.shape

(5956842, 24)

### Handling Missing Values

In [39]:
# Impute missing values in the numeric measurement columns with the column mean.

missing_val_columns = ['pH', 'Iron', 'Nitrate', 'Chloride', 'Lead', 'Zinc',
       'Turbidity', 'Fluoride', 'Copper', 'Odor', 'Sulfate', 'Conductivity',
       'Chlorine', 'Manganese', 'Total Dissolved Solids', 'Water Temperature', 'Air Temperature']

# Assign the filled column back instead of calling fillna(inplace=True) on
# df[col]: the inplace form is a chained assignment on a column selection,
# which emits a FutureWarning on pandas 2.x and does not modify df at all once
# Copy-on-Write is enabled.
# NOTE(review): the means are computed over the full dataset, i.e. before the
# train/validation split performed later in this notebook, so validation rows
# contribute to the imputation values.
for col in missing_val_columns:
    df[col] = df[col].fillna(df[col].mean())

In [40]:
# Frequency-encode Color: missing values are first filled with
# 'Near Colorless', then every label is replaced by the fraction of rows
# that carry that label.

# Assign back rather than fillna(inplace=True) on df['Color'], which is a
# chained assignment (FutureWarning on pandas 2.x, no effect under Copy-on-Write).
df['Color'] = df['Color'].fillna('Near Colorless')

# Per-category lookup (label -> share of rows). Keeping it as its own variable
# makes the same encoding reusable on new data.
color_frequency = df['Color'].value_counts() / len(df)

# Per-row encoded values; same numbers as groupby(...).transform('count') / len(df).
color_mapping = df['Color'].map(color_frequency)

df['Color'] = color_mapping

In [41]:
# Fill missing Source values, then ordinal-encode the column.
# NOTE(review): 'Stream' is hard-coded as the fill value; presumably it is the
# most frequent source - confirm against df['Source'].mode().

source_filled = df['Source'].fillna('Stream')

# The fitted encoder is kept in `encoder` because it is serialized later.
encoder = OrdinalEncoder()
df['Source'] = encoder.fit_transform(source_filled.to_frame())

In [42]:
# Remove the row identifier and the date/time fields, which are not used as
# model features.

columns_to_drop = ['Day', 'Index', 'Month', 'Time of Day']
df.drop(columns=columns_to_drop, inplace=True)

In [43]:
# Columns remaining after the drops above.
df.columns

Index(['pH', 'Iron', 'Nitrate', 'Chloride', 'Lead', 'Zinc', 'Color',
       'Turbidity', 'Fluoride', 'Copper', 'Odor', 'Sulfate', 'Conductivity',
       'Chlorine', 'Manganese', 'Total Dissolved Solids', 'Source',
       'Water Temperature', 'Air Temperature', 'Target'],
      dtype='object')

### Feature Engineering

In [None]:
# Bin Iron, Nitrate and Copper into three ordinal levels (Low, Moderate, High)
# using thresholds chosen from their histograms. Each level is stored as the
# numeric score given in bin_labels.

bin_labels = [0, 0.4, 1]

# Lower edge, Low/Moderate threshold, Moderate/High threshold, open upper edge.
# np.inf replaces the former hard-coded maxima (20 / 100 / 20) so that values
# above them land in the High bin instead of becoming NaN.
bin_thresholds = {
    'Iron': [0, 0.1, 1, np.inf],
    'Nitrate': [0, 1, 5, np.inf],
    'Copper': [0, 0.02, 1, np.inf],
}

for feature, bin_edges in bin_thresholds.items():
    # pd.cut builds right-closed intervals and excludes the lowest edge by
    # default, so a value of exactly 0 would become NaN; include_lowest=True
    # keeps it in the Low bin. The cast turns the categorical result into a
    # plain float column for the scaler and the model.
    df[f'{feature}_Bin'] = pd.cut(
        df[feature],
        bins=bin_edges,
        labels=bin_labels,
        include_lowest=True,
    ).astype(float)

### Validation Strategy

In [None]:
# Hold out 10% of the rows (the dataset is very large, so 10% is plenty) and
# cut that hold-out into two equally sized validation sets, giving two
# separate reads on model performance. Both splits are stratified on the target.

y = df['Target']
X = df.drop(columns='Target')

X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.1,
    stratify=y,
    shuffle=True,
    random_state=42,
)
X_val_1, X_val_2, y_val_1, y_val_2 = train_test_split(
    X_val, y_val,
    test_size=0.5,
    stratify=y_val,
    random_state=42,
)

# Shapes of every split, in the order train / val 1 / val 2 for X and then y.
tuple(part.shape for part in (X_train, X_val_1, X_val_2, y_train, y_val_1, y_val_2))

In [None]:
# Fit the min-max scaler on the training features only, then apply the same
# fitted transform to the training set and both validation sets.
scaler = MinMaxScaler().fit(X_train)

X_train_scaled, X_val_1_scaled, X_val_2_scaled = (
    scaler.transform(split) for split in (X_train, X_val_1, X_val_2)
)

In [None]:
# Column dtypes, non-null counts and memory usage of the (unscaled) training features.
X_train.info()

### Training

In [None]:
import daal4py as d4p
import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, ConfusionMatrixDisplay, f1_score

In [None]:
# LightGBM binary classifier configuration.
lgbm_params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'random_state': 42,
    'n_jobs': -1,
    'force_row_wise': True,
}
clf = LGBMClassifier(**lgbm_params)

# Early stopping with stopping_rounds=10, monitored on validation set 2.
# Because set 2 drives the stopping decision, validation set 1 is the one the
# model never sees in any form during fitting.
callback = lgb.early_stopping(stopping_rounds=10)
clf.fit(
    X_train_scaled,
    y_train,
    eval_set=[(X_val_2_scaled, y_val_2)],
    callbacks=[callback],
)

In [None]:
# Predict on both validation sets and report accuracy, recall and F1 for each.
print('predicting -')
y_pred_1 = clf.predict(X_val_1_scaled)
y_pred_2 = clf.predict(X_val_2_scaled)

# (message, metric function, true labels, predicted labels), in print order.
metric_report = [
    ('Accuracy on validation set 1: ', accuracy_score, y_val_1, y_pred_1),
    ('Accuracy on validation set 2: ', accuracy_score, y_val_2, y_pred_2),
    ('Recall on validation set 1: ', recall_score, y_val_1, y_pred_1),
    ('Recall on validation set 2: ', recall_score, y_val_2, y_pred_2),
    ('F1 on validation set 1: ', f1_score, y_val_1, y_pred_1),
    ('F1 on validation set 2: ', f1_score, y_val_2, y_pred_2),
]

for message, metric_fn, y_true, y_hat in metric_report:
    print(message, metric_fn(y_true, y_hat))

In [None]:
# Bar chart of the fitted model's feature importances, one bar per training
# column, with the column names rotated so they stay readable.
fig, ax = plt.subplots()
ax.bar(X_train.columns, clf.feature_importances_)
plt.setp(ax.get_xticklabels(), rotation=90)

plt.show()

In [None]:
# Confusion matrix of the predictions on validation set 1.
cm = confusion_matrix(y_true=y_val_1, y_pred=y_pred_1)
ConfusionMatrixDisplay(confusion_matrix=cm).plot()

### Serialization

In [None]:
import joblib

# Persist the fitted model together with the scaler and the Source encoder.
# Create the output directory first so a fresh checkout without 'model/' does
# not fail with FileNotFoundError.
os.makedirs('model', exist_ok=True)

# NOTE(review): the Color frequency encoding and the mean-imputation values
# computed earlier in this notebook are not saved here; inference code will
# need them to reproduce the same preprocessing.
joblib.dump(clf, 'model/model.joblib')
joblib.dump(scaler, 'model/scaler.joblib')
joblib.dump(encoder, 'model/ordinal_encoder.joblib')

In [None]:
# Preview the frame after the preprocessing and feature-engineering cells above.
df.head()