In [1]:
# import modin.pandas as pd
# from modin.config import Engine
# Engine.put("dask") 

In [2]:
# Enable the Intel(R) Extension for Scikit-learn before any estimator is imported.
# NOTE(review): this cell runs ahead of the `sklearn` imports in the next cell; the
# sklearnex documentation asks for patch_sklearn() to be called first, so keep this order.
from sklearnex import patch_sklearn
patch_sklearn()

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [3]:
import os
import time
import warnings
import pandas as pd
import numpy as np

from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import roc_auc_score, roc_curve, auc, accuracy_score, f1_score
from sklearn.metrics import precision_recall_curve, average_precision_score
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder, RobustScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import BayesianRidge

import daal4py as d4p
import matplotlib.colors
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [4]:
# Load the full CSV, then keep only the feature columns used in this notebook
# together with the 'Target' label column.
raw = pd.read_csv("500k_dataset.csv")

selected_columns = [
    'Color', 'Turbidity', 'Copper', 'Manganese',
    'Chloride', 'Iron', 'Fluoride', 'Nitrate',
    'Odor', 'Chlorine', 'Sulfate', 'Total Dissolved Solids',
    'Zinc', 'Lead', 'pH', 'Target',
]
data = raw.loc[:, selected_columns]
print("Total rows in the DataFrame:", data.shape[0])

Total rows in the DataFrame: 595684


In [5]:
# handle missing values
#
# Current strategy: drop every row that has a missing value in any column.
# Alternatives that were tried and are currently disabled:
#   - SimpleImputer(strategy='most_frequent') on the object-dtype columns
#   - IterativeImputer(RandomForestRegressor(n_estimators=5, random_state=0, n_jobs=-1), max_iter=5)
#   - KNNImputer(n_neighbors=2)
#   - per-column linear interpolation of the non-object columns
start = time.time()

# how='any' over all columns is the same as passing subset=data.columns.
data.dropna(axis=0, how='any', inplace=True)

elapsed = time.time() - start
print("handle missing value took: {:.2f} s".format(elapsed))

handle missing value took: 0.13 s


In [6]:
# Ordinal encoding of the water colour labels (0 = clearest ... 4 = most yellow).
color_dict = {
    'Colorless': 0,
    'Near Colorless': 1,
    'Faint Yellow': 2,
    'Light Yellow': 3,
    'Yellow': 4
}

# Re-run guard: once 'Color' already holds the integer codes, mapping again would look
# those integers up in a dict keyed by strings and turn the whole column into NaN.
color_already_encoded = data['Color'].dropna().isin(list(color_dict.values())).all()
if not color_already_encoded:
    encoded_color = data['Color'].map(color_dict)

    # `.map` silently yields NaN for labels missing from the dict; report them
    # instead of letting NaNs slip into the later steps unnoticed.
    unmapped_labels = data.loc[encoded_color.isna() & data['Color'].notna(), 'Color'].unique()
    if len(unmapped_labels) > 0:
        warnings.warn(
            "Color labels without an entry in color_dict were encoded as NaN: {}".format(
                list(unmapped_labels)
            )
        )

    data['Color'] = encoded_color

In [7]:
# from sklearn.preprocessing import MinMaxScaler

# # # normalization
# start = time.time()

# normalize_columns = ['Iron', 'Lead', 'Copper', 'Manganese', 'Fluoride', 'Turbidity', 'Zinc',
#                      'Nitrate', 'Chloride', 'Sulfate', 'Chlorine', 'Total Dissolved Solids']
# min_max_scaler = MinMaxScaler()

# for col in normalize_columns:
#     if col in data.columns:
#         data[col] = min_max_scaler.fit_transform(data[[col]])
        
# print("normalization took: {:.2f} s".format(time.time() - start))

In [8]:
# handle outliers
#
# Winsorise selected columns at their 1st / 99th percentiles: columns listed in
# `handle_upper_cols` are capped from above, columns in `handle_lower_cols` from below.
# Names that are not present in `data` are simply never visited.
# NOTE(review): the percentiles are computed on the whole frame, i.e. before the
# train/test split done further down, so test rows influence the clipping bounds.
start = time.time()

handle_upper_cols = ['Iron', 'Lead', 'Zinc', 'Turbidity', 'Fluoride', 'Copper', 'Manganese', 'Nitrate',
                     'Chloride','Sulfate','Chlorine','Water Temperature','Air Temperature']
handle_lower_cols = ['Nitrate','Chloride', 'Sulfate','Chlorine', 'Air Temperature']

# Only visit columns that are actually clipped, so no percentiles are computed
# for columns whose bounds would be thrown away.
clip_cols = [col for col in data.columns
             if col in handle_upper_cols or col in handle_lower_cols]

for col in clip_cols:
    if data[col].dtype == 'object':
        continue
    lower, upper = np.percentile(data[col], [1, 99])
    # A bound of None leaves that side untouched.
    data[col] = data[col].clip(
        lower=lower if col in handle_lower_cols else None,
        upper=upper if col in handle_upper_cols else None,
    )

print("handle outliers took: {:.2f} s".format(time.time() - start))

handle outliers took: 0.11 s


In [9]:
# handle distribution
#
# Standardise (zero mean, unit variance) whichever of the candidate columns exist in `data`.
# Transformations that were tried and are currently disabled:
#   - np.log1p on 'Nitrate', 'Chloride', 'Turbidity', 'Sulfate'
#   - scipy.stats.yeojohnson on 'Iron', 'Lead', 'Manganese'

start = time.time()

scaler = StandardScaler()
standardize_candidates = ['Conductivity', 'Total Dissolved Solids', 'Water Temperature', 'Air Temperature']

# The scaler is re-fitted for each column, so every column gets its own mean / std.
for col in (name for name in standardize_candidates if name in data.columns):
    data[col] = scaler.fit_transform(data[[col]])

elapsed = time.time() - start
print("handle distribution took: {:.2f} s".format(elapsed))

handle distribution took: 0.07 s


In [10]:
# discretize features
#
# Bin 'pH' and 'Odor' into ordered categories and store the integer code of each
# category (0 = first label, 1 = second label, ...).
start = time.time()

discretized_features = ['pH', 'Odor']
labels_dict = {
    'pH': ['Acidic', 'Neutral', 'Alkaline'],
    'Odor': ['No Odor', 'Slight Odor', 'Noticeable Odor', 'Strong Odor']
}
# Bin edges per feature; each list has one more edge than the feature has labels.
bins_dict = {
    'pH': [0, 6, 8.5, 14],
    'Odor': [-np.inf, 1, 2, 3, np.inf]
}

for column in discretized_features:
    # Check for the column before touching it, so a missing feature is skipped
    # instead of raising a KeyError inside pd.cut.
    if column not in data:
        continue
    binned = pd.cut(data[column], bins=bins_dict[column], labels=labels_dict[column], include_lowest=True)
    # pd.cut already returns an ordered categorical, so its codes can be used directly.
    # Values outside the bin edges (possible for 'pH' only) become NaN and get code -1.
    data[column] = binned.cat.codes

print("discretize features took: {:.2f} s".format(time.time() - start))

discretize features took: 0.06 s


In [11]:
# drop duplicated rows
start = time.time()
data = data.drop_duplicates()
elapsed = time.time() - start
print("drop duplicates took: {:.2f} s".format(elapsed))

# Sanity report on the cleaned frame: total NaN cells and remaining duplicate rows.
missing = data.isna().sum().sum()
duplicates = data.duplicated().sum()
for message, count in (
    ("\nThere are {:,.0f} missing values in the data.", missing),
    ("There are {:,.0f} duplicate records in the data.", duplicates),
):
    print(message.format(count))

display(data.head())

drop duplicates took: 0.41 s

There are 0 missing values in the data.
There are 0 duplicate records in the data.


Unnamed: 0,Color,Turbidity,Copper,Manganese,Chloride,Iron,Fluoride,Nitrate,Odor,Chlorine,Sulfate,Total Dissolved Solids,Zinc,Lead,pH,Target
0,4,0.231956,0.715433,0.4484932,119.769705,1.1546740000000001e-17,2.108228,7.6312,1,3.270075,230.194138,-1.125526,2.693937,6.350348e-86,1,1
1,3,1.051458,0.773502,1.540051,150.015099,0.003600729,1.422666,11.236134,0,4.001868,88.068009,-1.156116,3.590989,3.99267e-61,2,1
2,1,0.001308,0.037652,3.1907e-07,126.665161,0.2242093,0.718139,9.8383,3,3.743012,92.617198,0.828813,1.265744,8.205945e-08,1,0
3,1,0.001784,0.206894,0.007835654,90.619648,9.019033e-08,0.139084,4.363331,1,3.854285,209.857103,-0.4222,1.583872,1.850471e-138,1,0
4,2,0.733317,1.005681,8.823558e-07,212.13067,9.183142e-07,0.179237,9.636736,0,2.222523,117.492988,0.171229,0.800412,7.90796e-56,1,0


In [12]:
def prepare_train_test_data(data, target_col, test_size, random_state=21, stratify=False):
    """
    Split the data into training and test sets and robust-scale the features.

    The scaler is fitted on the training features only and then applied to the
    test features, so no test statistics leak into the scaling.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the feature columns and the target column.
    target_col : str
        Name of the label column; every other column is used as a feature.
    test_size : float or int
        Passed straight to `train_test_split`.
    random_state : int, default 21
        Seed for the shuffling done by `train_test_split`.
    stratify : bool, default False
        When True the split is stratified on the target, so train and test keep
        the same class proportions. The default keeps the original unstratified split.

    Returns
    -------
    X_train_scaled, X_test_scaled
        Scaled feature matrices as returned by `RobustScaler`.
    y_train, y_test
        Target values for the two partitions, still carrying the index of `data`.
    """
    X = data.drop(columns=target_col)
    y = data[target_col]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=test_size,
        random_state=random_state,
        stratify=y if stratify else None,
    )

    # Fit on the training partition only; the test partition is just transformed.
    scaler = RobustScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    print("Train Shape: {}".format(X_train_scaled.shape))
    print("Test Shape: {}".format(X_test_scaled.shape))

    return X_train_scaled, X_test_scaled, y_train, y_test

In [13]:
## Prepare Train and Test datasets ##
# Hold out 25% of the rows for testing; 'Target' is the label column.
print("Preparing Train and Test datasets")
X_train, X_test, y_train, y_test = prepare_train_test_data(
    data=data,
    target_col='Target',
    test_size=0.25,
)

Preparing Train and Test datasets
Train Shape: (338796, 15)
Test Shape: (112932, 15)


In [14]:
# # Initialize SVC model with the best hyperparameters ##
# parameters = {
#     'class_weight': 'balanced',
#     'probability': True,
#     'random_state': 21,
#     'C': 1.0,
#     'kernel': 'rbf'
# }
# svc = SVC(**parameters)

# ## Fit the model ##
# start_fit = time.time()
# print("\nFitting the model..")
# svc.fit(X_train, y_train)
# print("Fit took: ", time.time() - start_fit)

# svc_prob = svc.predict_proba(X_test)[:,1]
# svc_pred = pd.Series(svc.predict(X_test), name='Target')
# svc_auc = roc_auc_score(y_test, svc_prob)
# svc_f1 = f1_score(y_test, svc_pred)  

# ## Print model results ##
# print("\nSVC Test F1 accuracy: {:.2f}%, AUC: {:.5f}".format(svc_f1*100,svc_auc))

In [15]:
## Initialize XGBoost model with the best hyperparameters ##
# scale_pos_weight is set to the negative / positive ratio of the training labels.
n_negative = np.sum(y_train == 0)
n_positive = np.sum(y_train == 1)
ratio = float(n_negative) / n_positive

# Other (gamma, max_depth, min_child_weight) combinations that were tried:
#     (5, 5, 5), (1.5, 5, 1), (1, 5, 10)
best_hyperparameters = dict(
    scale_pos_weight=np.round(ratio, 2),
    tree_method='hist',
    random_state=21,
    gamma=0.5,
    max_depth=5,
    min_child_weight=5,
)
xgb = XGBClassifier(**best_hyperparameters)

## Fit the model ##
print("\nFitting the model..")
start_fit = time.time()
xgb.fit(X_train, y_train)
print("Fit took: ", time.time() - start_fit)

## Evaluate on the held-out test set ##
# Column 1 of predict_proba is the probability of the positive class.
xgb_prob = xgb.predict_proba(X_test)[:, 1]
xgb_pred = pd.Series(xgb.predict(X_test), name='Target')
xgb_f1 = f1_score(y_test, xgb_pred)
xgb_auc = roc_auc_score(y_test, xgb_prob)

## Print model results ##
print("\nXGB Test F1 accuracy: {:.2f}%, AUC: {:.5f}".format(xgb_f1*100,xgb_auc))


Fitting the model..
Fit took:  0.5682427883148193

XGB Test F1 accuracy: 81.92%, AUC: 0.91545


In [16]:
## Initialize RandomForest model with the best hyperparameters ##
# Class imbalance is handled by class_weight='balanced', so no explicit
# negative/positive ratio is needed in this cell.
best_hyperparameters = {
    'class_weight': 'balanced',
    'random_state': 21,
#     'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': 10, 'bootstrap': True
    'n_estimators': 200, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': 20, 'bootstrap': False
}
rfc = RandomForestClassifier(**best_hyperparameters)

## Fit the model ##
print("\nFitting the model..")
start_fit = time.time()
rfc.fit(X_train, y_train)
print("Fit took: ", time.time() - start_fit)

## Evaluate on the held-out test set ##
# Column 1 of predict_proba is the probability of the positive class.
rfc_prob = rfc.predict_proba(X_test)[:,1]
rfc_pred = pd.Series(rfc.predict(X_test), name='Target')
rfc_auc = roc_auc_score(y_test, rfc_prob)
rfc_f1 = f1_score(y_test, rfc_pred)

## Print model results ##
print("\nRF Test F1 accuracy: {:.2f}%, AUC: {:.5f}".format(rfc_f1*100,rfc_auc))


Fitting the model..
Fit took:  11.60996127128601

RF Test F1 accuracy: 82.33%, AUC: 0.91513


In [17]:
## Initialize the neural network (MLPClassifier) with the best hyperparameters ##
# No class weighting is passed to the MLP, so no negative/positive ratio is
# computed in this cell.
best_hyperparameters = {
    'max_iter': 200,
    'random_state': 21,
#     'solver': 'sgd', 'learning_rate': 'adaptive', 'hidden_layer_sizes': (50, 100, 50), 'alpha': 0.05, 'activation': 'relu'
    'solver': 'adam', 'learning_rate': 'constant', 'hidden_layer_sizes': (50, 100, 50), 'alpha': 0.05, 'activation': 'relu'
}
nn = MLPClassifier(**best_hyperparameters)

## Fit the model ##
print("\nFitting the model..")
start_fit = time.time()
nn.fit(X_train, y_train)
print("Fit took: ", time.time() - start_fit)

## Evaluate on the held-out test set ##
# Column 1 of predict_proba is the probability of the positive class.
nn_prob = nn.predict_proba(X_test)[:,1]
nn_pred = pd.Series(nn.predict(X_test), name='Target')
nn_auc = roc_auc_score(y_test, nn_prob)
nn_f1 = f1_score(y_test, nn_pred)

## Print model results ##
print("\nNN Test F1 accuracy: {:.2f}%, AUC: {:.5f}".format(nn_f1*100,nn_auc))


Fitting the model..
Fit took:  387.7050738334656

NN Test F1 accuracy: 82.19%, AUC: 0.91446
