## 0. Setup


In [4]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

# Numpy arrays are used to store training and test data.
import numpy as np

# Pandas is used to manipulate tabular data.
import pandas as pd

# Matplotlib is used to plot graphs.
%matplotlib inline 
import matplotlib as mpl
import matplotlib.pyplot as plt
# Style options for plots.
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Ignore useless warnings (see SciPy issue #5998).
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

# Convenience function to create display a progress bar.
# Source : https://stackoverflow.com/questions/3173320/text-progress-bar-in-the-console
def print_progress_bar (iteration, total, prefix = '', suffix = '', decimals = 1, length = 100, fill = '█', printEnd = "\r"):
    """
    Call in a loop to create terminal progress bar
    @params:
        iteration   - Required  : current iteration (Int)
        total       - Required  : total iterations (Int)
        prefix      - Optional  : prefix string (Str)
        suffix      - Optional  : suffix string (Str)
        decimals    - Optional  : positive number of decimals in percent complete (Int)
        length      - Optional  : character length of bar (Int)
        fill        - Optional  : bar fill character (Str)
        printEnd    - Optional  : end character (e.g. "\r", "\r\n") (Str)
    """
    percent = ("{0:." + str(decimals) + "f}").format(100 * (iteration / float(total)))
    filledLength = int(length * iteration // total)
    bar = fill * filledLength + '-' * (length - filledLength)
    print(f'\r{prefix} |{bar}| {percent}% {suffix}', end = printEnd)
    # Print New Line on Complete
    if iteration == total: 
        print()
 
# Saves a figure to a file
def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join("./figs", fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

In [5]:
from sklearn.model_selection import train_test_split
dataset = pd.read_csv("drinking_water_potability.csv")

train_set, test_set = train_test_split(
    dataset, train_size=0.75, random_state=42, stratify=dataset["Potability"]
)

target = "Potability"
feature_names = list(filter(lambda x: x != target, train_set.columns))

In [9]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer

#Choosing the feature engineering pipeline
feature_engineering = 'mutlivariate_min_max'

pipelines = {'univariate' : Pipeline([('mean_imputer', SimpleImputer(strategy='mean')),
                                      ('std_scaler', StandardScaler())]),
             'mutlivariate' : Pipeline([('it_imputer', IterativeImputer(sample_posterior=True,random_state=666)),
                                        ('std_scaler', StandardScaler())]),
             'mutlivariate_min_max' : Pipeline([('it_imputer', IterativeImputer(sample_posterior=True,random_state=666)),
                                                ('min_max_scaler', MinMaxScaler())]) }

used_pipeline = pipelines[feature_engineering]

X_scaled = used_pipeline.fit_transform(train_set.drop(["Potability"],axis=1))

data_processed = pd.merge(pd.DataFrame(X_scaled,columns=feature_names)

In [10]:
from sklearn.ensemble import (
    AdaBoostClassifier,
    ExtraTreesClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    make_scorer,
    precision_score,
    recall_score,
)
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

models = [
    DecisionTreeClassifier(max_depth=7),
    LogisticRegression(),
    SVC(),
    KNeighborsClassifier(),
    GaussianNB(),
    RandomForestClassifier(),
    ExtraTreesClassifier(n_estimators=10),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    GaussianNB(),
]

metrics = [accuracy_score, f1_score, precision_score, recall_score]
results = []

In [11]:
X_train, X_val, y_train, y_val = train_test_split(
    data_processed[feature_names], data_processed[target], train_size=0.8, random_state=20
)
for model in tqdm(models, desc="Training different models", unit="model"):
    scores = {"name": model.__class__.__name__}
    model = Pipeline([("transformer", transformer), ("model", model)])
    scores = {
        **scores,
        **{
            "cv_"
            + func.__name__: np.mean(
                cross_val_score(
                    model, X_train, y_train, cv=5, scoring=make_scorer(func)
                )
            )
            for func in metrics
        },
    }
    results.append(scores)
results_df = pd.DataFrame.from_records(results)

KeyError: 'Potability'

In [None]:
results_df.sort_values(by=["cv_precision_score"], ascending=False)

results_df.set_index("name")["cv_precision_score"].sort_values().plot.bar(
    ylim=[0, 1], grid=True
)

X_train, y_train = train_set[feature_names], train_set[target]
X_test, y_test = test_set[feature_names], test_set[target]
