In [1]:
import os
# Move the working directory up one level, presumably so the relative
# 'data/...' and 'model/...' paths used later resolve (confirm the repo layout).
# NOTE: '..' is relative to the current working directory, so re-running this
# cell without restarting the kernel moves up one more level each time.
os.chdir('..')

import joblib
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer

In [21]:
# Read the dataset and drop the 'Target' column before preprocessing
df = pd.read_csv('data/dataset.csv').drop(columns='Target')

In [22]:
# Preview the first rows of the loaded frame
df.head()

Unnamed: 0,Index,pH,Iron,Nitrate,Chloride,Lead,Zinc,Color,Turbidity,Fluoride,...,Conductivity,Chlorine,Manganese,Total Dissolved Solids,Source,Water Temperature,Air Temperature,Month,Day,Time of Day
0,0,8.332988,8.3e-05,8.605777,122.799772,3.713298e-52,3.434827,Colorless,0.022683,0.607283,...,471.683357,3.708178,2.269945e-15,332.118789,,,43.493324,January,29.0,4.0
1,1,6.917863,8.1e-05,3.734167,227.029851,7.849261999999999e-94,1.245317,Faint Yellow,0.019007,0.622874,...,432.844908,3.292038,8.024076e-07,284.641984,Lake,15.348981,71.220586,November,26.0,16.0
2,2,5.443762,0.020106,3.816994,230.99563,5.2866160000000004e-76,0.52828,Light Yellow,0.319956,0.423423,...,990.201209,3.560224,0.07007989,570.054094,River,11.643467,44.89133,January,31.0,8.0
3,3,7.955339,0.143988,8.224944,178.12994,3.997118e-176,4.027879,Near Colorless,0.166319,0.208454,...,237.028467,3.516907,0.02468295,100.043838,Ground,10.092392,60.843233,April,1.0,21.0
4,4,8.091909,0.002167,9.925788,186.540872,4.171069e-132,3.807511,Light Yellow,0.004867,0.222912,...,385.025855,3.177849,0.003296139,168.075545,Spring,15.249416,69.336671,June,29.0,7.0


### Preprocessing Pipeline

In [10]:
# Load the saved encoder and scaler objects used by the preprocessing functions
# below (presumably fitted during training — confirm).
# NOTE(review): joblib.load unpickles the file, so only load artifacts from a
# trusted source.
encoder = joblib.load('model/ordinal_encoder.joblib')
scaler = joblib.load('model/scaler.joblib')

def fill_missing_with_mean(X):
    """Fill missing values in the numeric measurement columns with the column mean.

    The mean of each column is computed from ``X`` itself, i.e. from the frame
    being transformed.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame containing every column listed in ``missing_val_columns``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the listed columns imputed. ``X`` is not modified.

    Raises
    ------
    KeyError
        If one of the listed columns is absent from ``X``.
    """
    missing_val_columns = ['pH', 'Iron', 'Nitrate', 'Chloride', 'Lead', 'Zinc',
                           'Turbidity', 'Fluoride', 'Copper', 'Odor', 'Sulfate', 'Conductivity',
                           'Chlorine', 'Manganese', 'Total Dissolved Solids', 'Water Temperature', 'Air Temperature']

    # Work on a copy so the caller's frame keeps its original values.
    X = X.copy()
    for col in missing_val_columns:
        # Assign the result back instead of `X[col].fillna(..., inplace=True)`:
        # the in-place form acts on a temporary Series, which pandas warns about
        # and which has no effect on X once Copy-on-Write is enabled.
        X[col] = X[col].fillna(X[col].mean())
    return X

def fill_color_mapping(X):
    """Replace the 'Color' labels with their relative frequency in ``X``.

    Missing labels are first replaced by 'Near Colorless'. Every row then
    receives ``count(rows sharing its label) / len(X)``.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame with a 'Color' column.

    Returns
    -------
    pandas.DataFrame
        A new frame whose 'Color' column holds the frequencies. ``X`` is not
        modified.

    Raises
    ------
    KeyError
        If ``X`` has no 'Color' column.
    """
    # Non-inplace fillna: the chained `X['Color'].fillna(..., inplace=True)` form
    # acts on a temporary Series and does not update X under Copy-on-Write, which
    # would leave NaN labels (and therefore NaN frequencies) behind.
    color = X['Color'].fillna('Near Colorless')

    # NOTE(review): the frequencies come from the frame being transformed, so the
    # encoded value of a label depends on the batch it arrives in — confirm this
    # matches how the model was trained.
    color_frequency = color.groupby(color).transform('count') / len(X)
    return X.assign(Color=color_frequency)

def fill_source_with_mode(X):
    """Fill missing 'Source' values with 'Stream' and ordinal-encode the column.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame with a 'Source' column.

    Returns
    -------
    pandas.DataFrame
        A new frame whose 'Source' column holds the encoder's codes. ``X`` is
        not modified.

    Raises
    ------
    KeyError
        If ``X`` has no 'Source' column.
    """
    # Work on a copy so the caller's frame keeps its original labels.
    X = X.copy()
    # 'Stream' is a hard-coded fill value (presumably the mode of the training
    # data — confirm); it is not recomputed from X.
    X['Source'] = X['Source'].fillna('Stream')
    # Use transform, not fit_transform: the module-level `encoder` is loaded from
    # disk (assumed already fitted on the 'Source' column — confirm). Refitting
    # here would rebuild the label -> code mapping from whatever labels this
    # particular batch contains and would overwrite the loaded encoder's state.
    X['Source'] = encoder.transform(X[['Source']])
    return X

def delete_non_important_columns(X):
    """Return ``X`` without the 'Day', 'Index', 'Month' and 'Time of Day' columns.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame containing all four columns.

    Returns
    -------
    pandas.DataFrame
        A new frame without those columns. ``X`` is not modified, so the same
        input frame can be transformed again without a KeyError.

    Raises
    ------
    KeyError
        If any of the four columns is absent from ``X``.
    """
    # drop() returns a new frame; `del X[...]` removed the columns from the
    # caller's frame and made a second call on that frame fail.
    return X.drop(columns=['Day', 'Index', 'Month', 'Time of Day'])

def create_new_columns(X):
    """Add binned versions of 'Iron', 'Nitrate' and 'Copper' to the frame.

    Each source column is cut into three intervals and labelled 0, 0.4 or 1,
    producing 'Iron_Bin', 'Nitrate_Bin' and 'Copper_Bin' (appended in that
    order).

    Parameters
    ----------
    X : pandas.DataFrame
        Frame containing 'Iron', 'Nitrate' and 'Copper'.

    Returns
    -------
    pandas.DataFrame
        A new frame with the three extra columns. ``X`` is not modified.

    Raises
    ------
    KeyError
        If one of the three source columns is absent from ``X``.
    """
    bin_labels = [0, 0.4, 1]
    # Source column -> bin edges. Insertion order fixes the order in which the
    # new columns are appended.
    bin_edges_by_column = {
        'Iron': [0, 0.1, 1, 20],
        'Nitrate': [0, 1, 5, 100],
        'Copper': [0, 0.02, 1, 20],
    }

    # Work on a copy so the caller's frame does not gain columns as a side effect.
    X = X.copy()
    for column, bin_edges in bin_edges_by_column.items():
        # pd.cut defaults apply: intervals are right-closed and the lowest edge is
        # excluded, so values <= 0 or above the last edge become NaN.
        X[f'{column}_Bin'] = pd.cut(X[column], bins=bin_edges, labels=bin_labels)
    return X

def scale_features(X):
    """Pass ``X`` through the module-level ``scaler`` and return its output."""
    scaled = scaler.transform(X)
    return scaled

# Assemble the preprocessing pipeline: every function above is wrapped in a
# FunctionTransformer, and the steps run in the order listed here.
preprocessing_pipeline = Pipeline(steps=[
    (step_name, FunctionTransformer(step_func))
    for step_name, step_func in (
        ('fill_missing_with_mean', fill_missing_with_mean),
        ('fill_color_mapping', fill_color_mapping),
        ('fill_source_with_mode', fill_source_with_mode),
        ('delete_non_important_columns', delete_non_important_columns),
        ('create_new_columns', create_new_columns),
        ('scale_features', scale_features),
    )
])

In [14]:
# Run the raw frame through every preprocessing step.
# Despite its name, the displayed result is a NumPy array (the output of the
# final scaling step), not a DataFrame.
cleaned_df = preprocessing_pipeline.transform(df)
cleaned_df

array([[6.13811156e-01, 4.31312437e-06, 8.65682172e-02, ...,
        0.00000000e+00, 1.00000000e+00, 4.00000000e-01],
       [4.94427600e-01, 4.16150817e-06, 3.58775094e-02, ...,
        0.00000000e+00, 4.00000000e-01, 4.00000000e-01],
       [3.70068737e-01, 1.03889364e-03, 3.67393536e-02, ...,
        0.00000000e+00, 4.00000000e-01, 4.00000000e-01],
       ...,
       [5.98000214e-01, 2.56852418e-07, 7.10341036e-02, ...,
        0.00000000e+00, 1.00000000e+00, 4.00000000e-01],
       [6.08796634e-01, 6.38825283e-05, 1.02973582e-01, ...,
        0.00000000e+00, 1.00000000e+00, 4.00000000e-01],
       [5.46872396e-01, 1.17604823e-03, 4.80353528e-02, ...,
        0.00000000e+00, 4.00000000e-01, 1.00000000e+00]])

In [20]:
# Persist the pipeline object to model/preprocessing_pipeline.joblib.
# NOTE(review): the pipeline steps wrap functions defined in this notebook, and
# pickle stores functions by reference rather than by value, so the loading side
# presumably needs the same function definitions (and the `encoder` / `scaler`
# objects they use) available — verify before relying on this file elsewhere.
joblib.dump(preprocessing_pipeline, 'model/preprocessing_pipeline.joblib')

['model/preprocessing_pipeline.joblib']

### Predictions

In [15]:
# Load the saved model and predict on the preprocessed feature array produced by
# the pipeline above.
# NOTE(review): joblib.load unpickles the file; only load trusted artifacts.
model = joblib.load('model/model.joblib')
model.predict(cleaned_df)