In [1]:
# Importing Libraries
import pandas as pd  # Dataframe Manipulation  
import numpy as np  # Array/lists Handlings
import matplotlib.pyplot as plt  # Data Visualization
import seaborn as sns  # For data visualization
from pandas.api.types import is_numeric_dtype
import joblib 
import os

In [2]:
# Importing Dataset
df = pd.read_csv("Water Quality Prediction.csv")

In [3]:
df.shape

(1048575, 23)

In [4]:
df.head()

Unnamed: 0,pH,Iron,Nitrate,Chloride,Lead,Zinc,Color,Turbidity,Fluoride,Copper,...,Chlorine,Manganese,Total Dissolved Solids,Source,Water Temperature,Air Temperature,Month,Day,Time of Day,Target
0,8.332988,8.3e-05,8.605777,122.799772,3.71e-52,3.434827,Colorless,0.022683,0.607283,0.144599,...,3.708178,2.27e-15,332.118789,,,43.493324,January,29.0,4.0,0
1,6.917863,8.1e-05,3.734167,227.029851,7.85e-94,1.245317,Faint Yellow,0.019007,0.622874,0.437835,...,3.292038,8.02e-07,284.641984,Lake,15.348981,71.220586,November,26.0,16.0,0
2,5.443762,0.020106,3.816994,230.99563,5.29e-76,0.52828,Light Yellow,0.319956,0.423423,0.431588,...,3.560224,0.07007989,570.054094,River,11.643467,44.89133,January,31.0,8.0,0
3,7.955339,0.143988,8.224944,178.12994,4e-176,4.027879,Near Colorless,0.166319,0.208454,0.239451,...,3.516907,0.02468295,100.043838,Ground,10.092392,60.843233,April,1.0,21.0,0
4,8.091909,0.002167,9.925788,186.540872,4.17e-132,3.807511,Light Yellow,0.004867,0.222912,0.616574,...,3.177849,0.003296139,168.075545,Spring,15.249416,69.336671,June,29.0,7.0,0


In [5]:
df.shape

(1048575, 23)

In [6]:
# Data Information
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 23 columns):
 #   Column                  Non-Null Count    Dtype  
---  ------                  --------------    -----  
 0   pH                      1028344 non-null  float64
 1   Iron                    1041584 non-null  float64
 2   Nitrate                 1029880 non-null  float64
 3   Chloride                1017741 non-null  float64
 4   Lead                    1043891 non-null  float64
 5   Zinc                    1020900 non-null  float64
 6   Color                   1047594 non-null  object 
 7   Turbidity               1039881 non-null  float64
 8   Fluoride                1015357 non-null  float64
 9   Copper                  1013693 non-null  float64
 10  Odor                    1017243 non-null  float64
 11  Sulfate                 1014050 non-null  float64
 12  Conductivity            1019772 non-null  float64
 13  Chlorine                1038413 non-null  float64
 14  Ma

### Label Encoding

In [7]:
# Label Encoding
from sklearn.preprocessing import LabelEncoder # type: ignore
# Initialize encoder
encoder = LabelEncoder()

In [8]:
df["Source"] = encoder.fit_transform(df["Source"])  
df["Color"] = encoder.fit_transform(df["Color"])  
df["Month"] = encoder.fit_transform(df["Month"]) 

In [9]:
# Checking unique values after encoding
print(df["Source"].unique())
print(df["Month"].unique())
print(df["Color"].unique())

[8 2 4 1 5 6 0 3 7]
[ 4  9  0  6  7 11  8  5  1 10  2  3 12]
[0 1 2 3 4 5]


### Handling Missing and null values

In [10]:
# Handling Missing values
df.isna().sum()

pH                        20231
Iron                       6991
Nitrate                   18695
Chloride                  30834
Lead                       4684
Zinc                      27675
Color                         0
Turbidity                  8694
Fluoride                  33218
Copper                    34882
Odor                      31332
Sulfate                   34525
Conductivity              28803
Chlorine                  10162
Manganese                 19339
Total Dissolved Solids      298
Source                        0
Water Temperature         29688
Air Temperature            5303
Month                         0
Day                       17549
Time of Day               20361
Target                        0
dtype: int64

In [11]:
# Data Cleaning
numeric_columns = []
for i in df.columns:
    if is_numeric_dtype(df[i]):
        numeric_columns.append(i)

In [12]:
for i in numeric_columns:
    if -0.5 < df[i].skew() < 0.5:
        df.fillna(df[i].mean(), inplace=True)
        print(df[i].mean())
    else:
        df.fillna(df[i].median(), inplace=True)
        print(df[i].median())

7.449869487765477
0.002167424
5.628327343
172.4878517
3.7899999999999997e-62
1.120358173
1.6705366807333764
0.203854896
0.794681686
0.357012449
1.798200478
131.3759622
390.5935192
3.207667322
0.000622466
264.71409588383455
3.5672527000929835
16.13712633
59.766584835496815
5.6105714898791215
15.593353608126074
11.412160115051753
0.0


In [13]:
print(numeric_columns)
df.head()

['pH', 'Iron', 'Nitrate', 'Chloride', 'Lead', 'Zinc', 'Color', 'Turbidity', 'Fluoride', 'Copper', 'Odor', 'Sulfate', 'Conductivity', 'Chlorine', 'Manganese', 'Total Dissolved Solids', 'Source', 'Water Temperature', 'Air Temperature', 'Month', 'Day', 'Time of Day', 'Target']


Unnamed: 0,pH,Iron,Nitrate,Chloride,Lead,Zinc,Color,Turbidity,Fluoride,Copper,...,Chlorine,Manganese,Total Dissolved Solids,Source,Water Temperature,Air Temperature,Month,Day,Time of Day,Target
0,8.332988,8.3e-05,8.605777,122.799772,3.71e-52,3.434827,0,0.022683,0.607283,0.144599,...,3.708178,2.27e-15,332.118789,8,7.449869,43.493324,4,29.0,4.0,0
1,6.917863,8.1e-05,3.734167,227.029851,7.85e-94,1.245317,1,0.019007,0.622874,0.437835,...,3.292038,8.02e-07,284.641984,2,15.348981,71.220586,9,26.0,16.0,0
2,5.443762,0.020106,3.816994,230.99563,5.29e-76,0.52828,2,0.319956,0.423423,0.431588,...,3.560224,0.07007989,570.054094,4,11.643467,44.89133,4,31.0,8.0,0
3,7.955339,0.143988,8.224944,178.12994,4e-176,4.027879,3,0.166319,0.208454,0.239451,...,3.516907,0.02468295,100.043838,1,10.092392,60.843233,0,1.0,21.0,0
4,8.091909,0.002167,9.925788,186.540872,4.17e-132,3.807511,2,0.004867,0.222912,0.616574,...,3.177849,0.003296139,168.075545,5,15.249416,69.336671,6,29.0,7.0,0


In [14]:
df.isna().sum()

pH                        0
Iron                      0
Nitrate                   0
Chloride                  0
Lead                      0
Zinc                      0
Color                     0
Turbidity                 0
Fluoride                  0
Copper                    0
Odor                      0
Sulfate                   0
Conductivity              0
Chlorine                  0
Manganese                 0
Total Dissolved Solids    0
Source                    0
Water Temperature         0
Air Temperature           0
Month                     0
Day                       0
Time of Day               0
Target                    0
dtype: int64

In [15]:
# Exporting the preprocessed data to a new CSV file
file_name = "cleaned_data.csv"

# Checking if the file already exists or not 
if not os.path.exists(file_name):
    df.to_csv(file_name, index=False)
    print(f"File '{file_name}' saved successfully.")
else:
    print(f"File '{file_name}' already exists.So skipping export.")

File 'cleaned_data.csv' saved successfully.
