### Importing required libraries

In [3]:
# Importing Libraries
import pandas as pd  
import numpy as np  
import matplotlib.pyplot as plt
import seaborn as sns  
from pandas.api.types import is_numeric_dtype
import joblib 
import random

### Importing dataset 

In [4]:
# Load the water-quality dataset from a CSV file; the relative path is resolved
# against the notebook's current working directory
df = pd.read_csv("Water Quality Prediction.csv")


### Generating Missing Values in the Dataset

In [5]:
def generate_missing_data(df, column_names, missing_percentage=0.1):
    """
    Blank out a random subset of rows in each of the given columns.

    Parameters:
    - df: DataFrame to corrupt; it is modified in place and also returned
    - column_names: Iterable of column labels that should receive NaN values
    - missing_percentage: Fraction of rows (0.1 means 10%) set to NaN in each
      column; a separate random sample of rows is drawn for every column

    Returns:
    - The same DataFrame object, now containing the injected NaN values
    """
    for target_column in column_names:
        # Number of rows to blank out, truncated toward zero
        rows_to_blank = int(len(df) * missing_percentage)
        # Sample without replacement so exactly that many distinct rows are hit
        chosen_labels = np.random.choice(df.index, size=rows_to_blank, replace=False)
        df.loc[chosen_labels, target_column] = np.nan
    return df

# Seed both random number generators used by the corruption helpers in this
# notebook (NumPy's global generator and the stdlib `random` module) before the
# first random draw, so that Restart Kernel -> Run All reproduces the same
# corrupted dataset every time.
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

# Columns where missing data will be introduced
columns_with_missing_data = ["pH", "Iron", "Source", "Month"]

# Add missing values to 10% of the rows of each listed column
df = generate_missing_data(df, columns_with_missing_data, missing_percentage=0.1)
df.head()


Unnamed: 0,pH,Iron,Nitrate,Chloride,Lead,Zinc,Color,Turbidity,Fluoride,Copper,...,Chlorine,Manganese,Total Dissolved Solids,Source,Water Temperature,Air Temperature,Month,Day,Time of Day,Target
0,8.332988,8.3e-05,8.605777,122.799772,3.71e-52,3.434827,Colorless,0.022683,0.607283,0.144599,...,3.708178,2.27e-15,332.118789,,,43.493324,,29.0,4.0,0
1,6.917863,8.1e-05,3.734167,227.029851,7.85e-94,1.245317,Faint Yellow,0.019007,0.622874,0.437835,...,3.292038,8.02e-07,284.641984,Lake,15.348981,71.220586,November,26.0,16.0,0
2,5.443762,0.020106,3.816994,230.99563,5.29e-76,0.52828,Light Yellow,0.319956,0.423423,0.431588,...,3.560224,0.07007989,570.054094,River,11.643467,44.89133,January,31.0,8.0,0
3,7.955339,0.143988,8.224944,178.12994,4e-176,4.027879,Near Colorless,0.166319,0.208454,0.239451,...,3.516907,0.02468295,100.043838,Ground,10.092392,60.843233,April,1.0,21.0,0
4,8.091909,0.002167,9.925788,186.540872,4.17e-132,3.807511,Light Yellow,0.004867,0.222912,0.616574,...,3.177849,0.003296139,168.075545,Spring,15.249416,69.336671,June,29.0,7.0,0


### Introducing Outliers

In [6]:
def introduce_outliers(df, column_names, outlier_percentage=0.05):
    """
    Overwrite a random share of each listed column with a single extreme value.

    The injected value is the column's mean plus ten standard deviations,
    evaluated on the column as it stands immediately before the overwrite.
    Every selected row of a column receives that same value.

    Parameters:
    - df: DataFrame to corrupt; it is modified in place and also returned
    - column_names: List of column names where outliers will be introduced
    - outlier_percentage: Fraction of rows to overwrite per column (default: 0.05, i.e. 5%)

    Returns:
    - The same DataFrame object with the outliers written into it
    """
    for target_column in column_names:
        sample_size = int(len(df) * outlier_percentage)
        picked_rows = np.random.choice(df.index, size=sample_size, replace=False)

        # Compute the replacement first so it reflects the not-yet-modified column
        current_values = df[target_column]
        extreme_value = current_values.mean() + 10 * current_values.std()

        df.loc[picked_rows, target_column] = extreme_value
        print(f"Introduced outliers in {target_column}: {sample_size} rows")
    return df

# Numeric columns that will receive the extreme values
columns_with_outliers = ["pH", "Iron", "Lead"]

# Overwrite 5% of the rows in each of those columns, then preview the result
df = introduce_outliers(
    df,
    columns_with_outliers,
    outlier_percentage=0.05,
)
df.head()


Introduced outliers in pH: 52428 rows
Introduced outliers in Iron: 52428 rows
Introduced outliers in Lead: 52428 rows


Unnamed: 0,pH,Iron,Nitrate,Chloride,Lead,Zinc,Color,Turbidity,Fluoride,Copper,...,Chlorine,Manganese,Total Dissolved Solids,Source,Water Temperature,Air Temperature,Month,Day,Time of Day,Target
0,8.332988,8.3e-05,8.605777,122.799772,3.71e-52,3.434827,Colorless,0.022683,0.607283,0.144599,...,3.708178,2.27e-15,332.118789,,,43.493324,,29.0,4.0,0
1,6.917863,8.1e-05,3.734167,227.029851,7.85e-94,1.245317,Faint Yellow,0.019007,0.622874,0.437835,...,3.292038,8.02e-07,284.641984,Lake,15.348981,71.220586,November,26.0,16.0,0
2,5.443762,0.020106,3.816994,230.99563,5.29e-76,0.52828,Light Yellow,0.319956,0.423423,0.431588,...,3.560224,0.07007989,570.054094,River,11.643467,44.89133,January,31.0,8.0,0
3,7.955339,0.143988,8.224944,178.12994,4e-176,4.027879,Near Colorless,0.166319,0.208454,0.239451,...,3.516907,0.02468295,100.043838,Ground,10.092392,60.843233,April,1.0,21.0,0
4,8.091909,0.002167,9.925788,186.540872,4.17e-132,3.807511,Light Yellow,0.004867,0.222912,0.616574,...,3.177849,0.003296139,168.075545,Spring,15.249416,69.336671,June,29.0,7.0,0


### Incorrect categories 

In [7]:
def introduce_incorrect_categories(df, column_name, invalid_categories, category_percentage=0.05):
    """
    Replace a random share of a categorical column with labels from an invalid set.

    Parameters:
    - df: DataFrame to corrupt; it is modified in place and also returned
    - column_name: The column where invalid categories will be written
    - invalid_categories: List of invalid labels; one is drawn at random
      (with replacement) for every selected row
    - category_percentage: Fraction of rows to overwrite (default: 0.05, i.e. 5%)

    Returns:
    - The same DataFrame object with the invalid labels written into it
    """
    row_count = int(len(df) * category_percentage)

    # Pick the distinct rows first, then one replacement label per picked row
    target_rows = np.random.choice(df.index, size=row_count, replace=False)
    replacement_labels = np.random.choice(invalid_categories, size=row_count)

    df.loc[target_rows, column_name] = replacement_labels
    print(f"Introduced incorrect categories in {column_name}: {row_count} rows")
    return df

# Labels to inject into the 'Color' column as invalid categories
incorrect_colors = ["Bluish Green", "Transparent"]

# Overwrite 5% of the 'Color' values with those labels, then preview the result
df = introduce_incorrect_categories(
    df,
    "Color",
    incorrect_colors,
    category_percentage=0.05,
)
df.head()


Introduced incorrect categories in Color: 52428 rows


Unnamed: 0,pH,Iron,Nitrate,Chloride,Lead,Zinc,Color,Turbidity,Fluoride,Copper,...,Chlorine,Manganese,Total Dissolved Solids,Source,Water Temperature,Air Temperature,Month,Day,Time of Day,Target
0,8.332988,8.3e-05,8.605777,122.799772,3.71e-52,3.434827,Colorless,0.022683,0.607283,0.144599,...,3.708178,2.27e-15,332.118789,,,43.493324,,29.0,4.0,0
1,6.917863,8.1e-05,3.734167,227.029851,7.85e-94,1.245317,Faint Yellow,0.019007,0.622874,0.437835,...,3.292038,8.02e-07,284.641984,Lake,15.348981,71.220586,November,26.0,16.0,0
2,5.443762,0.020106,3.816994,230.99563,5.29e-76,0.52828,Light Yellow,0.319956,0.423423,0.431588,...,3.560224,0.07007989,570.054094,River,11.643467,44.89133,January,31.0,8.0,0
3,7.955339,0.143988,8.224944,178.12994,4e-176,4.027879,Bluish Green,0.166319,0.208454,0.239451,...,3.516907,0.02468295,100.043838,Ground,10.092392,60.843233,April,1.0,21.0,0
4,8.091909,0.002167,9.925788,186.540872,4.17e-132,3.807511,Light Yellow,0.004867,0.222912,0.616574,...,3.177849,0.003296139,168.075545,Spring,15.249416,69.336671,June,29.0,7.0,0


### Unrealistic values in the dataset

In [8]:
def introduce_unrealistic_values(df, unrealistic_percentage=0.03):
    """
    Introduces unrealistic values like negative or excessively high values for certain columns.

    For each affected column an independent random sample of rows is overwritten
    with one fixed, physically implausible value:
    - "Water Temperature" -> -5 (negative water temperature)
    - "Air Temperature"   -> 150 (extremely high air temperature)
    - "Turbidity"         -> -1 (negative turbidity)

    Parameters:
    - df: DataFrame to corrupt; it is modified in place and also returned
    - unrealistic_percentage: Fraction of rows to overwrite in each column
      (default: 0.03, i.e. 3%)

    Returns:
    - The same DataFrame object with the unrealistic values written into it
    """
    # Column -> value to inject. Dict order fixes the order of the random draws.
    unrealistic_values = {
        "Water Temperature": -5,  # Negative water temperature (logical error)
        "Air Temperature": 150,   # Extremely high air temperature (logical error)
        "Turbidity": -1,          # Negative turbidity (invalid value)
    }

    num_unrealistic = int(len(df) * unrealistic_percentage)
    for column, bad_value in unrealistic_values.items():
        # A fresh sample per column, so the corrupted rows differ between columns
        bad_indices = np.random.choice(df.index, num_unrealistic, replace=False)
        df.loc[bad_indices, column] = bad_value

    print("Introduced unrealistic values in Water Temperature, Air Temperature, and Turbidity.")
    return df

# Inject the implausible temperature and turbidity readings
df = introduce_unrealistic_values(df=df)

# Preview the dataset with unrealistic values
df.head()


Introduced unrealistic values in Water Temperature, Air Temperature, and Turbidity.


Unnamed: 0,pH,Iron,Nitrate,Chloride,Lead,Zinc,Color,Turbidity,Fluoride,Copper,...,Chlorine,Manganese,Total Dissolved Solids,Source,Water Temperature,Air Temperature,Month,Day,Time of Day,Target
0,8.332988,8.3e-05,8.605777,122.799772,3.71e-52,3.434827,Colorless,0.022683,0.607283,0.144599,...,3.708178,2.27e-15,332.118789,,,43.493324,,29.0,4.0,0
1,6.917863,8.1e-05,3.734167,227.029851,7.85e-94,1.245317,Faint Yellow,0.019007,0.622874,0.437835,...,3.292038,8.02e-07,284.641984,Lake,15.348981,71.220586,November,26.0,16.0,0
2,5.443762,0.020106,3.816994,230.99563,5.29e-76,0.52828,Light Yellow,0.319956,0.423423,0.431588,...,3.560224,0.07007989,570.054094,River,11.643467,44.89133,January,31.0,8.0,0
3,7.955339,0.143988,8.224944,178.12994,4e-176,4.027879,Bluish Green,0.166319,0.208454,0.239451,...,3.516907,0.02468295,100.043838,Ground,10.092392,60.843233,April,1.0,21.0,0
4,8.091909,0.002167,9.925788,186.540872,4.17e-132,3.807511,Light Yellow,0.004867,0.222912,0.616574,...,3.177849,0.003296139,168.075545,Spring,15.249416,69.336671,June,29.0,7.0,0


### Invalid Date values in the dataset 

In [9]:
def introduce_invalid_dates(df):
    """
    Introduces invalid date/time values, such as incorrect month, day, or time.

    For each of the three columns an independent random 2% of the rows is overwritten:
    - "Month": one of the strings "13", "14", "15"
    - "Day": an integer from 32 to 40 (inclusive)
    - "Time of Day": an integer from 25 to 30 (inclusive)

    Parameters:
    - df: DataFrame to corrupt; it is modified in place and also returned

    Returns:
    - The same DataFrame object with the invalid values written into it
    """
    # Invalid months (13-15), written as strings
    month_indices = np.random.choice(df.index, int(len(df) * 0.02), replace=False)
    df.loc[month_indices, "Month"] = np.random.choice(["13", "14", "15"], len(month_indices))

    # Invalid days (32-40); range() excludes its stop value, so stop at 41 to include 40
    day_indices = np.random.choice(df.index, int(len(df) * 0.02), replace=False)
    df.loc[day_indices, "Day"] = np.random.choice(range(32, 41), len(day_indices))

    # Invalid hours (25-30); stop at 31 so that 30 can actually be drawn
    time_indices = np.random.choice(df.index, int(len(df) * 0.02), replace=False)
    df.loc[time_indices, "Time of Day"] = np.random.choice(range(25, 31), len(time_indices))

    print(f"Introduced invalid dates and times: {len(month_indices)} months, {len(day_indices)} days, {len(time_indices)} times")
    return df

# Inject out-of-range month, day and hour values
df = introduce_invalid_dates(df=df)

# Preview the dataset with invalid dates and times
df.head()


Introduced invalid dates and times: 20971 months, 20971 days, 20971 times


Unnamed: 0,pH,Iron,Nitrate,Chloride,Lead,Zinc,Color,Turbidity,Fluoride,Copper,...,Chlorine,Manganese,Total Dissolved Solids,Source,Water Temperature,Air Temperature,Month,Day,Time of Day,Target
0,8.332988,8.3e-05,8.605777,122.799772,3.71e-52,3.434827,Colorless,0.022683,0.607283,0.144599,...,3.708178,2.27e-15,332.118789,,,43.493324,,29.0,4.0,0
1,6.917863,8.1e-05,3.734167,227.029851,7.85e-94,1.245317,Faint Yellow,0.019007,0.622874,0.437835,...,3.292038,8.02e-07,284.641984,Lake,15.348981,71.220586,November,26.0,16.0,0
2,5.443762,0.020106,3.816994,230.99563,5.29e-76,0.52828,Light Yellow,0.319956,0.423423,0.431588,...,3.560224,0.07007989,570.054094,River,11.643467,44.89133,13,31.0,8.0,0
3,7.955339,0.143988,8.224944,178.12994,4e-176,4.027879,Bluish Green,0.166319,0.208454,0.239451,...,3.516907,0.02468295,100.043838,Ground,10.092392,60.843233,April,1.0,21.0,0
4,8.091909,0.002167,9.925788,186.540872,4.17e-132,3.807511,Light Yellow,0.004867,0.222912,0.616574,...,3.177849,0.003296139,168.075545,Spring,15.249416,69.336671,June,29.0,7.0,0


### Generating inconsistent units

In [10]:
def introduce_inconsistent_units(df, column_name, unit_percentage=0.05):
    """
    Introduces inconsistent units for a given column by mixing temperature scales.

    A random sample of rows is selected, and each selected value is rewritten with
    one of two formulas chosen by a coin flip:
    - value * 9/5 + 32    (Celsius -> Fahrenheit formula)
    - (value - 32) * 5/9  (Fahrenheit -> Celsius formula)

    A conversion is only defined for the "Water Temperature" column. For any other
    column the DataFrame is returned unchanged and a message says so.

    Parameters:
    - df: DataFrame to corrupt; it is modified in place and also returned
    - column_name: The column where inconsistent units will be introduced
    - unit_percentage: Fraction of rows to rewrite (default: 0.05, i.e. 5%)

    Returns:
    - The same DataFrame object
    """
    # The column check does not depend on the row, so decide once up front
    # rather than looping over every sampled row and doing nothing.
    if column_name != "Water Temperature":
        print(f"No unit conversion is defined for {column_name}: 0 rows changed")
        return df

    num_inconsistent = int(len(df) * unit_percentage)
    inconsistent_indices = np.random.choice(df.index, num_inconsistent, replace=False)

    # One coin flip per sampled row, drawn from the stdlib `random` module
    to_fahrenheit = np.array(
        [random.choice([True, False]) for _ in range(num_inconsistent)],
        dtype=bool,
    )
    fahrenheit_rows = inconsistent_indices[to_fahrenheit]
    celsius_rows = inconsistent_indices[~to_fahrenheit]

    # Apply each formula to its whole group at once instead of one scalar
    # .loc read/write per row.
    if len(fahrenheit_rows):
        current = df.loc[fahrenheit_rows, column_name].to_numpy()
        df.loc[fahrenheit_rows, column_name] = current * 9/5 + 32  # Convert to Fahrenheit
    if len(celsius_rows):
        current = df.loc[celsius_rows, column_name].to_numpy()
        df.loc[celsius_rows, column_name] = (current - 32) * 5/9  # Convert to Celsius

    print(f"Introduced inconsistent units in {column_name}: {num_inconsistent} rows")
    return df

# Mix temperature scales in 5% of the 'Water Temperature' readings
df = introduce_inconsistent_units(
    df,
    "Water Temperature",
    unit_percentage=0.05,
)

# Preview the dataset with inconsistent units
df.head()


Introduced inconsistent units in Water Temperature: 52428 rows


Unnamed: 0,pH,Iron,Nitrate,Chloride,Lead,Zinc,Color,Turbidity,Fluoride,Copper,...,Chlorine,Manganese,Total Dissolved Solids,Source,Water Temperature,Air Temperature,Month,Day,Time of Day,Target
0,8.332988,8.3e-05,8.605777,122.799772,3.71e-52,3.434827,Colorless,0.022683,0.607283,0.144599,...,3.708178,2.27e-15,332.118789,,,43.493324,,29.0,4.0,0
1,6.917863,8.1e-05,3.734167,227.029851,7.85e-94,1.245317,Faint Yellow,0.019007,0.622874,0.437835,...,3.292038,8.02e-07,284.641984,Lake,59.628165,71.220586,November,26.0,16.0,0
2,5.443762,0.020106,3.816994,230.99563,5.29e-76,0.52828,Light Yellow,0.319956,0.423423,0.431588,...,3.560224,0.07007989,570.054094,River,11.643467,44.89133,13,31.0,8.0,0
3,7.955339,0.143988,8.224944,178.12994,4e-176,4.027879,Bluish Green,0.166319,0.208454,0.239451,...,3.516907,0.02468295,100.043838,Ground,10.092392,60.843233,April,1.0,21.0,0
4,8.091909,0.002167,9.925788,186.540872,4.17e-132,3.807511,Light Yellow,0.004867,0.222912,0.616574,...,3.177849,0.003296139,168.075545,Spring,15.249416,69.336671,June,29.0,7.0,0


### Saving the dataset with issues

In [11]:
# Save the dataset with errors to a CSV file; index=False keeps the DataFrame's
# row index out of the written file
df.to_csv("Water_data_with_errors.csv", index=False)