### Importing required libraries

In [1]:
# Importing Libraries
import pandas as pd  
import numpy as np  
import matplotlib.pyplot as plt
import seaborn as sns  
from pandas.api.types import is_numeric_dtype
import joblib 
import random

### Importing dataset 

In [2]:
# Importing Dataset
df = pd.read_csv("/Users/brahmareddy/Desktop/ml_pipeline_project/data/raw_data/data_for_issues.csv")


In [3]:
df['Color'].unique()

array([0, 1, 2, 3, 4, 5])

In [4]:
df.head()

Unnamed: 0,pH,Iron,Nitrate,Chloride,Lead,Zinc,Color,Turbidity,Fluoride,Copper,...,Conductivity,Chlorine,Manganese,Total Dissolved Solids,Source,Water Temperature,Air Temperature,Month,Day,Time of Day
0,8.33,0.0,8.61,122.8,3.71e-52,3.43,0,0.02,0.61,0.14,...,471.68,3.71,2.27e-15,332.12,8.0,7.449868,43.49,4.0,29.0,4.0
1,6.92,0.0,3.73,227.03,7.85e-94,1.25,1,0.02,0.62,0.44,...,432.84,3.29,0.0,284.64,2.0,15.348981,71.22,9.0,26.0,16.0
2,5.44,0.02,3.82,231.0,5.29e-76,0.53,2,0.32,0.42,0.43,...,990.2,3.56,0.07,570.05,4.0,11.643467,44.89,4.0,31.0,8.0
3,7.96,0.14,8.22,178.13,4e-176,4.03,3,0.17,0.21,0.24,...,237.03,3.52,0.02,100.04,1.0,10.092392,60.84,0.0,1.0,21.0
4,8.09,0.0,9.93,186.54,4.17e-132,3.81,2,0.0,0.22,0.62,...,385.03,3.18,0.0,168.08,5.0,15.249416,69.34,6.0,29.0,7.0


### Generating  Missing Values in 

In [5]:
def generate_missing_data(df, column_names, missing_percentage=0.1):
    """
    Introduce missing values in the given columns for a specified percentage of rows.
    """
    for column in column_names:
        # Calculating how many missing values to introduce
        num_missing = int(len(df) * missing_percentage)
        missing_rows = np.random.choice(df.index, num_missing, replace=False)
        df.loc[missing_rows, column] = np.nan
    return df

# showing the columns where missing data will be introduced
columns_with_missing_data = ["pH", "Iron", "Source", "Month"]

# Add missing values
df = generate_missing_data(df, columns_with_missing_data, missing_percentage=0.1)
df.head()


Unnamed: 0,pH,Iron,Nitrate,Chloride,Lead,Zinc,Color,Turbidity,Fluoride,Copper,...,Conductivity,Chlorine,Manganese,Total Dissolved Solids,Source,Water Temperature,Air Temperature,Month,Day,Time of Day
0,8.33,0.0,8.61,122.8,3.71e-52,3.43,0,0.02,0.61,0.14,...,471.68,3.71,2.27e-15,332.12,8.0,7.449868,43.49,4.0,29.0,4.0
1,6.92,0.0,3.73,227.03,7.85e-94,1.25,1,0.02,0.62,0.44,...,432.84,3.29,0.0,284.64,2.0,15.348981,71.22,9.0,26.0,16.0
2,5.44,0.02,3.82,231.0,5.29e-76,0.53,2,0.32,0.42,0.43,...,990.2,3.56,0.07,570.05,4.0,11.643467,44.89,4.0,31.0,8.0
3,7.96,0.14,8.22,178.13,4e-176,4.03,3,0.17,0.21,0.24,...,237.03,3.52,0.02,100.04,1.0,10.092392,60.84,0.0,1.0,21.0
4,8.09,0.0,9.93,186.54,4.17e-132,3.81,2,0.0,0.22,0.62,...,385.03,3.18,0.0,168.08,5.0,15.249416,69.34,6.0,29.0,7.0


### Checking for Outliers

In [6]:
def introduce_outliers(df, column_names, outlier_percentage=0.05):
    """
    Introduces extreme outlier values (outliers) in the specified columns for a given percentage of rows.
    
    Parameters:
    - column_names: List of column names where outliers will be introduced
    - outlier_percentage: Percentage of rows to introduce outliers (default: 5%)
    """
    for column in column_names:
        num_outliers = int(len(df) * outlier_percentage)
        outlier_indices = np.random.choice(df.index, num_outliers, replace=False)
        df.loc[outlier_indices, column] = df[column].mean() + 10 * df[column].std() 
        print(f"Introduced outliers in {column}: {num_outliers} rows")
    return df

# Specify columns to introduce outliers
columns_with_outliers = ["pH", "Iron", "Lead"]

# Introducing outliers
df = introduce_outliers(df, columns_with_outliers, outlier_percentage=0.05)
df.head()  


Introduced outliers in pH: 20364 rows
Introduced outliers in Iron: 20364 rows
Introduced outliers in Lead: 20364 rows


Unnamed: 0,pH,Iron,Nitrate,Chloride,Lead,Zinc,Color,Turbidity,Fluoride,Copper,...,Conductivity,Chlorine,Manganese,Total Dissolved Solids,Source,Water Temperature,Air Temperature,Month,Day,Time of Day
0,8.33,0.0,8.61,122.8,5.001763,3.43,0,0.02,0.61,0.14,...,471.68,3.71,2.27e-15,332.12,8.0,7.449868,43.49,4.0,29.0,4.0
1,16.214978,0.0,3.73,227.03,7.85e-94,1.25,1,0.02,0.62,0.44,...,432.84,3.29,0.0,284.64,2.0,15.348981,71.22,9.0,26.0,16.0
2,5.44,0.02,3.82,231.0,5.29e-76,0.53,2,0.32,0.42,0.43,...,990.2,3.56,0.07,570.05,4.0,11.643467,44.89,4.0,31.0,8.0
3,7.96,0.14,8.22,178.13,4e-176,4.03,3,0.17,0.21,0.24,...,237.03,3.52,0.02,100.04,1.0,10.092392,60.84,0.0,1.0,21.0
4,8.09,0.0,9.93,186.54,4.17e-132,3.81,2,0.0,0.22,0.62,...,385.03,3.18,0.0,168.08,5.0,15.249416,69.34,6.0,29.0,7.0


### Incorrect categories 

In [7]:
def introduce_incorrect_categories(df, column_name, invalid_numeric_codes, category_percentage=0.05):
    """
    Introduces invalid numeric codes in a categorical (encoded) column for a given percentage of rows.

    Parameters:
    - df: DataFrame
    - column_name: The column where invalid codes will be introduced
    - invalid_numeric_codes: List of invalid numeric codes to insert
    - category_percentage: Percentage of rows to introduce invalid codes (default: 5%)
    """
    num_invalid = int(len(df) * category_percentage)
    invalid_indices = np.random.choice(df.index, num_invalid, replace=False)
    df.loc[invalid_indices, column_name] = np.random.choice(invalid_numeric_codes, num_invalid)
    print(f"Introduced incorrect numeric codes in '{column_name}': {num_invalid} rows")
    return df

# Example usage:
invalid_color_codes = [-1, 99]  # Values not in 0–5 range

# Inject invalid numeric codes into 'Color'
df = introduce_incorrect_categories(df, "Color", invalid_color_codes, category_percentage=0.05)

# Preview the changes
df.head()

Introduced incorrect numeric codes in 'Color': 20364 rows


Unnamed: 0,pH,Iron,Nitrate,Chloride,Lead,Zinc,Color,Turbidity,Fluoride,Copper,...,Conductivity,Chlorine,Manganese,Total Dissolved Solids,Source,Water Temperature,Air Temperature,Month,Day,Time of Day
0,8.33,0.0,8.61,122.8,5.001763,3.43,0,0.02,0.61,0.14,...,471.68,3.71,2.27e-15,332.12,8.0,7.449868,43.49,4.0,29.0,4.0
1,16.214978,0.0,3.73,227.03,7.85e-94,1.25,1,0.02,0.62,0.44,...,432.84,3.29,0.0,284.64,2.0,15.348981,71.22,9.0,26.0,16.0
2,5.44,0.02,3.82,231.0,5.29e-76,0.53,2,0.32,0.42,0.43,...,990.2,3.56,0.07,570.05,4.0,11.643467,44.89,4.0,31.0,8.0
3,7.96,0.14,8.22,178.13,4e-176,4.03,3,0.17,0.21,0.24,...,237.03,3.52,0.02,100.04,1.0,10.092392,60.84,0.0,1.0,21.0
4,8.09,0.0,9.93,186.54,4.17e-132,3.81,2,0.0,0.22,0.62,...,385.03,3.18,0.0,168.08,5.0,15.249416,69.34,6.0,29.0,7.0


### Unrealistic values in the datast

In [8]:
def introduce_unrealistic_values(df):
    """
    Introduces unrealistic values like negative or excessively high values for certain columns.
    
    Parameters:
    - Introduces errors such as negative water temperature, extremely high air temperature, and negative turbidity
    """
    # Adding unrealistic values for water temperature, air temperature, and turbidity
    temp_indices = np.random.choice(df.index, int(len(df) * 0.03), replace=False)
    df.loc[temp_indices, "Water Temperature"] = -5  # Negative water temperature (logical error)

    air_temp_indices = np.random.choice(df.index, int(len(df) * 0.03), replace=False)
    df.loc[air_temp_indices, "Air Temperature"] = 150  # Extremely high air temperature (logical error)

    turbidity_indices = np.random.choice(df.index, int(len(df) * 0.03), replace=False)
    df.loc[turbidity_indices, "Turbidity"] = -1  # Negative turbidity (invalid value)

    print(f"Introduced unrealistic values in Water Temperature, Air Temperature, and Turbidity.")
    return df

# Introduce unrealistic values
df = introduce_unrealistic_values(df)
df.head()  # Preview the dataset with unrealistic values


Introduced unrealistic values in Water Temperature, Air Temperature, and Turbidity.


Unnamed: 0,pH,Iron,Nitrate,Chloride,Lead,Zinc,Color,Turbidity,Fluoride,Copper,...,Conductivity,Chlorine,Manganese,Total Dissolved Solids,Source,Water Temperature,Air Temperature,Month,Day,Time of Day
0,8.33,0.0,8.61,122.8,5.001763,3.43,0,0.02,0.61,0.14,...,471.68,3.71,2.27e-15,332.12,8.0,7.449868,43.49,4.0,29.0,4.0
1,16.214978,0.0,3.73,227.03,7.85e-94,1.25,1,0.02,0.62,0.44,...,432.84,3.29,0.0,284.64,2.0,15.348981,71.22,9.0,26.0,16.0
2,5.44,0.02,3.82,231.0,5.29e-76,0.53,2,0.32,0.42,0.43,...,990.2,3.56,0.07,570.05,4.0,11.643467,44.89,4.0,31.0,8.0
3,7.96,0.14,8.22,178.13,4e-176,4.03,3,0.17,0.21,0.24,...,237.03,3.52,0.02,100.04,1.0,10.092392,60.84,0.0,1.0,21.0
4,8.09,0.0,9.93,186.54,4.17e-132,3.81,2,0.0,0.22,0.62,...,385.03,3.18,0.0,168.08,5.0,15.249416,69.34,6.0,29.0,7.0


### Invalid Date values in the dataset 

In [9]:
def introduce_invalid_dates(df):
    """
    Introduces invalid date/time values, such as incorrect month, day, or time.
    
    Parameters:
    - Invalid months (13-15), invalid days (32-40), and invalid hours (25-30)
    """
    # Invalid months (13-15)
    month_indices = np.random.choice(df.index, int(len(df) * 0.02), replace=False)
    df.loc[month_indices, "Month"] = np.random.choice(["13", "14", "15"], len(month_indices))

    # Invalid days (32-40)
    day_indices = np.random.choice(df.index, int(len(df) * 0.02), replace=False)
    df.loc[day_indices, "Day"] = np.random.choice(range(32, 40), len(day_indices))

    # Invalid hours (25-30)
    time_indices = np.random.choice(df.index, int(len(df) * 0.02), replace=False)
    df.loc[time_indices, "Time of Day"] = np.random.choice(range(25, 30), len(time_indices))

    print(f"Introduced invalid dates and times: {len(month_indices)} months, {len(day_indices)} days, {len(time_indices)} times")
    return df

# Introduce invalid date and time values
df = introduce_invalid_dates(df)
df.head()  # Preview the dataset with invalid dates and times


Introduced invalid dates and times: 8145 months, 8145 days, 8145 times


  df.loc[month_indices, "Month"] = np.random.choice(["13", "14", "15"], len(month_indices))


Unnamed: 0,pH,Iron,Nitrate,Chloride,Lead,Zinc,Color,Turbidity,Fluoride,Copper,...,Conductivity,Chlorine,Manganese,Total Dissolved Solids,Source,Water Temperature,Air Temperature,Month,Day,Time of Day
0,8.33,0.0,8.61,122.8,5.001763,3.43,0,0.02,0.61,0.14,...,471.68,3.71,2.27e-15,332.12,8.0,7.449868,43.49,13.0,29.0,4.0
1,16.214978,0.0,3.73,227.03,7.85e-94,1.25,1,0.02,0.62,0.44,...,432.84,3.29,0.0,284.64,2.0,15.348981,71.22,9.0,26.0,16.0
2,5.44,0.02,3.82,231.0,5.29e-76,0.53,2,0.32,0.42,0.43,...,990.2,3.56,0.07,570.05,4.0,11.643467,44.89,4.0,31.0,8.0
3,7.96,0.14,8.22,178.13,4e-176,4.03,3,0.17,0.21,0.24,...,237.03,3.52,0.02,100.04,1.0,10.092392,60.84,0.0,1.0,21.0
4,8.09,0.0,9.93,186.54,4.17e-132,3.81,2,0.0,0.22,0.62,...,385.03,3.18,0.0,168.08,5.0,15.249416,69.34,6.0,29.0,7.0


### Generating inconsistent values

In [10]:
def introduce_inconsistent_units(df, column_name, unit_percentage=0.05):
    """
    Introduces inconsistent units for a given column, such as mixing Celsius and Fahrenheit for temperature.
    
    Parameters:
    - column_name: The column where inconsistent units will be introduced
    - unit_percentage: Percentage of rows to introduce inconsistent units (default: 5%)
    """
    num_inconsistent = int(len(df) * unit_percentage)
    inconsistent_indices = np.random.choice(df.index, num_inconsistent, replace=False)
    
    # Convert some values to a different unit (e.g., Celsius to Fahrenheit or vice versa)
    for index in inconsistent_indices:
        if column_name == "Water Temperature":
            # Randomly convert Celsius to Fahrenheit
            if random.choice([True, False]):
                df.loc[index, column_name] = df.loc[index, column_name] * 9/5 + 32  # Convert to Fahrenheit
            else:
                df.loc[index, column_name] = (df.loc[index, column_name] - 32) * 5/9  # Convert to Celsius
    
    print(f"Introduced inconsistent units in {column_name}: {num_inconsistent} rows")
    return df

# Introduce inconsistent units for the 'Water Temperature' column
df = introduce_inconsistent_units(df, "Water Temperature", unit_percentage=0.05)
df.head()  # Preview the dataset with inconsistent units


Introduced inconsistent units in Water Temperature: 20364 rows


Unnamed: 0,pH,Iron,Nitrate,Chloride,Lead,Zinc,Color,Turbidity,Fluoride,Copper,...,Conductivity,Chlorine,Manganese,Total Dissolved Solids,Source,Water Temperature,Air Temperature,Month,Day,Time of Day
0,8.33,0.0,8.61,122.8,5.001763,3.43,0,0.02,0.61,0.14,...,471.68,3.71,2.27e-15,332.12,8.0,7.449868,43.49,13.0,29.0,4.0
1,16.214978,0.0,3.73,227.03,7.85e-94,1.25,1,0.02,0.62,0.44,...,432.84,3.29,0.0,284.64,2.0,15.348981,71.22,9.0,26.0,16.0
2,5.44,0.02,3.82,231.0,5.29e-76,0.53,2,0.32,0.42,0.43,...,990.2,3.56,0.07,570.05,4.0,11.643467,44.89,4.0,31.0,8.0
3,7.96,0.14,8.22,178.13,4e-176,4.03,3,0.17,0.21,0.24,...,237.03,3.52,0.02,100.04,1.0,10.092392,60.84,0.0,1.0,21.0
4,8.09,0.0,9.93,186.54,4.17e-132,3.81,2,0.0,0.22,0.62,...,385.03,3.18,0.0,168.08,5.0,15.249416,69.34,6.0,29.0,7.0


### Saving the dataset with issues

In [11]:
# Save the dataset with errors to a CSV file
df.to_csv("Water_data_with_errors.csv", index=False)