In [1]:
import pandas as pd
import numpy as np
import random

In [2]:
# Location of the raw input CSV. The path is relative, so it is resolved
# against the kernel's current working directory.
file_path = "churn.csv"
# Load the clean source data; later cells corrupt a copy of this frame.
df = pd.read_csv(file_path)

In [3]:
def infect_data(df, seed=None, rate=0.001):
    """Return a corrupted copy of ``df`` for exercising data-quality checks.

    Seven kinds of defects are injected, in this order: missing values,
    numbers stored as strings, duplicated rows, outliers (value * 10),
    ``-badformat`` suffixes, negated values in columns whose name contains
    "age", and an ``"InvalidCategory"`` label.

    Column roles are decided once, from the dtypes of the input frame.
    Numeric columns therefore still receive outliers after some of their
    values were turned into strings, and they never receive the text-only
    corruptions (``-badformat`` / ``InvalidCategory``).

    Parameters
    ----------
    df : pandas.DataFrame
        Source data. It is copied and never modified.
    seed : int, optional
        When given, every random decision is derived from it, so two calls
        with the same input and seed return the same frame. When omitted,
        the module-level ``random`` generator is used (and pandas' default
        random state for the duplicated rows), as before.
    rate : float, default 0.001
        Probability that a single value (or row, for duplicates) is hit by
        each kind of corruption. Must lie in ``[0, 1]``.

    Returns
    -------
    pandas.DataFrame
        The corrupted copy, with a fresh ``RangeIndex`` and the duplicated
        rows appended at the end.

    Raises
    ------
    ValueError
        If ``rate`` is outside ``[0, 1]``.
    """
    if not 0 <= rate <= 1:
        raise ValueError(f"rate must be between 0 and 1, got {rate!r}")

    # Without a seed keep drawing from the module-level generator, so callers
    # that already control it through random.seed() are unaffected.
    rng = random if seed is None else random.Random(seed)

    df_copy = df.copy()

    # Freeze the column roles now: the type corruption below turns numeric
    # columns into object dtype, which would otherwise hide them from the
    # outlier step and expose them to the text corruptions.
    numeric_cols = list(df_copy.select_dtypes(include=['number']).columns)
    text_cols = list(df_copy.select_dtypes(include=['object']).columns)

    def is_number(value):
        """True for real numeric scalars (NaN included), False for str/bool."""
        return isinstance(value, (int, float, np.number)) and not isinstance(value, (bool, np.bool_))

    def corrupt(series, transform, eligible=None):
        """Apply ``transform`` to each eligible value with probability ``rate``."""
        def maybe_transform(value):
            if eligible is not None and not eligible(value):
                return value
            return transform(value) if rng.random() < rate else value
        return series.apply(maybe_transform)

    # 1. Inject Missing Values
    def inject_missing_values(frame):
        n_rows, n_cols = frame.shape
        hits = np.zeros(frame.shape, dtype=bool)
        for _ in range(int(frame.size * rate)):
            hits[rng.randrange(n_rows), rng.randrange(n_cols)] = True
        # mask() upcasts integer columns as needed instead of relying on an
        # incompatible-dtype item assignment.
        return frame.mask(hits)

    # 2. Inject Incorrect Data Types
    def inject_incorrect_data_types(frame):
        for col in numeric_cols:
            frame[col] = corrupt(frame[col], str)
        return frame

    # 3. Inject Duplicate Rows
    def inject_duplicate_rows(frame):
        num_dup = int(frame.shape[0] * rate)
        if num_dup == 0:
            # Nothing to append; still hand back the same fresh RangeIndex
            # that the concat below would produce.
            return frame.reset_index(drop=True)
        sample_seed = None if seed is None else rng.randrange(2**32)
        dup_rows = frame.sample(num_dup, replace=True, random_state=sample_seed)
        return pd.concat([frame, dup_rows], ignore_index=True)

    # 4. Inject Outliers
    def inject_outliers(frame):
        for col in numeric_cols:
            # Values already stringified in step 2 are left alone.
            frame[col] = corrupt(frame[col], lambda x: x * 10, eligible=is_number)
        return frame

    # 5. Inject Inconsistent Data Formats
    def inject_inconsistent_formats(frame):
        for col in text_cols:
            if col == 'SeniorCitizen':
                continue
            frame[col] = corrupt(frame[col], lambda x: f"{x}-badformat")
        return frame

    # 6. Inject Invalid Range Values
    def inject_invalid_range(frame):
        for col in frame.columns:
            # str() keeps non-string column labels from breaking the match.
            if "age" in str(col).lower():
                # Only real numbers can be negated; text values (e.g. in a
                # "Package" column) are skipped instead of raising TypeError.
                frame[col] = corrupt(frame[col], lambda x: -x, eligible=is_number)
        return frame

    # 7. Inject Unexpected Categorical Values
    def inject_unexpected_categories(frame):
        for col in text_cols:
            if frame[col].nunique(dropna=False) > 2:  # For categorical columns
                frame[col] = corrupt(frame[col], lambda x: "InvalidCategory")
        return frame

    # Apply all infections
    df_copy = inject_missing_values(df_copy)
    df_copy = inject_incorrect_data_types(df_copy)
    df_copy = inject_duplicate_rows(df_copy)
    df_copy = inject_outliers(df_copy)
    df_copy = inject_inconsistent_formats(df_copy)
    df_copy = inject_invalid_range(df_copy)
    df_copy = inject_unexpected_categories(df_copy)

    return df_copy

In [4]:
# Build the corrupted dataset. infect_data works on a copy, so `df` keeps the clean data.
infected_df = infect_data(df)

In [5]:
# Bare last expression: the notebook renders the corrupted frame as the cell output.
infected_df

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0.0,Yes,No,1.0,No,No phone service,DSL,No-badformat,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0.0,No,No,34.0,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0.0,No,No,2.0,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0.0,No,No,45.0,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0.0,No,No,2.0,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7045,7036-ZZKBD,Male,0.0,Yes,No,31.0,Yes,No,DSL,No,...,Yes,Yes,No,Yes,Month-to-month,Yes,Credit card (automatic),66.4,2019.8,No
7046,3785-NRHYR,Male,0.0,No,No,1.0,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Month-to-month,No,Mailed check,19.65,19.65,No
7047,8975-SKGRX,Male,0.0,Yes,No,71.0,Yes,Yes,Fiber optic,Yes,...,Yes,Yes,Yes,Yes,Two year,Yes,Bank transfer (automatic),116.1,8310.55,No
7048,6458-CYIDZ,Female,1.0,No,No,5.0,Yes,No,Fiber optic,Yes,...,No,No,No,No,Month-to-month,No,Electronic check,80.7,374.8,No


In [6]:
# Bring the numeric columns back to a numeric dtype. With errors="coerce",
# any value that cannot be parsed as a number is replaced by NaN.
for numeric_col in ("SeniorCitizen", "tenure", "MonthlyCharges", "TotalCharges"):
    infected_df[numeric_col] = pd.to_numeric(infected_df[numeric_col], errors="coerce")

In [7]:
# Save the infected dataset (index=False leaves the row index out of the CSV).
# NOTE(review): this writes to ../data/ while the input was read from the
# working directory — confirm that the target directory exists.
infected_df.to_csv("../data/infected_dataset.csv", index=False)

# Display the infected data; the rendered view is abbreviated to the first and last rows.
infected_df


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0.0,Yes,No,1.0,No,No phone service,DSL,No-badformat,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0.0,No,No,34.0,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.50,No
2,3668-QPYBK,Male,0.0,No,No,2.0,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0.0,No,No,45.0,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,9237-HQITU,Female,0.0,No,No,2.0,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7045,7036-ZZKBD,Male,0.0,Yes,No,31.0,Yes,No,DSL,No,...,Yes,Yes,No,Yes,Month-to-month,Yes,Credit card (automatic),66.40,2019.80,No
7046,3785-NRHYR,Male,0.0,No,No,1.0,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Month-to-month,No,Mailed check,19.65,19.65,No
7047,8975-SKGRX,Male,0.0,Yes,No,71.0,Yes,Yes,Fiber optic,Yes,...,Yes,Yes,Yes,Yes,Two year,Yes,Bank transfer (automatic),116.10,8310.55,No
7048,6458-CYIDZ,Female,1.0,No,No,5.0,Yes,No,Fiber optic,Yes,...,No,No,No,No,Month-to-month,No,Electronic check,80.70,374.80,No
