In [1]:
import pandas as pd
import numpy as np
import random

In [2]:
file_path = "churn.csv"
df = pd.read_csv(file_path)

In [3]:
def infect_data(df):
    df_copy = df.copy()

    # 1. Inject Missing Values
    def inject_missing_values(df):
        num_missing = int(df.size * 0.05)
        for _ in range(num_missing):
            ix = (random.randint(0, df.shape[0]-1), random.randint(0, df.shape[1]-1))
            df.iat[ix] = np.nan
        return df

    # 2. Inject Incorrect Data Types
    def inject_incorrect_data_types(df):
        for col in df.select_dtypes(include=['number']).columns:
            df[col] = df[col].apply(lambda x: str(x) if random.random() < 0.05 else x)
        return df

    # 3. Inject Duplicate Rows
    def inject_duplicate_rows(df):
        num_dup = int(df.shape[0] * 0.05)
        dup_rows = df.sample(num_dup, replace=True)
        df = pd.concat([df, dup_rows], ignore_index=True)
        return df

    # 4. Inject Outliers
    def inject_outliers(df):
        for col in df.select_dtypes(include=['number']).columns:
            df[col] = df[col].apply(lambda x: x*10 if random.random() < 0.05 else x)
        return df

    # 5. Inject Inconsistent Data Formats
    def inject_inconsistent_formats(df):
        for col in df.select_dtypes(include=['object']).columns:
            df[col] = df[col].apply(lambda x: f"{x}-badformat" if random.random() < 0.05 else x)
        return df

    # 6. Inject Invalid Range Values
    def inject_invalid_range(df):
        for col in df.columns:
            if "age" in col.lower():
                df[col] = df[col].apply(lambda x: -x if random.random() < 0.05 else x)
        return df

    # 7. Inject Unexpected Categorical Values
    def inject_unexpected_categories(df):
        for col in df.select_dtypes(include=['object']).columns:
            unique_vals = df[col].unique().tolist()
            if len(unique_vals) > 2:  # For categorical columns
                df[col] = df[col].apply(lambda x: "InvalidCategory" if random.random() < 0.05 else x)
        return df

    # Apply all infections
    df_copy = inject_missing_values(df_copy)
    df_copy = inject_incorrect_data_types(df_copy)
    df_copy = inject_duplicate_rows(df_copy)
    df_copy = inject_outliers(df_copy)
    df_copy = inject_inconsistent_formats(df_copy)
    df_copy = inject_invalid_range(df_copy)
    df_copy = inject_unexpected_categories(df_copy)

    return df_copy

In [4]:
infected_df = infect_data(df)

In [5]:
infected_df

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0.0,Yes,No,1.0,No,InvalidCategory,DSL,No,...,No,No-badformat,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male-badformat,0.0,No,No,34.0,Yes,No,DSL,,...,Yes,,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,InvalidCategory,0.0,InvalidCategory,No,2.0,Yes,No,,Yes,...,No,No,No,No,,,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0.0,No,No,45.0,No,No phone service,DSL,Yes,...,Yes,nan-badformat,No,No,One year,No,Bank transfer (automatic),42.3,,No
4,,Female,0.0,No,No,2.0,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7390,1069-QJOEE,Male,,,Yes,InvalidCategory,Yes,No,No,No internet service,...,No internet service-badformat,No internet service,No internet service,No internet service,One year,No,Electronic check,,505.45,No
7391,1698-XFZCI,Male,0.0,No,No,61.0,Yes,Yes,Fiber optic,,...,Yes,No,Yes,Yes,One year,Yes,Electronic check,107.75,6521.9,No
7392,3774-VBNXY,Female,0.0,Yes,Yes,64.0,Yes,,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,20.2,1277.75,No
7393,2207-RYYRL,Male,0.0,Yes,,52.0-badformat,Yes,Yes,,No internet service,...,InvalidCategory,No internet service,No internet service,InvalidCategory,Two year,No,Mailed check,24.55,,No


In [6]:
# Save the infected dataset
infected_df.to_csv("../data/infected_dataset.csv", index=False)

# Display a sample of the infected data
infected_df.sample(10)


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
5694,0983-TATYJ,Female,0.0,Yes,No,33.0,Yes-badformat,InvalidCategory,DSL,No,...,No,Yes,No,No,One year,Yes,Mailed check,51.45,1758.9,No
1229,9068-VPWQQ,Male,0.0,Yes,No,61.0,Yes,Yes,DSL,Yes,...,Yes,Yes,No,InvalidCategory,Two year,No,Credit card (automatic),,4729.3,No
4782,6175-IRFIT,Male,0.0,No,No,5.0,Yes,No,,No,...,No,No,No,Yes,Month-to-month,No,Mailed check,78.75,426.35,No
5933,6496-SLWHQ,Male,1.0,,No,3.0,InvalidCategory,Yes,Fiber optic,No,...,,No,Yes,Yes,Month-to-month,,Electronic check,105.0,294.45,Yes
1627,5035-BVCXS,Male-badformat,0.0,No,InvalidCategory,,Yes,No,Fiber optic,No,...,No,No,No,No,,,Mailed check,75.9,866.4,No
5605,8049-WJCLQ,Male,0.0,Yes,Yes,,Yes,No,DSL,No,...,InvalidCategory,No,,No,Month-to-month,No,Mailed check-badformat,60.2,563.5,
2963,4707-YNOQA,Female,0.0,Yes,Yes,34.0,Yes,,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,One year,InvalidCategory,Credit card (automatic),25.6,917.15-badformat,No
3350,7663-RGWBC-badformat,Male,0.0,No,No,1.0,Yes,No,DSL,No,...,No,No,No,,Month-to-month,Yes-badformat,Mailed check,44.15,44.15,Yes
5310,6171-ZTVYB,Male-badformat,0.0,Yes,No,43.0,Yes,Yes-badformat,DSL,Yes,...,No,Yes,No,No,Month-to-month,No,Bank transfer (automatic),66.25-badformat,2907.35,
2742,3948-KXDUF,Male,0.0,No,No,66.0,InvalidCategory,Yes,DSL,Yes,...,Yes,Yes,No,No,Two year,No,Bank transfer (automatic),68.75,4447.55,InvalidCategory
