In [1]:
import pandas as pd
import numpy as np
import random

In [2]:
file_path = "../data/churn.csv"
df = pd.read_csv(file_path)

In [3]:
def infect_data(df):
    df_copy = df.copy()

    # 1. Inject Missing Values
    def inject_missing_values(df):
        num_missing = int(df.size * 0.05)
        for _ in range(num_missing):
            ix = (random.randint(0, df.shape[0]-1), random.randint(0, df.shape[1]-1))
            df.iat[ix] = np.nan
        return df

    # 2. Inject Incorrect Data Types
    def inject_incorrect_data_types(df):
        for col in df.select_dtypes(include=['number']).columns:
            df[col] = df[col].apply(lambda x: str(x) if random.random() < 0.05 else x)
        return df

    # 3. Inject Duplicate Rows
    def inject_duplicate_rows(df):
        num_dup = int(df.shape[0] * 0.05)
        dup_rows = df.sample(num_dup, replace=True)
        df = pd.concat([df, dup_rows], ignore_index=True)
        return df

    # 4. Inject Outliers
    def inject_outliers(df):
        for col in df.select_dtypes(include=['number']).columns:
            df[col] = df[col].apply(lambda x: x*10 if random.random() < 0.05 else x)
        return df

    # 5. Inject Inconsistent Data Formats
    def inject_inconsistent_formats(df):
        for col in df.select_dtypes(include=['object']).columns:
            df[col] = df[col].apply(lambda x: f"{x}-badformat" if random.random() < 0.05 else x)
        return df

    # 6. Inject Invalid Range Values
    def inject_invalid_range(df):
        for col in df.columns:
            if "age" in col.lower():
                df[col] = df[col].apply(lambda x: -x if random.random() < 0.05 else x)
        return df

    # 7. Inject Unexpected Categorical Values
    def inject_unexpected_categories(df):
        for col in df.select_dtypes(include=['object']).columns:
            unique_vals = df[col].unique().tolist()
            if len(unique_vals) > 2:  # For categorical columns
                df[col] = df[col].apply(lambda x: "InvalidCategory" if random.random() < 0.05 else x)
        return df

    # Apply all infections
    df_copy = inject_missing_values(df_copy)
    df_copy = inject_incorrect_data_types(df_copy)
    df_copy = inject_duplicate_rows(df_copy)
    df_copy = inject_outliers(df_copy)
    df_copy = inject_inconsistent_formats(df_copy)
    df_copy = inject_invalid_range(df_copy)
    df_copy = inject_unexpected_categories(df_copy)

    return df_copy

In [4]:
infected_df = infect_data(df)

In [6]:
infected_df

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0.0,Yes,No,,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85-badformat,No
1,5575-GNVDE,Male,0.0,No,No,34.0,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No-badformat,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0.0,No,No,InvalidCategory,Yes,InvalidCategory,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,,0.0,No,No,InvalidCategory,No,No phone service,DSL,Yes,...,Yes-badformat,Yes,No,No,InvalidCategory,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0.0,,No,2.0,Yes,No,Fiber optic,No,...,No,No,No,No,,Yes,Electronic check,70.7-badformat,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7390,5893-KCLGT,Female,0.0,No,Yes,72.0,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service-badformat,No internet service,Two year,Yes,Mailed check,19.75,1567,No
7391,0567-XRHCU,Female,0.0,Yes,Yes,69.0-badformat,No,No phone service,DSL,Yes,...,Yes,InvalidCategory,No,Yes,Two year,Yes,Credit card (automatic),43.95,2960.1,No
7392,8167-GJLRN,Male,0.0,No,No,3.0,No,No phone service,DSL,No,...,No,Yes,No,No,Month-to-month,No-badformat,Electronic check,30.4-badformat,82.15,No
7393,2450-ZKEED,Female,0.0,No,No,11.0,Yes,No,DSL-badformat,No,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),InvalidCategory,651.55-badformat,No


In [7]:
# Save the infected dataset
infected_df.to_csv("../data/infected_dataset.csv", index=False)

# Display a sample of the infected data
infected_df.sample(10)


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
419,5564-NEMQO,,1.0,nan-badformat,No,InvalidCategory,Yes,No,Fiber optic,No-badformat,...,Yes,No,No,InvalidCategory,Month-to-month,Yes,Bank transfer (automatic),75.3,75.3,Yes
1743,3768-VHXQO,Male,0.0,Yes,No,67.0,Yes,,No,No internet service,...,No internet service,InvalidCategory,No internet service,No internet service,Two year-badformat,No,Credit card (automatic),24.85,1583.5,No
4473,2665-NPTGL,Female,1.0,Yes,No,26.0,Yes-badformat,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,Month-to-month,Yes,Bank transfer (automatic)-badformat,98.1,2510.7,No
3431,2430-USGXP,Male,0.0,Yes,No,24.0,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,Month-to-month,Yes,Electronic check,101.05,,Yes
2740,0415-MOSGF,Female,0.0,No,No,1.0,Yes,No,DSL,No,...,No,No,No,No,Month-to-month,,Mailed check,,44.4,Yes
1614,3813-DHBBB,Male,0.0,Yes,No,67.0,No,No phone service,DSL,Yes,...,No-badformat,Yes,No,,Two year,Yes,Bank transfer (automatic),50.95,3521.7,No
7353,9172-ANCRX,Female,0.0,No,No,10.0,Yes,Yes,Fiber optic,Yes,...,No,No,Yes,Yes-badformat,Month-to-month,Yes,Electronic check,98.7,,Yes
1434,1571-SAVHK,Male,0.0,No,No,12.0,Yes,Yes,InvalidCategory,No,...,No,Yes,InvalidCategory,Yes,Month-to-month,Yes,Mailed check,99.95,1132.75,Yes
1337,6265-FRMTQ,,0.0,No,No,31.0,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Two year-badformat,No-badformat,Bank transfer (automatic),InvalidCategory,609.1-badformat,No
34,3413-BMNZE,Male,1.0,No,No,InvalidCategory,,InvalidCategory,DSL,No,...,No,InvalidCategory,No,No,Month-to-month,InvalidCategory,Bank transfer (automatic),45.25,45.25,No
