In [4]:
import pandas as pd
import numpy as np
import random

In [5]:
file_path = "churn.csv"
df = pd.read_csv(file_path)

In [6]:
def infect_data(df):
    df_copy = df.copy()

    # 1. Inject Missing Values
    def inject_missing_values(df):
        num_missing = int(df.size * 0.05)
        for _ in range(num_missing):
            ix = (random.randint(0, df.shape[0]-1), random.randint(0, df.shape[1]-1))
            df.iat[ix] = np.nan
        return df

    # 2. Inject Incorrect Data Types
    def inject_incorrect_data_types(df):
        for col in df.select_dtypes(include=['number']).columns:
            df[col] = df[col].apply(lambda x: str(x) if random.random() < 0.05 else x)
        return df

    # 3. Inject Duplicate Rows
    def inject_duplicate_rows(df):
        num_dup = int(df.shape[0] * 0.05)
        dup_rows = df.sample(num_dup, replace=True)
        df = pd.concat([df, dup_rows], ignore_index=True)
        return df

    # 4. Inject Outliers
    def inject_outliers(df):
        for col in df.select_dtypes(include=['number']).columns:
            df[col] = df[col].apply(lambda x: x*10 if random.random() < 0.05 else x)
        return df

    # 5. Inject Inconsistent Data Formats
    def inject_inconsistent_formats(df):
        for col in df.select_dtypes(include=['object']).columns:
            if col == 'SeniorCitizen':
                pass
            else:
                df[col] = df[col].apply(lambda x: f"{x}-badformat" if random.random() < 0.05 else x)
        return df

    # 6. Inject Invalid Range Values
    def inject_invalid_range(df):
        for col in df.columns:
            if "age" in col.lower():
                df[col] = df[col].apply(lambda x: -x if random.random() < 0.05 else x)
        return df

    # 7. Inject Unexpected Categorical Values
    def inject_unexpected_categories(df):
        for col in df.select_dtypes(include=['object']).columns:
            unique_vals = df[col].unique().tolist()
            if len(unique_vals) > 2:  # For categorical columns
                df[col] = df[col].apply(lambda x: "InvalidCategory" if random.random() < 0.05 else x)
        return df

    # Apply all infections
    df_copy = inject_missing_values(df_copy)
    df_copy = inject_incorrect_data_types(df_copy)
    df_copy = inject_duplicate_rows(df_copy)
    df_copy = inject_outliers(df_copy)
    df_copy = inject_inconsistent_formats(df_copy)
    df_copy = inject_invalid_range(df_copy)
    df_copy = inject_unexpected_categories(df_copy)

    return df_copy

In [7]:
infected_df = infect_data(df)

In [8]:
infected_df

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0.0,InvalidCategory,No,1.0,No,No phone service,DSL,No,...,No,No,,No,Month-to-month,Yes,InvalidCategory,29.85,29.85,No
1,,Male,0.0,No,No,34.0,Yes,No,nan-badformat,Yes,...,Yes,No,,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0.0,No,,2.0,Yes,No,,Yes,...,InvalidCategory,InvalidCategory,No,No,Month-to-month,,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male-badformat,0.0,No,No,45.0,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,InvalidCategory,Bank transfer (automatic),42.3,1840.75,No-badformat
4,9237-HQITU,Female,0.0,No,No,2.0,InvalidCategory,No,Fiber optic,InvalidCategory,...,No,No,No,No,Month-to-month,InvalidCategory,InvalidCategory,70.7,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7390,6711-FLDFB,Female,0.0,No,No,7.0,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,74.9,541.15,Yes
7391,7526-BEZQB,Male,0.0,No-badformat,No,12.0-badformat,Yes,Yes,Fiber optic,No,...,No,No,Yes,Yes,Month-to-month,Yes,Electronic check,96.05,1148.1,Yes
7392,2157-MXBJS,Male,0.0,Yes,No,13.0,Yes,Yes,DSL,No,...,Yes,No-badformat,Yes,Yes,One year,Yes,,75.3,989.45,Yes
7393,4419-UJMUS,Male,0.0,Yes,Yes,69.0,Yes,Yes,Fiber optic,No,...,No,Yes,Yes-badformat,Yes,Two year,Yes,Electronic check,99.35,6856.45,No


In [14]:
infected_df['SeniorCitizen'] = pd.to_numeric(infected_df['SeniorCitizen'], errors='coerce')

In [16]:
# Save the infected dataset
infected_df.to_csv("../data/infected_dataset.csv", index=False)

# Display a sample of the infected data
infected_df.sample(10)


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
1089,4546-FOKWR,Female,0.0,No,No,16.0,Yes,Yes,Fiber optic,,...,No,No,No,No,Month-to-month,Yes,Credit card (automatic),74.75,1129.35,No
5795,7048-GXDAY,Male-badformat,0.0,No,No,39.0,Yes,Yes,Fiber optic,Yes,...,No-badformat,No,Yes,Yes,Month-to-month,InvalidCategory,Electronic check,106.4,4040.65,No
105,6180-YBIQI,Male,0.0,No,No,5.0,No,No phone service,DSL-badformat,No,...,No,No,No,No,Month-to-month,InvalidCategory,Mailed check,24.3,100.2,No
2285,,Male,,Yes,No,53.0,Yes,Yes,Fiber optic,Yes-badformat,...,Yes-badformat,No,InvalidCategory,Yes,,No,Electronic check,93.9,5029.2,InvalidCategory
30,3841-NFECX,Female,1.0,Yes,No,71.0,Yes-badformat,Yes,,Yes,...,Yes,Yes,No,No,Two year,Yes,Credit card (automatic),InvalidCategory,6766.95,No
725,6549-YMFAW,Male,,Yes,No,9.0-badformat,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Month-to-month,Yes,Mailed check,21.25,204.55,No
14,5129-JLPIS,Male-badformat,0.0,No,No,25.0,Yes,No,Fiber optic,Yes,...,Yes,,Yes,Yes,Month-to-month,Yes,Electronic check-badformat,105.5,2686.05,No-badformat
726,4950-HKQTE,Female,0.0,No,No,22.0,Yes,Yes,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Two year,No,InvalidCategory,26.25,605.9,No
5845,4581-SSPWD,Female,0.0,No,No,3.0,Yes,Yes,Fiber optic,No,...,No,,No,,Month-to-month,Yes,Electronic check,75.8-badformat,246.3,InvalidCategory
2741,1724-BQUHA,Male,1.0,InvalidCategory,No,5.0,Yes,No-badformat,Fiber optic,No,...,No,No,Yes,Yes,Month-to-month,Yes,Electronic check,89.5,477.7,Yes
