In [17]:
import pandas as pd
import numpy as np
import random

In [18]:
file_path = "churn.csv"
df = pd.read_csv(file_path)

In [19]:
def infect_data(df):
    df_copy = df.copy()

    # 1. Inject Missing Values
    def inject_missing_values(df):
        num_missing = int(df.size * 0.05)
        for _ in range(num_missing):
            ix = (random.randint(0, df.shape[0]-1), random.randint(0, df.shape[1]-1))
            df.iat[ix] = np.nan
        return df

    # 2. Inject Incorrect Data Types
    def inject_incorrect_data_types(df):
        for col in df.select_dtypes(include=['number']).columns:
            df[col] = df[col].apply(lambda x: str(x) if random.random() < 0.05 else x)
        return df

    # 3. Inject Duplicate Rows
    def inject_duplicate_rows(df):
        num_dup = int(df.shape[0] * 0.05)
        dup_rows = df.sample(num_dup, replace=True)
        df = pd.concat([df, dup_rows], ignore_index=True)
        return df

    # 4. Inject Outliers
    def inject_outliers(df):
        for col in df.select_dtypes(include=['number']).columns:
            df[col] = df[col].apply(lambda x: x*10 if random.random() < 0.05 else x)
        return df

    # 5. Inject Inconsistent Data Formats
    def inject_inconsistent_formats(df):
        for col in df.select_dtypes(include=['object']).columns:
            if col == 'SeniorCitizen':
                pass
            else:
                df[col] = df[col].apply(lambda x: f"{x}-badformat" if random.random() < 0.05 else x)
        return df

    # 6. Inject Invalid Range Values
    def inject_invalid_range(df):
        for col in df.columns:
            if "age" in col.lower():
                df[col] = df[col].apply(lambda x: -x if random.random() < 0.05 else x)
        return df

    # 7. Inject Unexpected Categorical Values
    def inject_unexpected_categories(df):
        for col in df.select_dtypes(include=['object']).columns:
            unique_vals = df[col].unique().tolist()
            if len(unique_vals) > 2:  # For categorical columns
                df[col] = df[col].apply(lambda x: "InvalidCategory" if random.random() < 0.05 else x)
        return df

    # Apply all infections
    df_copy = inject_missing_values(df_copy)
    df_copy = inject_incorrect_data_types(df_copy)
    df_copy = inject_duplicate_rows(df_copy)
    df_copy = inject_outliers(df_copy)
    df_copy = inject_inconsistent_formats(df_copy)
    df_copy = inject_invalid_range(df_copy)
    df_copy = inject_unexpected_categories(df_copy)

    return df_copy

In [20]:
infected_df = infect_data(df)

In [21]:
infected_df

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0.0,Yes,No,1.0,No,No phone service,DSL,No,...,No,No,,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male-badformat,0.0,No,No-badformat,34.0,nan-badformat,No,,Yes,...,Yes,No,No,No,One year,,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0.0,No,No,2.0,Yes,No,DSL,Yes,...,No,No,,No,Month-to-month,Yes,InvalidCategory,53.85,108.15,Yes
3,InvalidCategory,Male,0.0,No,,45.0,No,InvalidCategory,DSL,Yes,...,Yes,Yes,InvalidCategory,InvalidCategory,One year,No,Bank transfer (automatic),42.3,,No
4,9237-HQITU,InvalidCategory,0.0,No-badformat,No,2.0,Yes,No,Fiber optic,InvalidCategory,...,No,No,No,No,,Yes,Electronic check,,InvalidCategory,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7390,0953-LGOVU,Male,0.0,InvalidCategory,Yes,12.0,No,No phone service,DSL,No,...,No,Yes,No-badformat,No,Month-to-month,No,Mailed check,35.5,432.25,No
7391,InvalidCategory,Male,1.0,Yes,No,38.0,No,No phone service-badformat,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,24.85,955.75,No
7392,9360-AHGNL,Female,1.0,Yes,No,,Yes,Yes,Fiber optic,Yes,...,Yes,Yes,Yes-badformat,Yes,One year,Yes,Mailed check,109.55,4830.25,Yes
7393,1543-LLLFT,Male,1.0,Yes,No-badformat,61.0,Yes,Yes,Fiber optic,No,...,Yes,Yes-badformat,Yes,No,One year,Yes,Mailed check,98.3,6066.55,No


In [22]:
infected_df['SeniorCitizen'] = pd.to_numeric(infected_df['SeniorCitizen'], errors='coerce')

In [24]:
# Save the infected dataset
infected_df.to_csv("../data/infected_dataset.csv", index=False)

# Display a sample of the infected data
infected_df


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0.0,Yes,No,1.0,No,No phone service,DSL,No,...,No,No,,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male-badformat,0.0,No,No-badformat,34.0,nan-badformat,No,,Yes,...,Yes,No,No,No,One year,,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0.0,No,No,2.0,Yes,No,DSL,Yes,...,No,No,,No,Month-to-month,Yes,InvalidCategory,53.85,108.15,Yes
3,InvalidCategory,Male,0.0,No,,45.0,No,InvalidCategory,DSL,Yes,...,Yes,Yes,InvalidCategory,InvalidCategory,One year,No,Bank transfer (automatic),42.3,,No
4,9237-HQITU,InvalidCategory,0.0,No-badformat,No,2.0,Yes,No,Fiber optic,InvalidCategory,...,No,No,No,No,,Yes,Electronic check,,InvalidCategory,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7390,0953-LGOVU,Male,0.0,InvalidCategory,Yes,12.0,No,No phone service,DSL,No,...,No,Yes,No-badformat,No,Month-to-month,No,Mailed check,35.5,432.25,No
7391,InvalidCategory,Male,1.0,Yes,No,38.0,No,No phone service-badformat,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,24.85,955.75,No
7392,9360-AHGNL,Female,1.0,Yes,No,,Yes,Yes,Fiber optic,Yes,...,Yes,Yes,Yes-badformat,Yes,One year,Yes,Mailed check,109.55,4830.25,Yes
7393,1543-LLLFT,Male,1.0,Yes,No-badformat,61.0,Yes,Yes,Fiber optic,No,...,Yes,Yes-badformat,Yes,No,One year,Yes,Mailed check,98.3,6066.55,No
