In [12]:
import pandas as pd
import gzip
from xgboost import XGBRegressor, XGBClassifier
from scipy.stats.mstats import winsorize

In [13]:
def get_data(data_dir: str = "dataset/raw", n_files: int = 10) -> pd.DataFrame:
    """Load the raw capstone batches and stack them into a single DataFrame.

    Reads ``capstone.1.jsonl.gz`` through ``capstone.<n_files>.jsonl.gz`` from
    ``data_dir``. Each file is expected to be gzip-compressed JSON Lines.

    Args:
        data_dir: Directory that holds the batch files.
        n_files: Number of consecutively numbered batches to read, starting at 1.

    Returns:
        All batches concatenated in file order with a fresh 0..n-1 index.

    Raises:
        ValueError: If ``n_files`` is smaller than 1.
        OSError: If one of the expected batch files cannot be opened.
    """
    if n_files < 1:
        raise ValueError("n_files must be at least 1")

    batches = []
    for i in range(1, n_files + 1):
        # The f-string already interpolates the batch number; no extra
        # ``.format`` pass is needed.
        path = f"{data_dir}/capstone.{i}.jsonl.gz"
        with gzip.open(path, mode="rb") as f:
            batches.append(pd.read_json(f, lines=True))

    return pd.concat(batches, ignore_index=True)

In [14]:
# Name of the label column; winsorize_outliers leaves this column untouched.
TARGET = "churn"

In [15]:
# Load the raw batch files and concatenate them into one DataFrame.
df = get_data()

In [16]:
def insert_apps(df: pd.DataFrame) -> pd.DataFrame:
    """Add one 0/1 ``is_using_<app>`` column per app found in ``df["apps"]``.

    The frame is modified in place and the same object is returned, so the
    function can be used with ``DataFrame.pipe``.

    Args:
        df: Frame with an ``apps`` column whose cells are collections of app
            names. Cells that are not a list/tuple/set (e.g. NaN or None for
            a missing value) are treated as "uses no apps".

    Returns:
        ``df`` itself, with the indicator columns appended in sorted app order
        so the column layout is identical from run to run.
    """
    # Convert each row to a set once so membership checks are O(1) and missing
    # values do not break iteration.
    app_sets = df["apps"].map(
        lambda value: set(value) if isinstance(value, (list, tuple, set, frozenset)) else set()
    )

    # Set iteration order is not stable across interpreter runs for strings;
    # sorting keeps the output column order reproducible.
    all_apps = sorted(set().union(*app_sets), key=str)

    for app in all_apps:
        df[f"is_using_{app}"] = app_sets.map(lambda used, app=app: int(app in used))

    return df


In [17]:
def preprocess_service_type(df: pd.DataFrame) -> pd.DataFrame:
    """Replace the ``service_type`` labels with integer codes, in place.

    ``Prepaid`` becomes 1, ``Postpaid`` 2 and ``Broadband`` 3. Any value that
    is not one of these labels ends up as NaN, as ``Series.map`` does for keys
    missing from the mapping.

    Returns:
        The same ``df`` object with ``service_type`` overwritten.
    """
    labels = ("Prepaid", "Postpaid", "Broadband")
    codes = {label: code for code, label in enumerate(labels, start=1)}

    encoded = df["service_type"].map(codes)
    df["service_type"] = encoded
    return df

In [18]:
def preprocess_churn(df: pd.DataFrame):
    """Convert the boolean ``churn`` column to 0/1 integers, in place.

    ``False`` maps to 0 and ``True`` to 1; values outside the mapping become
    NaN. The same ``df`` object is returned.
    """
    flag_to_int = {flag: int(flag) for flag in (False, True)}

    df["churn"] = df["churn"].map(flag_to_int)
    return df


In [19]:
def impute_with_xgboost(df: pd.DataFrame, column_to_impute: str, job: str = "regression") -> pd.DataFrame:
    """Fill missing values of one column with predictions from an XGBoost model.

    The model is trained on the rows where ``column_to_impute`` is present,
    using every other column of ``df`` as a feature, and then predicts the
    rows where it is missing. ``df`` is modified in place and returned.

    Args:
        df: Frame to impute. All columns other than ``column_to_impute`` are
            passed to XGBoost as features.
        column_to_impute: Name of the column whose NaNs should be filled.
        job: ``"regression"`` (``XGBRegressor``) or ``"classification"``
            (``XGBClassifier``).

    Returns:
        The same ``df`` object with the missing values replaced.

    Raises:
        ValueError: If ``job`` is not a supported value, or if the column has
            no observed values to train on.
    """
    if job not in ("regression", "classification"):
        raise ValueError(
            f"job must be 'regression' or 'classification', got {job!r}"
        )

    missing = df[column_to_impute].isna()
    if not missing.any():
        # Nothing to impute; skip the cost of training a model.
        return df
    if missing.all():
        raise ValueError(
            f"cannot impute {column_to_impute!r}: the column has no observed values"
        )

    # NOTE(review): every other column is used as a feature, which includes the
    # label column if it is still in the frame -- confirm this is intended.
    features = df.drop(columns=column_to_impute)

    estimator = XGBRegressor() if job == "regression" else XGBClassifier()
    estimator.fit(features.loc[~missing], df.loc[~missing, column_to_impute])

    predictions = estimator.predict(features.loc[missing])
    df.loc[missing, column_to_impute] = pd.to_numeric(predictions.flatten())

    return df

In [None]:
def winsorize_outliers(df: pd.DataFrame, iqr_multiplier: float = 1.5, exclude=None) -> pd.DataFrame:
    """Pull IQR outliers of every numeric column back to the nearest inlier.

    For each numeric column the fences ``Q1 - k*IQR`` and ``Q3 + k*IQR`` are
    computed (``k = iqr_multiplier``). Values outside the fences are replaced
    by the smallest / largest observed value that lies inside them, which is
    what winsorizing exactly the out-of-fence observations produces. ``df`` is
    modified in place and returned.

    Args:
        df: Frame to process.
        iqr_multiplier: Fence width in units of the inter-quartile range.
        exclude: Iterable of column names to leave untouched. Defaults to
            ``(TARGET,)``.

    Returns:
        The same ``df`` object with outliers clipped.
    """
    if exclude is None:
        exclude = (TARGET,)
    skipped = set(exclude)

    for column in df.select_dtypes(include=["number"]).columns:
        if column in skipped:
            continue

        values = df[column]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1

        # A zero (or NaN) IQR means at least half of the column is a single
        # value, e.g. a rare 0/1 indicator. The fences would then flag every
        # other value as an outlier and flatten the column to a constant.
        if not iqr > 0:
            continue

        lower_bound = q1 - iqr_multiplier * iqr
        upper_bound = q3 + iqr_multiplier * iqr

        inliers = values[values.between(lower_bound, upper_bound)]
        if inliers.empty:
            continue

        # Clipping to the extreme inliers avoids passing the outlier count to
        # scipy's winsorize as a fraction: it multiplies the fraction back by n
        # and truncates, which can drop an outlier to float rounding
        # (e.g. int((1 / 49) * 49) == 0). NaNs are left as they are.
        df[column] = values.clip(lower=inliers.min(), upper=inliers.max())

    return df

In [21]:
# Full cleaning pipeline: expand the app lists into indicator columns, drop
# identifiers and duplicates, encode categoricals, impute missing values,
# tame outliers and finally downcast 64-bit columns to 32-bit.
# NOTE: this cell rebinds `df` (and insert_apps mutates the raw frame), so it
# must be run exactly once after the loading cell.
df = (
    df
    .pipe(insert_apps)
    .drop(columns=["id", "apps"])
    .drop_duplicates()
    .pipe(preprocess_service_type)
    .pipe(preprocess_churn)
    .pipe(impute_with_xgboost, column_to_impute="avg_call_duration")
    .pipe(impute_with_xgboost, column_to_impute="roaming_usage")
    .pipe(impute_with_xgboost, column_to_impute="auto_payment", job="classification")
    .pipe(impute_with_xgboost, column_to_impute="call_drops")
    .assign(**{
        "data_usage": lambda frame: frame["data_usage"].fillna(frame["data_usage"].mean()),
        "tenure": lambda frame: frame["tenure"].fillna(frame["tenure"].mean()),
        "monthly_charge": lambda frame: frame["monthly_charge"].fillna(frame["monthly_charge"].median())
    })
    .pipe(winsorize_outliers)
    # The dtype mapping must be built from the frame flowing through the
    # chain. Referring to the outer `df` here would inspect the pre-pipeline
    # frame: columns re-typed above would be missed, and any dropped 64-bit
    # column would make astype raise a KeyError.
    .pipe(lambda frame: frame.astype({
        **{col: "int32" for col in frame.select_dtypes("int64").columns},
        **{col: "float32" for col in frame.select_dtypes("float64").columns},
    }))
)

In [24]:
# Persist the cleaned frame as a gzip-compressed CSV, without the index column.
df.to_csv("dataset/interim/cleaned_dataset.csv.gz", compression="gzip", index=False)