In [1]:
import pandas as pd
import gzip
from xgboost import XGBRegressor, XGBClassifier
from scipy.stats.mstats import winsorize

In [2]:
def get_data(data_dir: str = "dataset/raw", n_files: int = 10) -> pd.DataFrame:
    """Load the gzipped JSON-lines batches and stack them into one DataFrame.

    Files are expected at ``{data_dir}/capstone.{i}.jsonl.gz`` for
    ``i = 1 .. n_files``.

    Parameters
    ----------
    data_dir : str, default "dataset/raw"
        Directory that holds the batch files.
    n_files : int, default 10
        Number of consecutive batches to read, starting at 1.

    Returns
    -------
    pd.DataFrame
        All batches concatenated in file order with a fresh 0..n-1 index.
        An empty DataFrame when ``n_files`` is less than 1.
    """
    batches = []

    for i in range(1, n_files + 1):
        # The f-string already interpolates the batch number; no further
        # formatting step is needed.
        path = f"{data_dir}/capstone.{i}.jsonl.gz"

        with gzip.open(path, mode="rb") as f:
            batches.append(pd.read_json(f, lines=True))

    if not batches:
        # pd.concat raises on an empty list; return an empty frame instead.
        return pd.DataFrame()

    return pd.concat(batches, ignore_index=True)

In [3]:
# Name of the label column; winsorize_outliers leaves this column untouched.
TARGET = "churn"

In [4]:
# Read every raw batch file and combine them into a single DataFrame.
df = get_data()

In [5]:
# Preview the loaded frame (the notebook display truncates it to head and tail rows).
df

Unnamed: 0,id,age,tenure,service_type,avg_call_duration,data_usage,roaming_usage,monthly_charge,overdue_payments,auto_payment,avg_top_up_count,call_drops,customer_support_calls,satisfaction_score,apps,churn
0,51893f29-e6c2-45d3-807c-e1280d3d7b90,18,53.0,Prepaid,106.74,139.72,47.31,59.45,0,,25,18.0,13,1.34,[],False
1,a568caf1-d851-4847-a9f5-20ef9017fa92,26,15.0,Prepaid,31.55,12.14,21.52,1221.65,0,,51,8.0,3,2.57,[],False
2,c611bf0e-a013-44dc-9939-bd33dab16d14,32,152.0,Postpaid,30.64,10.17,31.11,1170.45,0,1.0,0,13.0,10,8.02,[],False
3,9284b7b8-a4ef-49a7-9fa4-333954491f57,33,35.0,Prepaid,,,36.03,2418.91,0,,19,7.0,11,5.96,[RitimGo],False
4,e205b674-a6e6-43c5-aed7-9497c37c5c82,18,243.0,Prepaid,85.62,164.79,46.40,1518.19,0,,99,15.0,6,8.29,[],False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9999995,136531bb-3593-479f-8499-8036a39681b2,33,113.0,Postpaid,45.55,78.57,10.94,892.18,2,1.0,0,4.0,17,4.06,[],False
9999996,a15fabfa-78bf-4e8c-97a0-890d879606e1,23,168.0,Prepaid,6.03,185.91,55.17,2167.04,0,,64,7.0,18,5.51,[],False
9999997,4f6472b8-c271-4c03-a9be-ff325e263fe8,18,270.0,Prepaid,25.29,43.18,36.86,1492.85,0,,23,3.0,9,4.16,[],False
9999998,a9a9f76e-94dc-4878-9e09-da9da92f9e5d,40,19.0,Broadband,,23.66,,310.24,0,0.0,0,,6,6.70,[],False


In [7]:
def insert_apps(df: pd.DataFrame):
    """Add one 0/1 indicator column per app found in the ``apps`` column.

    Each cell of ``df["apps"]`` is expected to hold a collection of app names.
    For every distinct name ``X`` a column ``is_using_X`` is added, holding 1
    where the row's collection contains ``X`` and 0 otherwise.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with an ``apps`` column. It is modified in place.

    Returns
    -------
    pd.DataFrame
        The same ``df`` object, with the indicator columns appended in
        alphabetical order of app name.
    """
    # Cells that are not a collection (e.g. None / NaN) count as "no apps"
    # rather than raising on iteration or membership tests.
    app_lists = [
        cell if isinstance(cell, (list, tuple, set, frozenset)) else ()
        for cell in df["apps"]
    ]

    # Sort the names: iterating a set of strings yields a different order from
    # one interpreter run to the next, which would reshuffle the output columns.
    apps = sorted({app for used in app_lists for app in used})

    for app in apps:
        df[f"is_using_{app}"] = [1 if app in used else 0 for used in app_lists]

    return df


In [8]:
def preprocess_service_type(df: pd.DataFrame) -> pd.DataFrame:
    """Replace the ``service_type`` labels with integer codes, in place.

    Prepaid -> 1, Postpaid -> 2, Broadband -> 3. Any label outside this
    mapping becomes NaN. The same ``df`` object is returned.
    """
    labels = ("Prepaid", "Postpaid", "Broadband")
    codes = dict(zip(labels, (1, 2, 3)))

    encoded = df["service_type"].map(codes)
    df["service_type"] = encoded
    return df

In [9]:
def preprocess_churn(df: pd.DataFrame):
    """Convert the boolean ``churn`` column to 0/1 integers, in place.

    False -> 0 and True -> 1; values outside the mapping become NaN.
    The same ``df`` object is returned.
    """
    flag_to_int = {flag: int(flag) for flag in (False, True)}
    encoded = df["churn"].map(flag_to_int)
    df["churn"] = encoded
    return df


In [10]:
def impute_with_xgboost(df: pd.DataFrame, column_to_impute: str, job: str = "regression") -> pd.DataFrame:
    """Fill the missing values of one column with XGBoost predictions.

    A model is fitted on the rows where ``column_to_impute`` is present, using
    every other column of ``df`` as a feature, and its predictions are written
    into the rows where the column is missing.

    NOTE(review): "every other column" includes the label column when it is
    part of ``df`` -- confirm that using it as an imputation feature is
    intended.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to impute. It is modified in place.
    column_to_impute : str
        Column whose NaN entries are filled.
    job : {"regression", "classification"}, default "regression"
        Selects ``XGBRegressor`` or ``XGBClassifier``.

    Returns
    -------
    pd.DataFrame
        The same ``df`` object with ``column_to_impute`` filled.

    Raises
    ------
    ValueError
        If ``job`` is not one of the supported values, or if the column has no
        observed values to train on.
    """
    # Validate up front: an unknown job previously left the model variable
    # unbound and failed with an unrelated UnboundLocalError.
    if job not in ("regression", "classification"):
        raise ValueError(
            f"job must be 'regression' or 'classification', got {job!r}"
        )

    missing = df[column_to_impute].isna()

    if not missing.any():
        # Nothing to impute: skip the model fit and the empty-frame predict.
        return df

    if missing.all():
        raise ValueError(
            f"column {column_to_impute!r} has no observed values to train on"
        )

    feature_columns = df.columns.drop(column_to_impute)
    X_train = df.loc[~missing, feature_columns]
    y_train = df.loc[~missing, column_to_impute]
    X_missing = df.loc[missing, feature_columns]

    if job == "regression":
        xgb = XGBRegressor()
    else:
        xgb = XGBClassifier()

    model = xgb.fit(X_train, y_train)
    pred = model.predict(X_missing)
    df.loc[missing, column_to_impute] = pd.to_numeric(pred.flatten())

    return df

In [11]:
def winsorize_outliers(df: pd.DataFrame, iqr_multiplier: float = 1.5, skip_columns=None) -> pd.DataFrame:
    """Winsorize numeric columns using IQR fences.

    For each numeric column the fences are ``Q1 - iqr_multiplier * IQR`` and
    ``Q3 + iqr_multiplier * IQR``. Values outside the fences are replaced by
    the nearest observed value that lies inside them. NaN entries are left
    as NaN.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to process. It is modified in place.
    iqr_multiplier : float, default 1.5
        Width of the fences in units of the inter-quartile range.
    skip_columns : iterable of column labels, optional
        Columns to leave untouched. Defaults to the module-level ``TARGET``
        column. Columns whose name starts with ``is_using_`` (the 0/1 app
        indicators) are always skipped.

    Returns
    -------
    pd.DataFrame
        The same ``df`` object with outliers pulled in.
    """
    if skip_columns is None:
        skip_columns = (TARGET,)
    skipped = set(skip_columns)

    for column in df.select_dtypes(include=["number"]).columns:
        # Skip by prefix rather than by a fixed list of app names, so
        # indicators for any app present in the data are protected.
        if column in skipped or str(column).startswith("is_using_"):
            continue

        values = df[column]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1

        # With no spread (or an all-NaN column) both fences coincide and every
        # other value would be flattened onto the quartile; leave such columns
        # alone.
        if not iqr > 0:
            continue

        lower_bound = q1 - iqr_multiplier * iqr
        upper_bound = q3 + iqr_multiplier * iqr

        # Clip straight to the extreme in-fence observations. Converting
        # outlier counts to fractions and back (count / n * n) can land just
        # below the integer and be truncated, leaving an outlier unclipped.
        inliers = values[(values >= lower_bound) & (values <= upper_bound)]
        df[column] = values.clip(lower=inliers.min(), upper=inliers.max())

    return df


In [12]:
# Cleaning pipeline: app indicators -> drop id/apps -> de-duplicate ->
# encode categoricals -> impute missing values -> winsorize -> downcast.
#
# NOTE: insert_apps adds its columns to the incoming frame in place and the
# result is bound back to `df`, so this cell cannot be re-run on its own:
# after the first run `df` no longer has the "apps" column. Re-run the load
# cell first.
df = (
    df
    .pipe(insert_apps)
    .drop(columns=["id", "apps"])
    .drop_duplicates()
    .pipe(preprocess_service_type)
    .pipe(preprocess_churn)
    .pipe(impute_with_xgboost, column_to_impute="avg_call_duration")
    .pipe(impute_with_xgboost, column_to_impute="roaming_usage")
    .pipe(impute_with_xgboost, column_to_impute="auto_payment", job="classification")
    .pipe(impute_with_xgboost, column_to_impute="call_drops")
    .assign(**{
        "data_usage": lambda df: df["data_usage"].fillna(df["data_usage"].mean()),
        "tenure": lambda df: df["tenure"].fillna(df["tenure"].mean()),
        "monthly_charge": lambda df: df["monthly_charge"].fillna(df["monthly_charge"].median())
    })
    .pipe(winsorize_outliers)
    # Pick the columns to downcast from the frame flowing through the chain.
    # Referring to the outer `df` here would inspect the pre-pipeline dtypes
    # and miss columns whose dtype changed mid-chain (e.g. the re-encoded
    # `churn` column).
    .pipe(lambda frame: frame.astype({col: "int32" for col in frame.select_dtypes("int64").columns}))
    .pipe(lambda frame: frame.astype({col: "float32" for col in frame.select_dtypes("float64").columns}))
)

In [16]:
# Persist the cleaned frame as a gzip-compressed CSV, without the index column.
df.to_csv("dataset/interim/cleaned_dataset.csv.gz", compression="gzip", index=False)