In [None]:
import dask.dataframe as dd
import pandas as pd

In [None]:
# Open the pre-selected feature table through dask (pyarrow backend).
# NOTE(review): the path looks like a Colab Google Drive mount — adjust when running elsewhere.
data = dd.read_parquet(
    '/content/drive/MyDrive/DF-ChurnPrediction/data/new_selected_data.parquet',
    engine="pyarrow",
)

In [None]:
# Materialise the dask frame into a pandas DataFrame; the downsampling code further down
# relies on pandas-only helpers (`pd.concat`, `DataFrame.sample`).
# The type guard keeps this cell safe to re-run: once `data` has been converted it no
# longer has a `.compute()` method and an unguarded call would raise AttributeError.
if isinstance(data, dd.DataFrame):
  data = data.compute()

In [None]:
# Label vector: the `target` column.
y = data['target']
# Feature matrix: every column except the label and the `id` column.
X = data.drop(columns=['target', 'id'])

In [None]:
def get_metrics(y_test, y_pred):
  """Print ROC AUC, F1, recall and precision of `y_pred` against `y_test`.

  Both arguments are forwarded unchanged to the sklearn metric functions
  (`roc_auc_score`, `f1_score`, `recall_score`, `precision_score`).
  Nothing is returned; the four scores are written to stdout, one per line.
  """
  # Evaluate every metric before printing, so a failing metric produces no partial report.
  auc_value, f1_value, recall_value, precision_value = (
      scorer(y_test, y_pred)
      for scorer in (roc_auc_score, f1_score, recall_score, precision_score)
  )

  report_lines = (
      f"ROC AUC: {auc_value}",
      f"F1 Score: {f1_value}",
      f"Recall: {recall_value}",
      f"Precision: {precision_value}",
  )
  print(*report_lines, sep="\n")

## Baseline
В качестве baseline будем использовать модель, которая случайным образом прогнозирует 0 или 1 с равной вероятностью

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, f1_score, recall_score, precision_score

# `data` was already materialised into pandas by the `data.compute()` cell above, so X and y
# are plain pandas objects here; calling `.compute()` on them would raise AttributeError
# when the notebook is executed top to bottom.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Random-guess baseline: every test row gets label 0 or 1 with equal probability.
# A dedicated seeded generator makes the reported baseline metrics reproducible on re-run.
baseline_model = np.random.default_rng(42).choice([0, 1], size=len(y_test), p=[0.5, 0.5])
get_metrics(y_test, baseline_model)

ROC AUC: 0.500793728020122
F1 Score: 0.06646329746348963
Recall: 0.500180766449747
Precision: 0.035596665465959965


In [None]:
# Class distribution of the label. `data` is already a pandas DataFrame at this point
# (materialised by the `data.compute()` cell), so no `.compute()` call is needed —
# a pandas Series does not have that method.
data['target'].value_counts()

0    501078
1     18537
Name: target, dtype: int64

В целевой переменной наблюдается сильный дисбаланс классов: класс 1 составляет лишь около 3,6 % наблюдений

## Downsampling

In [None]:
from sklearn.utils import resample

# Partition the rows by label value.
class_1 = data.loc[data['target'] == 1]
class_0 = data.loc[data['target'] == 0]

# Draw, without replacement, as many class-0 rows as there are class-1 rows.
class_0_downsampled = resample(
    class_0,
    n_samples=class_1.shape[0],
    replace=False,
    random_state=42,
)

# Stack both classes, then shuffle the rows with a fixed seed so the classes are interleaved.
balanced_data = pd.concat([class_0_downsampled, class_1]).sample(frac=1, random_state=42)

In [None]:
# Sanity check: the row count should be twice the class-1 size, since class 0 was
# downsampled to exactly len(class_1) rows before concatenation.
balanced_data.shape

(37074, 707)

In [None]:
# Persist the balanced sample as parquet (pyarrow engine).
# The relative file name means it is written to the current working directory.
balanced_data.to_parquet(
    'balanced_data.parquet',
    engine='pyarrow',
)