In [None]:
import dask.dataframe as dd
import pandas as pd

In [None]:
# Open the selected-features parquet file as a dask DataFrame (pyarrow engine).
data = dd.read_parquet('../data/new_selected_data.parquet', engine="pyarrow")

In [None]:
# Materialise the dask DataFrame and rebind `data` to the computed result.
# NOTE(review): the name is reused for a different kind of object, so this cell
# presumably cannot be re-run without re-running the load cell first — confirm.
data = data.compute()

In [None]:
# Label vector.
y = data['target']
# Feature matrix: everything except the label and the row identifier.
X = data.drop(columns=['target', 'id'])

In [None]:
from sklearn.metrics import roc_auc_score, f1_score, recall_score, precision_score
def get_metrics(y_test, y_pred):
    """Print ROC AUC, F1, recall and precision for a set of predictions.

    Args:
        y_test: ground-truth labels.
        y_pred: predictions, passed unchanged to every scorer.

    Returns:
        None. The four metrics are written to stdout, one per line.
    """
    scorers = (
        ("ROC AUC", roc_auc_score),
        ("F1 Score", f1_score),
        ("Recall", recall_score),
        ("Precision", precision_score),
    )
    # Evaluate every metric before printing anything, so a failing scorer
    # never leaves a partially printed report.
    results = [(label, scorer(y_test, y_pred)) for label, scorer in scorers]

    for label, value in results:
        print(f"{label}: {value}")

## Baseline
В качестве baseline будем использовать модель, которая случайным образом (с равной вероятностью 50/50) прогнозирует значение 0 или 1

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split


# Hold out 30% of the rows for evaluation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Baseline "model": an independent fair coin flip for every test row.
# The draw comes from a seeded generator so the baseline metrics are the same
# on every Restart & Run All (the unseeded global RNG gave different numbers each run).
baseline_model = np.random.default_rng(42).choice([0, 1], size=len(y_test), p=[0.5, 0.5])
get_metrics(y_test, baseline_model)

ROC AUC: 0.49346711363504697
F1 Score: 0.0646072458737777
Recall: 0.4878886478669559
Precision: 0.03459413732552327


In [None]:
# Class balance of the label. `data` was already materialised by the earlier
# `data = data.compute()` cell, so the column is counted directly — calling
# .compute() on it again fails on a fresh top-to-bottom run.
data['target'].value_counts()

0    501078
1     18537
Name: target, dtype: int64

In [None]:
import numpy as np

# Load the saved test arrays. The archive gets its own name so that `data`
# keeps referring to the full DataFrame, which the downsampling cells further
# down still index with data['target']. The context manager closes the file
# once both arrays have been read.
with np.load('../data/test_data.npz') as test_archive:
    X_test, y_test = test_archive['X_test'], test_archive['y_test']

In [None]:
import joblib
# Restore the persisted preprocessing objects (scaler first, then PCA).
scaler, pca = (
    joblib.load(artifact)
    for artifact in ('scaler_80_20.joblib', 'pca_80_20.joblib')
)

# Run the test features through both steps in order: scale, then project.
X_test_scaled = scaler.transform(X_test)
X_test_pca = pca.transform(X_test_scaled)



## Тестирование моделей

In [None]:
# Load the persisted logistic-regression model.
# NOTE(review): the file name says 50_50 while the scaler/PCA applied to the test
# features are the *_80_20 artefacts — confirm this pairing is intended.
loaded_log_model = joblib.load('best_logistic_regression_model_50_50.joblib')

# Hard class predictions on the PCA-projected test features, then the metric report.
predict = loaded_log_model.predict(X_test_pca)
get_metrics(y_test, predict)

In [None]:
# Random forest: classifier variant (original note: "logistic")
loaded_tree_model = joblib.load('best_tree_model.joblib')

# Per-class probability predictions for the PCA-projected test features.
pred = loaded_tree_model.predict_proba(X_test_pca)

# Keep column 1 only — presumably the positive-class probability; confirm class order.
pred = pred[:, 1]
# Provisional labels at a 0.1 cut-off; the threshold-sweep cell that follows reassigns pred_binary.
pred_binary = (pred >= 0.1)

In [None]:
# Sweep a fixed grid of decision thresholds and keep the one with the best F1.
# The selection metric is f1_score: the variables and the printed message all
# report an F1 score, but the value used to come from roc_auc_score, so the
# "Best F1 Score" line actually showed ROC AUC.
best_threshold = 0.5   # fallback if no threshold achieves F1 > 0
best_f1_score = 0.0

for threshold in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
    pred_binary = (pred >= threshold)
    f1 = f1_score(y_test, pred_binary)
    # Strict '>' keeps the lowest threshold when several tie.
    if f1 > best_f1_score:
        best_f1_score = f1
        best_threshold = threshold

print(f"Best F1 Score: {best_f1_score} at threshold {best_threshold}")
# Re-binarise at the winning threshold and print the full metric report.
pred_binary = (pred >= best_threshold)
get_metrics(y_test, pred_binary)

Best F1 Score: 0.6968242544988091 at threshold 0.5
ROC AUC: 0.6968242544988091
F1 Score: 0.13868392511068045
Recall: 0.7049891540130152
Precision: 0.0769063911182978


In [None]:
# Regression forest
# Rebinds loaded_tree_model to the model stored in best_tree_model_80_20.joblib.
loaded_tree_model = joblib.load('best_tree_model_80_20.joblib')

# Raw model outputs for the PCA-projected test features; thresholded in the next cell.
pred = loaded_tree_model.predict(X_test_pca)

In [None]:
# Sweep a fixed grid of decision thresholds and keep the one with the best F1.
# The selection metric is f1_score: the variables and the printed message all
# report an F1 score, but the value used to come from roc_auc_score, so the
# "Best F1 Score" line actually showed ROC AUC.
best_threshold = 0.5   # fallback if no threshold achieves F1 > 0
best_f1_score = 0.0

for threshold in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
    pred_binary = (pred >= threshold)
    f1 = f1_score(y_test, pred_binary)
    # Strict '>' keeps the lowest threshold when several tie.
    if f1 > best_f1_score:
        best_f1_score = f1
        best_threshold = threshold

print(f"Best F1 Score: {best_f1_score} at threshold {best_threshold}")
# Re-binarise at the winning threshold and print the full metric report.
pred_binary = (pred >= best_threshold)
get_metrics(y_test, pred_binary)

Best F1 Score: 0.6776861781658501 at threshold 0.2
ROC AUC: 0.6776861781658501
F1 Score: 0.13132388965811057
Recall: 0.6686550976138829
Precision: 0.07281209401204677


In [None]:
%pip install catboost

Installing collected packages: catboost
Successfully installed catboost-1.2.3


In [None]:
#CatBoost
from catboost import CatBoostClassifier

In [None]:
# Create an empty classifier and restore the saved CatBoost model into it.
from_file = CatBoostClassifier()

from_file.load_model("best_catboost_model_80_20_new")
# Per-class probability predictions for the PCA-projected test features.
pred = from_file.predict_proba(X_test_pca)

# Keep column 1 only — presumably the positive-class probability; confirm class order.
pred = pred[:, 1]
# Provisional labels at a 0.1 cut-off; the threshold-sweep cell that follows reassigns pred_binary.
pred_binary = (pred >= 0.1)

In [None]:
# Sweep a fixed grid of decision thresholds and keep the one with the best F1.
# The selection metric is f1_score: the variables and the printed message all
# report an F1 score, but the value used to come from roc_auc_score, so the
# "Best F1 Score" line actually showed ROC AUC.
best_threshold = 0.5   # fallback if no threshold achieves F1 > 0
best_f1_score = 0.0

for threshold in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
    pred_binary = (pred >= threshold)
    f1 = f1_score(y_test, pred_binary)
    # Strict '>' keeps the lowest threshold when several tie.
    if f1 > best_f1_score:
        best_f1_score = f1
        best_threshold = threshold

print(f"Best F1 Score: {best_f1_score} at threshold {best_threshold}")
# Re-binarise at the winning threshold and print the full metric report.
pred_binary = (pred >= best_threshold)
get_metrics(y_test, pred_binary)

Best F1 Score: 0.705217946150433 at threshold 0.2
ROC AUC: 0.705217946150433
F1 Score: 0.1456308861802318
Recall: 0.703000723065799
Precision: 0.08122898260124903


In [None]:
# Neural network
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
# Restore the saved Keras model from its HDF5 file.
loaded_model = tf.keras.models.load_model('neuron_model_80_20.h5')


# Model outputs for the PCA-projected test features
# (presumably one score per row in [0, 1] — confirm against the model's output layer).
y_pred_loaded = loaded_model.predict(X_test_pca)
# Binarise at a 0.6 cut-off and print the metric report.
y_pred_binary = (y_pred_loaded > 0.6).astype(int)
get_metrics(y_test, y_pred_binary)

ROC AUC: 0.5111151306547577
F1 Score: 0.04587304151187207
Recall: 0.02566883586406363
Precision: 0.21547799696509864


In [None]:
# Same network outputs, binarised at a lower 0.2 cut-off for comparison.
y_pred_binary = (y_pred_loaded > 0.2).astype(int)
get_metrics(y_test, y_pred_binary)

ROC AUC: 0.6613933095813724
F1 Score: 0.12151420812081871
Recall: 0.663232104121475
Precision: 0.06688420592095669


В целевой переменной наблюдается сильный дисбаланс классов

## Downsampling

In [None]:
from sklearn.utils import resample
# Separate the rows by label so each class can be resampled on its own.
class_0 = data.loc[data['target'] == 0]
class_1 = data.loc[data['target'] == 1]

In [None]:
# Equal class sizes (50/50): shrink class 0 to the size of class 1.
class_0_downsampled = resample(class_0, replace=False, n_samples=len(class_1), random_state=42)

# Stack both classes and shuffle the rows with a fixed seed.
balanced_data = (
    pd.concat([class_0_downsampled, class_1])
    .sample(frac=1, random_state=42)
)

In [None]:
# Sanity check: (rows, columns) of the 50/50 sample.
balanced_data.shape

(37074, 707)

In [None]:
# Persist the 50/50 sample to the working directory.
balanced_data.to_parquet('balanced_data.parquet', engine='pyarrow')

## Изначальное распределение
Уменьшим размер выборки до 40000 строк с распределением целевой переменной 96/4



In [None]:
target_ratio = 0.04
# Draw 40000 rows without replacement: a (1 - target_ratio) share from class 0
# and a target_ratio share from class 1, in that order.
class_0_downsampled_2, class_1_downsampled_2 = (
    resample(subset, replace=False, n_samples=int(40000 * share), random_state=42)
    for subset, share in ((class_0, 1 - target_ratio), (class_1, target_ratio))
)

# Stack both classes and shuffle the rows with a fixed seed.
balanced_data_2 = (
    pd.concat([class_0_downsampled_2, class_1_downsampled_2])
    .sample(frac=1, random_state=42)
)

In [None]:
# Class shares of the sample (the positional 1 requests normalised counts).
balanced_data_2['target'].value_counts(1)

0    0.96
1    0.04
Name: target, dtype: float64

In [None]:
# Persist the 96/4 sample to the working directory.
balanced_data_2.to_parquet('balanced_data_96_4.parquet', engine='pyarrow')

## Выборка с распределением 65/35

In [None]:
target_ratio = 0.35
# Draw 40000 rows without replacement: a (1 - target_ratio) share from class 0
# and a target_ratio share from class 1, in that order.
class_0_downsampled_3, class_1_downsampled_3 = (
    resample(subset, replace=False, n_samples=int(40000 * share), random_state=42)
    for subset, share in ((class_0, 1 - target_ratio), (class_1, target_ratio))
)

# Stack both classes and shuffle the rows with a fixed seed.
balanced_data_3 = (
    pd.concat([class_0_downsampled_3, class_1_downsampled_3])
    .sample(frac=1, random_state=42)
)

In [None]:
# Class shares of the sample (the positional 1 requests normalised counts).
balanced_data_3['target'].value_counts(1)

0    0.65
1    0.35
Name: target, dtype: float64

In [None]:
# Persist the 65/35 sample to the working directory.
balanced_data_3.to_parquet('balanced_data_65_35.parquet', engine='pyarrow')

Выборка с распределением 80/20. Строки с целевым значением 1 убирать не будем (используем их все)

In [None]:
target_ratio = 0.2
# Keep every class-1 row and size class 0 so that class 1 makes up
# `target_ratio` of the sample. The total is derived from the data instead of
# the former hard-coded 92685, so it stays correct if the class-1 count changes.
n_class_1 = len(class_1)
n_total = round(n_class_1 / target_ratio)
class_0_downsampled_4 = resample(class_0, replace=False, n_samples=n_total - n_class_1, random_state=42)
# Full-size draw without replacement: every class-1 row is kept, only reordered.
class_1_downsampled_4 = resample(class_1, replace=False, n_samples=n_class_1, random_state=42)

# Stack both classes and shuffle the rows with a fixed seed.
balanced_data_4 = pd.concat([class_0_downsampled_4, class_1_downsampled_4])
balanced_data_4 = balanced_data_4.sample(frac=1, random_state=42)

In [None]:
# Absolute class counts of the 80/20 sample.
balanced_data_4['target'].value_counts()

0    74148
1    18537
Name: target, dtype: int64

In [None]:
# Persist the 80/20 sample to the working directory.
balanced_data_4.to_parquet('balanced_data_80_20.parquet', engine='pyarrow')

Пробуем еще уменьшить количество колонок на датасете 80/20

In [None]:
# Re-read the 80/20 sample from the same relative path the notebook wrote it
# to, instead of a machine-specific absolute Google Drive path, so the notebook
# runs top to bottom outside that one Colab session.
balanced_data_80_20 = pd.read_parquet('balanced_data_80_20.parquet')

In [None]:
# Cut-off below which a column counts as (almost) uncorrelated with the label.
threshold = 0.02

# Absolute correlation of every column with the label.
corr_with_target = balanced_data_80_20.corrwith(balanced_data_80_20['target']).abs()

# Names of the columns that fall below the cut-off.
low_corr_features = corr_with_target.loc[corr_with_target < threshold].index

In [None]:
# Inspect the absolute correlations in ascending order.
corr_with_target.sort_values()

feature436    0.000092
feature441    0.000117
id            0.000170
feature40     0.000275
feature866    0.000523
                ...   
feature943    0.179502
feature953    0.195863
feature944    0.198325
feature954    0.210768
target        1.000000
Length: 707, dtype: float64

In [None]:
# List the columns selected for removal.
print(list(low_corr_features))


['id', 'feature2', 'feature37', 'feature38', 'feature39', 'feature40', 'feature45', 'feature49', 'feature53', 'feature58', 'feature59', 'feature71', 'feature75', 'feature76', 'feature78', 'feature84', 'feature86', 'feature95', 'feature99', 'feature106', 'feature110', 'feature111', 'feature112', 'feature113', 'feature115', 'feature116', 'feature125', 'feature127', 'feature133', 'feature137', 'feature139', 'feature143', 'feature145', 'feature147', 'feature152', 'feature153', 'feature155', 'feature161', 'feature162', 'feature163', 'feature164', 'feature166', 'feature167', 'feature169', 'feature170', 'feature171', 'feature173', 'feature174', 'feature175', 'feature176', 'feature194', 'feature195', 'feature199', 'feature200', 'feature218', 'feature219', 'feature220', 'feature221', 'feature222', 'feature223', 'feature260', 'feature268', 'feature275', 'feature286', 'feature288', 'feature290', 'feature303', 'feature304', 'feature305', 'feature336', 'feature342', 'feature345', 'feature346', 'fea

In [None]:
# Remove the low-correlation columns; the frame is rebound under the same name.
balanced_data_80_20 = balanced_data_80_20.drop(columns=low_corr_features)

In [None]:
# For every column: the share (in percent) of rows taken by its most frequent value.
percentage_df = (
    balanced_data_80_20
    .apply(lambda col: col.value_counts(normalize=True).max() * 100)
    .to_frame('top_value_percentage')
)

# Most dominated columns first.
sorted_percentage_df = percentage_df.sort_values(by='top_value_percentage', ascending=False)
sorted_percentage_df

Unnamed: 0,top_value_percentage
feature48,95.822409
feature372,95.822409
feature412,95.618493
feature74,95.618493
feature805,95.612019
...,...
feature190,0.026973
feature191,0.026973
feature226,0.024815
feature319,0.023736


In [None]:
# Look at the value distribution of one of the most diverse columns.
balanced_data_80_20['feature226'].value_counts()

685298    23
0         17
90113      3
340594     2
286777     2
          ..
654944     1
157187     1
171643     1
462722     1
467556     1
Name: feature226, Length: 92502, dtype: int64

In [None]:
# Candidates for removal: columns where a single value covers more than 80% of the rows.
maybe_to_delete_a_lot_percentage = sorted_percentage_df.loc[
    sorted_percentage_df['top_value_percentage'].gt(80)
]

In [None]:
# Candidate columns plus the label, so correlation with the label can be computed.
correlation_cols = [*maybe_to_delete_a_lot_percentage.index, 'target']

correlation_a_lot_target = balanced_data_80_20.loc[:, correlation_cols]

In [None]:
# Absolute correlation of each candidate column (and the label itself) with the label.
corr_with_target_a_lot = correlation_a_lot_target.corrwith(correlation_a_lot_target['target']).abs()

In [None]:
# Of the dominated columns, pick those whose correlation with the label is below 0.05.
selected_columns = corr_with_target_a_lot.loc[corr_with_target_a_lot.lt(0.05)].index.tolist()

In [None]:
# List the dominated, weakly correlated columns about to be dropped.
print(selected_columns)

['feature48', 'feature372', 'feature412', 'feature74', 'feature805', 'feature716', 'feature718', 'feature717', 'feature728', 'feature848', 'feature337', 'feature767', 'feature294', 'feature741', 'feature932', 'feature417', 'feature79', 'feature382', 'feature828', 'feature331', 'feature706', 'feature557', 'feature204', 'feature44', 'feature416', 'feature373', 'feature1045', 'feature1046', 'feature1047', 'feature1048', 'feature413', 'feature878', 'feature383', 'feature544', 'feature732', 'feature888', 'feature335', 'feature840', 'feature703', 'feature339', 'feature852', 'feature52', 'feature376', 'feature384', 'feature788', 'feature659', 'feature456', 'feature935', 'feature400', 'feature758', 'feature755', 'feature838', 'feature334', 'feature291', 'feature764', 'feature1036', 'feature687', 'feature538', 'feature995', 'feature308', 'feature781', 'feature377', 'feature1063', 'feature1070', 'feature693', 'feature1064', 'feature1071', 'feature1056', 'feature1065', 'feature1072', 'feature415'

In [None]:
# Remove the selected columns; the frame is rebound under the same name.
balanced_data_80_20 = balanced_data_80_20.drop(columns=selected_columns)

In [None]:
# Candidates for removal: columns whose most frequent value covers less than 1% of the rows.
maybe_to_delete_a_small_percentage = sorted_percentage_df.loc[
    sorted_percentage_df['top_value_percentage'].lt(1)
]

In [None]:
# Candidate columns plus the label, so correlation with the label can be computed.
correlation_cols = [*maybe_to_delete_a_small_percentage.index, 'target']

correlation_a_small_target = balanced_data_80_20.loc[:, correlation_cols]

In [None]:
# Absolute correlation of each candidate column (and the label itself) with the label.
corr_with_target_a_small = correlation_a_small_target.corrwith(correlation_a_small_target['target']).abs()

In [None]:
# Inspect the correlations of the high-diversity candidates.
corr_with_target_a_small

feature187    0.114151
feature942    0.157084
feature212    0.172941
feature213    0.172941
feature188    0.112885
feature314    0.112885
feature943    0.179502
feature863    0.049529
feature853    0.049728
feature351    0.049930
feature930    0.152475
feature944    0.198325
feature323    0.026437
feature322    0.026437
feature911    0.026465
feature136    0.077248
feature121    0.073552
feature572    0.027317
feature142    0.079053
feature126    0.075525
feature533    0.028919
feature181    0.095068
feature898    0.050520
feature896    0.144755
feature909    0.145540
feature320    0.145590
feature532    0.075164
feature952    0.179395
feature951    0.166667
feature207    0.081787
feature206    0.081894
feature4      0.154964
feature934    0.140450
feature953    0.195863
feature201    0.079977
feature189    0.135147
feature190    0.143867
feature191    0.145038
feature226    0.111392
feature319    0.132404
feature954    0.210768
target        1.000000
dtype: float64

In [None]:
# Of the high-diversity columns, pick those whose correlation with the label is below 0.05.
selected_columns = corr_with_target_a_small.loc[corr_with_target_a_small.lt(0.05)].index.tolist()

In [None]:
# List the high-diversity, weakly correlated columns about to be dropped.
print(selected_columns)

['feature863', 'feature853', 'feature351', 'feature323', 'feature322', 'feature911', 'feature572', 'feature533']


In [None]:
# Remove the selected columns; the frame is rebound under the same name.
balanced_data_80_20 = balanced_data_80_20.drop(columns=selected_columns)

## Удаление высококоррелированных признаков

In [None]:
# Absolute pairwise correlation matrix of the remaining columns.
corr_matrix = balanced_data_80_20.corr().abs()

In [None]:
# Correlation threshold above which two columns count as redundant.
threshold = 0.8

# Positional (i, j) pairs with i < j whose absolute correlation exceeds the threshold.
# Built from one vectorised mask over the upper triangle rather than a Python
# double loop of scalar .iloc lookups; np.nonzero walks the mask row by row,
# so the pairs come out in the same order as the nested loop produced.
# NaN correlations compare False and are therefore never selected.
high_corr_mask = np.triu(np.abs(corr_matrix.to_numpy()) > threshold, k=1)
high_corr_pairs = [(int(i), int(j)) for i, j in zip(*np.nonzero(high_corr_mask))]

# Filled by the next cell with the positions of the columns to remove.
columns_to_drop = set()

In [None]:
# Position of the label inside the correlation matrix, looked up by name so the
# comparison below does not depend on 'target' happening to be the first column.
target_pos = corr_matrix.columns.get_loc('target')

for column1, column2 in high_corr_pairs:
    corr_with_target1 = abs(corr_matrix.iloc[column1, target_pos])
    corr_with_target2 = abs(corr_matrix.iloc[column2, target_pos])

    # Keep the feature that is more strongly correlated with the label;
    # on a tie (or a NaN correlation) the first column of the pair is dropped.
    if corr_with_target1 > corr_with_target2:
        columns_to_drop.add(column2)
    else:
        columns_to_drop.add(column1)

In [None]:
# Translate the collected positions into column names. The positions index the
# correlation matrix, so they are resolved against corr_matrix.columns (which can
# differ from the frame's columns if .corr() skipped any column), in sorted
# order so the resulting list is deterministic.
dropped_multy_columns = corr_matrix.columns[sorted(columns_to_drop)].tolist()

In [None]:
# List the redundant columns about to be dropped.
print(dropped_multy_columns)

['feature3', 'feature4', 'feature5', 'feature7', 'feature8', 'feature9', 'feature10', 'feature11', 'feature13', 'feature14', 'feature15', 'feature16', 'feature17', 'feature18', 'feature19', 'feature21', 'feature24', 'feature25', 'feature29', 'feature30', 'feature32', 'feature46', 'feature47', 'feature50', 'feature51', 'feature54', 'feature55', 'feature62', 'feature63', 'feature67', 'feature88', 'feature89', 'feature90', 'feature96', 'feature98', 'feature101', 'feature102', 'feature103', 'feature104', 'feature105', 'feature108', 'feature109', 'feature114', 'feature118', 'feature119', 'feature120', 'feature121', 'feature122', 'feature126', 'feature129', 'feature130', 'feature134', 'feature135', 'feature136', 'feature138', 'feature141', 'feature142', 'feature148', 'feature149', 'feature150', 'feature151', 'feature156', 'feature157', 'feature165', 'feature172', 'feature177', 'feature180', 'feature181', 'feature182', 'feature185', 'feature187', 'feature188', 'feature189', 'feature190', 'fea

In [None]:
# Final feature set: the 80/20 sample without the redundant, highly correlated columns.
balanced_data_80_20_filtered = balanced_data_80_20.drop(columns=dropped_multy_columns)

In [None]:
# Persist the filtered 80/20 sample to the working directory.
balanced_data_80_20_filtered.to_parquet('balanced_data_80_20_new.parquet', engine='pyarrow')