Эксперименты, Пайплайн обработки данных, Построение модели
-

Baseline: чистим, кодируем, нормируем, обучаем, оцениваем.

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
#from category_encoders import CatBoostEncoder

df = pd.read_csv('dataset/train_ver2.csv')
df = df.drop(['tipodom'], axis=1)
df_last_month = df[df['fecha_dato'] == "2016-05-28"]
products = [x for x in df.columns if '_ult1' in x]

def baseline_data_transform(df: pd.DataFrame):
    
    def remove_outliers(data: pd.DataFrame, outliers_columns: list):
        num_cols = outliers_columns
        threshold = 1.5
        #potential_outliers = pd.DataFrame()

        for col in num_cols:
            Q1 = data[col].quantile(0.25)
            Q3 = data[col].quantile(0.75)
            IQR = Q3 - Q1
            margin = threshold * IQR
            lower = Q1 - margin
            upper = Q3 + margin
            #potential_outliers[col] = ~data[col].between(lower, upper)
            data = data[data[col].between(lower, upper)]
        #return potential_outliers
        return data

    # берём только последний месяц
    df = df[df['fecha_dato'] == "2016-05-28"]

    # удалить столбцы
    df = df.drop(['fecha_dato', 'ncodpers', 'ult_fec_cli_1t', 'conyuemp'], axis=1)

    # удалить строки
    for col in df.columns.to_list():
        if col not in ['renta']:
            df = df[df[col].notna()]
    
    # заполнить пропуски в ренте нулями
    df['renta'] = df['renta'].fillna(0.0)

    # age строки перевести в int, почистить от ' NA'
    #TODO
    df['age'] = df['age'].astype(int)

    # fecha_alta перевести в int число секунд с начала эпохи
    df['fecha_alta'] = pd.to_datetime(df['fecha_alta']).astype(int) / 10**9
    
    # antiguedad строки конвертировать в int, и почистить от значений типа '     NA', -999999.
    #TODO
    df = df[~df['antiguedad'] < 0]
    df['antiguedad'] = df['antiguedad'].astype(int)

    # indrel_1mes намешаны разные типы, надо привести к одному типу 2.0 '2.0' '2' итп
    #TODO
    # решил в baseline удалить столбец
    df = df.drop(['indrel_1mes'], axis=1)

    # убираем выбросы
    df = remove_outliers(df, ['age', 'renta', 'fecha_alta'])
    
    return df

data = baseline_data_transform(df_last_month)

  df = pd.read_csv('dataset/train_ver2.csv')


In [3]:
X_tr, X_val, y_tr, y_val = train_test_split(
    data.drop(products, axis=1),
    data[products],
    #stratify=data[products],
    random_state=42
)

categ_binary_vals = ['sexo', 'ind_nuevo', 'indrel', 'indresi', 'indext', 'indfall', 'ind_actividad_cliente']
categ_vals = ['ind_empleado', 'pais_residencia', 'tiprel_1mes', 'canal_entrada', 'cod_prov', 'nomprov', 'segmento'] # indrel_1mes
numeric_vals = ['age', 'fecha_alta', 'antiguedad', 'renta']

# категориальные признаки решил закодировать через признак renta чтобы не было утечки
def encode_cat_via_renta(df: pd.DataFrame):
    global_mean = df['renta'].mean()
    alpha = 100  # Чем больше alpha, тем сильнее сглаживание к global_mean
    for cat in categ_vals:
        # Формула: (mean_in_category * n_samples + global_mean * alpha) / (n_samples + alpha)
        category_stats = df.groupby(cat)['renta'].agg(['mean', 'count'])
        encoded_values = (
            (category_stats['mean'] * category_stats['count'] + global_mean * alpha) / 
            (category_stats['count'] + alpha)
        )
        # Заменяем категориальные значения на закодированные
        df[cat] = df[cat].map(encoded_values)
    scaler = StandardScaler()
    # нормируем
    df[categ_vals] = scaler.fit_transform(df[categ_vals])
    return df

X_tr = encode_cat_via_renta(X_tr)
X_val = encode_cat_via_renta(X_val)

preprocessor = ColumnTransformer([
        ('binary', OneHotEncoder(drop='if_binary'), categ_binary_vals),
        #('cat', ?, categ_vals),
        ('num', StandardScaler(), numeric_vals),
    ],
    verbose_feature_names_out=False,
    remainder='passthrough'
)

X_tr_transformed = preprocessor.fit_transform(X_tr, y_tr)
X_val_transformed = preprocessor.transform(X_val)

In [59]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

model = MultiOutputClassifier(RandomForestClassifier(
    n_estimators=50,  # Уменьшите, если долго (начните с 50)
    verbose=1,         # Вывод лога обучения
    n_jobs=-1          # Использовать все ядра CPU
))
model.fit(X_tr_transformed, y_tr)

predictions = model.predict(X_val_transformed)
print(classification_report(y_val, predictions))

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    7.6s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    8.5s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    4.9s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    5.9s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   28.0s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   32.5s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    7.9s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   10.0s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   23.4s
[

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        23
           1       0.00      0.00      0.00         4
           2       0.78      0.81      0.79    132622
           3       0.00      0.00      0.00        71
           4       0.42      0.12      0.18     17118
           5       0.94      0.92      0.93      1784
           6       0.51      0.26      0.34      1858
           7       0.55      0.35      0.43     22299
           8       0.51      0.16      0.24      7582
           9       0.00      0.00      0.00        70
          10       0.00      0.00      0.00       242
          11       0.55      0.27      0.37      7327
          12       0.55      0.24      0.34     17592
          13       0.34      0.03      0.05      3341
          14       0.47      0.01      0.01      1055
          15       0.31      0.01      0.02      1638
          16       0.75      0.01      0.03       454
          17       0.33    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Смотрим f1-score:
- macro avg = 0.19 - модель плохо справляется с большинством классов.
- weighted avg = 0.50 - с учетом количества в каждом классе модель справляется лучше на классах покрупнее.

In [None]:
import pickle

# Сохраняем
with open('model.pkl', 'wb') as f:
    pickle.dump(model, f)

# Загружаем
# with open('model.pkl', 'rb') as f:
#     model_loaded = pickle.load(f)

In [1]:
import pickle
# Загружаем
with open('model.pkl', 'rb') as f:
    model_loaded = pickle.load(f)

Advanced
-
TODO

Эксперименты:
- переделать датасет так, чтобы признаки n-го месяца лежали с таргетами n+1-го месяца;
- сгенерировать признак - динамика поля `renta` за последние n месяцев;
- 

In [None]:
def remove_duplicates(data):
    feature_cols = data.columns.drop('customer_id').tolist()
    is_duplicated_features = data.duplicated(subset=feature_cols, keep=False)
    data = data[~is_duplicated_features].reset_index(drop=True)
    return data

def fill_missing_values(data):

    cols_with_nans = data.isnull().sum()
    cols_with_nans = cols_with_nans[cols_with_nans > 0].index.drop('end_date')

    for col in cols_with_nans:

        if data[col].dtype in [float, int]:
            fill_value = data[col].mean()
        elif data[col].dtype == 'object':
            fill_value = data[col].mode().iloc[0]

        data[col] = data[col].fillna(fill_value)

    return data

def remove_outliers(data):
    num_cols = data.select_dtypes(['float']).columns
    threshold = 1.5
    potential_outliers = pd.DataFrame()

    for col in num_cols:
        Q1 = data[col].quantile(0.25)
        Q3 = data[col].quantile(0.75)
        IQR = Q3 - Q1
        margin = threshold * IQR
        lower = Q1 - margin
        upper = Q3 + margin
        potential_outliers[col] = ~data[col].between(lower, upper)
    return potential_outliers