In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/yapr-hack/sample_submission.csv
/kaggle/input/yapr-hack/features_types.json
/kaggle/input/yapr-hack/features_oot.parquet
/kaggle/input/yapr-hack/dataset_train.parquet


In [2]:
#Читаем доки
df=pd.read_parquet('/kaggle/input/yapr-hack/dataset_train.parquet')
test = pd.read_parquet('/kaggle/input/yapr-hack/features_oot.parquet')

In [3]:
#Так как в условии задачи тестовая выборка будет состоять только из клиентов из channel_name=3 то принято решение удалить оставшиеся,
#чтобы уменьшить объем датасета для дальнейших манипуляций
df = df[df['channel_name'] == '3']

In [4]:
#Удаляем неинформативные строки
df=df.dropna(axis=1, how='all').drop(['id','channel_name'],axis=1)

In [5]:
#Разделяем обучающую выборку на фичи и таргеты
features = df.drop(['target'],axis=1)
target = df['target']
features.shape

(209022, 2757)

In [6]:
#Заполняем пропущенные значения фичей
#Так как данные закодированы, в целях обработки с минимальным искажением данных было принято решение заполнить пропуски модой
from sklearn.impute import SimpleImputer
f_most_freq = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
features_filled = f_most_freq.fit_transform(features)
features_fill_df = pd.DataFrame(features_filled, columns=features.columns)

In [7]:
#Отбор 1000 фичей методом RFE с шагом в 150 фичей с целью перераспределения важности признаков с каждой итерацией
from sklearn.feature_selection import RFE
from lightgbm import LGBMClassifier  


model = LGBMClassifier()

rfe = RFE(model, n_features_to_select=1000, step=150, verbose=3)  

rfe.fit(features_fill_df, target)


Fitting estimator with 2757 features.
Fitting estimator with 2607 features.
Fitting estimator with 2457 features.
Fitting estimator with 2307 features.
Fitting estimator with 2157 features.
Fitting estimator with 2007 features.
Fitting estimator with 1857 features.
Fitting estimator with 1707 features.
Fitting estimator with 1557 features.
Fitting estimator with 1407 features.
Fitting estimator with 1257 features.
Fitting estimator with 1107 features.


In [8]:
#Оставляем 1000 фичей
selected_features_mask = rfe.support_
selected_features = features_fill_df.loc[:, selected_features_mask]

In [9]:
selected_features.shape

(209022, 1000)

In [10]:
#Проходим сеткой по гиперпараметрам
from sklearn.model_selection import RandomizedSearchCV
param_grid = {
    'num_leaves': list(np.arange(30, 150, 10)),
    'learning_rate': list(np.arange(0.001, 0.1, 0.01)) ,
    'max_depth': list(np.arange(3, 15, 2))  
}

lgbm = LGBMClassifier()

r_grid = RandomizedSearchCV(estimator=lgbm, param_distributions=param_grid, cv=3, scoring='roc_auc', verbose=2, n_jobs=-1)

r_grid.fit(selected_features, target)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] END ...learning_rate=0.001, max_depth=11, num_leaves=70; total time=  39.4s
[CV] END ...learning_rate=0.001, max_depth=11, num_leaves=70; total time=  38.7s
[CV] END ...learning_rate=0.001, max_depth=11, num_leaves=70; total time=  38.5s
[CV] END learning_rate=0.040999999999999995, max_depth=9, num_leaves=130; total time=  47.3s
[CV] END learning_rate=0.040999999999999995, max_depth=9, num_leaves=130; total time=  47.0s
[CV] END learning_rate=0.040999999999999995, max_depth=9, num_leaves=130; total time=  47.1s
[CV] END ....learning_rate=0.011, max_depth=3, num_leaves=40; total time=  27.8s
[CV] END ....learning_rate=0.011, max_depth=3, num_leaves=40; total time=  27.8s
[CV] END ....learning_rate=0.011, max_depth=3, num_leaves=40; total time=  27.8s
[CV] END learning_rate=0.05099999999999999, max_depth=11, num_leaves=90; total time=  44.6s
[CV] END learning_rate=0.05099999999999999, max_depth=11, num_leaves=90; total tim

In [11]:
# Вывод лучших гиперпараметров и их оценка по метрике
best_params = r_grid.best_params_
best_score = r_grid.best_score_

print("Лучшие гиперпараметры:", best_params)
print("Лучшая оценка (ROC AUC):", best_score)

Лучшие гиперпараметры: {'num_leaves': 40, 'max_depth': 3, 'learning_rate': 0.071}
Лучшая оценка (ROC AUC): 0.6691824853580307


In [16]:
#Обучение модели
import lightgbm as lgb
lgb_params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'num_leaves': best_params['num_leaves'],
    'learning_rate': best_params['learning_rate'],
    'max_depth': best_params['max_depth']
}


best_lgbm = lgb.train(lgb_params, lgb.Dataset(selected_features, label=target))

[LightGBM] [Info] Number of positive: 3372, number of negative: 205650
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 90035
[LightGBM] [Info] Number of data points in the train set: 209022, number of used features: 987
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.016132 -> initscore=-4.110670
[LightGBM] [Info] Start training from score -4.110670


In [13]:
#Подготовка тестовых данных
common_columns = selected_features.columns.intersection(test.columns) 
test_prep = test[common_columns]  

In [17]:
#Предсказание на тесте и преобразование в виде индекс-вероятность исхода "1"
probabilities_test = best_lgbm.predict(test_prep)
probability_df = pd.DataFrame(data=probabilities_test)
probability_df.reset_index(inplace=True)
probability_df.columns = ['id', 'target']

In [19]:
#Сохранение данных в csv-файл 
probability_df.to_csv('submission_yapr_r1.csv', index=False)