In [1]:
import numpy as np
import pandas as pd
# from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import copy
import math
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pickle
import random
from sklearn.metrics import roc_auc_score
import optuna
import catboost as cb
import warnings
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
warnings.filterwarnings('ignore')
from tqdm.auto import tqdm

In [31]:
import lightgbm as lgb
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle
from sklearn.metrics import roc_auc_score
import optuna
from tqdm import tqdm

class Boosting:
    """LightGBM multiclass wrapper: training, Optuna tuning, persistence and feature diagnostics.

    Parameters
    ----------
    X_train, X_val : pandas.DataFrame
        Training / validation features; columns are selected by name.
    y_train, y_val : array-like
        Class labels. The default ``num_class`` is the number of distinct
        values in ``y_train``.
    cat_features : list of str
        Names of the columns passed to LightGBM as categorical features.
    params : dict, optional
        LightGBM training parameters. When ``None`` the built-in defaults are
        used by every method that trains a model.

    Attributes
    ----------
    model : lgb.Booster or None
        Set by ``train`` / ``load_model`` only; tuning and diagnostics never
        replace it.
    best_params : dict or None
        Best hyper-parameters found by ``optimize_hyperparams``.
    top_features : numpy.ndarray or None
        Feature names sorted by split importance, set by ``show_feats_imp``.
    """

    def __init__(self, X_train, X_val, y_train, y_val, cat_features, params=None):
        print("Init...")
        self.X_train = X_train
        self.X_val = X_val
        self.y_train = y_train
        self.y_val = y_val
        self.cat_features = cat_features
        self.params = params
        self.model = None
        self.best_params = None
        self.top_features = None
        print("Init Finished!")

    # ------------------------------------------------------------------ helpers

    def _default_params(self):
        """Return the default LightGBM parameters used when ``params`` was not given."""
        return {
            'objective': 'multiclass',
            'num_class': len(np.unique(self.y_train)),
            'metric': 'multi_logloss',
            'verbosity': 10,
            'boosting_type': 'gbdt',
            'num_leaves': 31,
            'learning_rate': 0.2,
            'max_depth': 6,
            'feature_fraction': 0.9,
            'bagging_fraction': 0.8,
            'bagging_freq': 5,
            'num_threads': -1
        }

    def _resolve_params(self, quiet=False):
        """Return the user-supplied params, or the defaults (silenced when ``quiet``)."""
        if self.params is not None:
            return self.params
        params = self._default_params()
        if quiet:
            # The default verbosity of 10 would flood the output inside loops.
            params['verbosity'] = -1
        return params

    def _require_model(self):
        """Raise ``ValueError`` when no model has been trained or loaded yet."""
        if self.model is None:
            raise ValueError("Model not found!")

    def _make_datasets(self, columns=None):
        """Build the (train, validation) ``lgb.Dataset`` pair, optionally on a column subset."""
        if columns is None:
            X_train, X_val, cats = self.X_train, self.X_val, self.cat_features
        else:
            columns = list(columns)
            X_train, X_val = self.X_train[columns], self.X_val[columns]
            cats = [feat for feat in self.cat_features if feat in columns]

        train_data = lgb.Dataset(X_train, label=self.y_train, categorical_feature=cats)
        val_data = lgb.Dataset(X_val, label=self.y_val, categorical_feature=cats, reference=train_data)
        return train_data, val_data

    @staticmethod
    def _fit(params, train_data, val_data, num_boost_round=1000, early_stopping_rounds=100, log_period=None):
        """Train one booster with early stopping; log every ``log_period`` rounds if given.

        Early stopping and logging are configured through callbacks: the
        ``early_stopping_rounds`` / ``verbose_eval`` keyword arguments of
        ``lgb.train`` were removed in LightGBM 4.0.
        """
        verbose = log_period is not None
        callbacks = [lgb.early_stopping(early_stopping_rounds, verbose=verbose)]
        if verbose:
            callbacks.append(lgb.log_evaluation(period=log_period))
        return lgb.train(params,
                         train_data,
                         num_boost_round=num_boost_round,
                         valid_sets=[train_data, val_data],
                         callbacks=callbacks)

    # --------------------------------------------------------------- public API

    def train(self, num_boost_round=100, early_stopping_rounds=100, log_period=20):
        """Train the model on all features and store it in ``self.model``.

        Parameters
        ----------
        num_boost_round : int
            Maximum number of boosting rounds.
        early_stopping_rounds : int
            Stop when the validation metric has not improved for this many rounds.
        log_period : int
            Print the evaluation metrics every ``log_period`` rounds.
        """
        print("Training...")
        train_data, val_data = self._make_datasets()
        params = self._resolve_params()

        print("Training with params:", params)

        self.model = self._fit(params, train_data, val_data,
                               num_boost_round=num_boost_round,
                               early_stopping_rounds=early_stopping_rounds,
                               log_period=log_period)
        print("Training finished.")

    def optimize_hyperparams(self, n_trials=10, timeout=60):
        """Search LightGBM hyper-parameters with Optuna, maximising validation ROC AUC (OvR).

        The best parameters are stored in ``self.best_params`` and returned.
        Trial models are local to the search, so ``self.model`` is left untouched.

        Parameters
        ----------
        n_trials : int
            Maximum number of Optuna trials.
        timeout : float
            Time budget for the study, in seconds.
        """

        def objective(trial):
            params = {
                'objective': 'multiclass',
                'num_class': len(np.unique(self.y_train)),
                'verbosity': -1,
                'boosting_type': 'gbdt',
                'num_leaves': trial.suggest_int('num_leaves', 2, 50),
                # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
                'learning_rate': trial.suggest_float('learning_rate', 1e-5, 1e0, log=True),
                'feature_fraction': trial.suggest_float('feature_fraction', 0.1, 1.0),
                'bagging_fraction': trial.suggest_float('bagging_fraction', 0.1, 1.0),
                'bagging_freq': trial.suggest_int('bagging_freq', 1, 10),
                'num_threads': -1
            }

            train_data, val_data = self._make_datasets()
            trial_model = self._fit(params, train_data, val_data)

            y_pred = trial_model.predict(self.X_val)
            return roc_auc_score(self.y_val, y_pred, multi_class='ovr')

        study = optuna.create_study(pruner=optuna.pruners.MedianPruner(n_warmup_steps=5), direction="maximize")
        study.optimize(objective, n_trials=n_trials, timeout=timeout)

        self.best_params = study.best_params

        print("Best params:", self.best_params)
        return self.best_params

    def load_model(self, file_path):
        """Load a saved LightGBM booster from ``file_path`` into ``self.model``."""
        self.model = lgb.Booster(model_file=file_path)

    def save_model(self, file_path):
        """Save ``self.model`` to ``file_path``; raises ``ValueError`` if there is no model."""
        self._require_model()
        self.model.save_model(file_path)

    def show_feats_imp(self):
        """Plot split importances and store the features, most important first, in ``self.top_features``.

        Raises
        ------
        ValueError
            If no model has been trained or loaded.
        """
        self._require_model()

        lgb.plot_importance(self.model, figsize=(15, 10), importance_type='split')
        plt.show()

        feature_importance = pd.DataFrame({
            'feature': self.model.feature_name(),
            'importance': self.model.feature_importance(importance_type='split')
        })
        self.top_features = feature_importance.sort_values(by='importance', ascending=False)['feature'].values
        print(self.top_features)

    def top_feats_selection(self):
        """Train on the top-1, top-2, ... features and report train/validation ROC AUC for each K.

        Plots both curves, writes them to ``lgbm_top_features.xlsx`` and returns
        the table. ``self.model`` is not modified.

        Raises
        ------
        ValueError
            If ``show_feats_imp`` has not been called yet (``top_features`` is unset).
        """
        if self.top_features is None:
            raise ValueError("Top features not found! Call show_feats_imp() first.")

        params = self._resolve_params(quiet=True)
        top = []
        roc_tr = []
        roc_val = []

        for col in tqdm(self.top_features):
            top.append(col)

            train_data, val_data = self._make_datasets(top)
            subset_model = self._fit(params, train_data, val_data)

            y_train_pred = subset_model.predict(self.X_train[top])
            y_val_pred = subset_model.predict(self.X_val[top])

            roc_tr.append(roc_auc_score(self.y_train, y_train_pred, multi_class='ovr'))
            roc_val.append(roc_auc_score(self.y_val, y_val_pred, multi_class='ovr'))

        # The k-th point was trained on k features, so the axis starts at 1.
        n_feats = range(1, len(roc_tr) + 1)
        plt.figure(figsize=(15, 10))
        plt.plot(n_feats, roc_tr, marker='o', label='Train')
        plt.plot(n_feats, roc_val, marker='o', label='Valid')
        plt.xlabel("Number of Top Features")
        plt.ylabel("ROC AUC")
        plt.title("ROC AUC on Top-K Features")
        plt.legend()
        plt.show()

        stats = pd.DataFrame({
            "TRAIN": roc_tr,
            "VALID": roc_val
        })

        stats.to_excel("lgbm_top_features.xlsx", index=False)
        return stats

    def one_factor_roc(self):
        """Train one model per single feature and report its train/validation ROC AUC.

        Plots the scores as grouped bars, writes them to
        ``lgbm_one_factor_roc.xlsx`` and returns the table. ``self.model`` is
        not modified.
        """
        params = self._resolve_params(quiet=True)
        records = []

        for feature in tqdm(self.X_train.columns):
            train_data, val_data = self._make_datasets([feature])
            single_model = self._fit(params, train_data, val_data)

            y_train_pred = single_model.predict(self.X_train[[feature]])
            y_val_pred = single_model.predict(self.X_val[[feature]])

            records.append({
                'features': feature,
                'train': roc_auc_score(self.y_train, y_train_pred, multi_class='ovr'),
                'valid': roc_auc_score(self.y_val, y_val_pred, multi_class='ovr')
            })

        # Build the frame once: DataFrame.append was removed in pandas 2.0 and
        # row-by-row growth is quadratic.
        story = pd.DataFrame(records, columns=['features', 'train', 'valid'])

        # Side-by-side bars; the previous center/edge alignment made them overlap.
        positions = np.arange(len(story))
        bar_width = 0.4
        plt.figure(figsize=(10, 7))
        plt.bar(positions - bar_width / 2, story['train'], width=bar_width, label='Train')
        plt.bar(positions + bar_width / 2, story['valid'], width=bar_width, label='Valid')
        plt.xlabel("Features")
        plt.ylabel("ROC-AUC")
        plt.title("One-Factor ROC-AUC")
        plt.xticks(positions, story['features'], rotation=45)
        plt.legend()
        plt.tight_layout()
        plt.show()
        story.to_excel("lgbm_one_factor_roc.xlsx", index=False)
        return story




In [2]:
# Read the train and test tables from the Parquet files in the working directory.
train_df, test_df = (
    pd.read_parquet(path) for path in ("train_data.pqt", "test_data.pqt")
)

Заполняем пропуски в `start_cluster` значением из 5-го месяца (forward fill).

In [3]:
# Forward-fill missing `start_cluster` values within each client (`id`) only, so a
# gap in a client's first row is never filled from the previous client's rows.
test_df['start_cluster'] = test_df.groupby('id')['start_cluster'].ffill()

Вытягиваем данные в одну широкую строку по id: к каждой записи присоединяем признаки первой и второй записи клиента.

In [4]:
# Widen the test table: attach to every row the client's first-record and
# second-record features under the `_first` / `_second` suffixes.
# NOTE: GroupBy.first() returns the first non-null value of each column, which is
# not necessarily the first row of the group.
grouped_df_first = test_df.groupby('id').first().reset_index()
merged_df = pd.merge(test_df, grouped_df_first, on='id', suffixes=('', '_first'))

# Pick the second row of each id with cumcount(): unlike nth(1).reset_index(), it
# keeps `id` as a regular column in every pandas version and never adds a stray
# `index` column to the merged features.
grouped_df_second = test_df[test_df.groupby('id').cumcount() == 1]
merged_df = pd.merge(merged_df, grouped_df_second, on='id', suffixes=('', '_second'))

merged_df = merged_df.loc[:, ~merged_df.columns.duplicated()]

In [5]:
# Keep only the month to predict. `.copy()` makes test_df an independent frame, so
# later column assignments do not write into a slice of merged_df.
test_df = merged_df[merged_df['date'] == 'month_6'].copy()

In [6]:
# Widen the train table: attach to every row the client's first-record and
# second-record features under the `_first` / `_second` suffixes.
# NOTE: GroupBy.first() returns the first non-null value of each column, which is
# not necessarily the first row of the group.
grouped_df_first = train_df.groupby('id').first().reset_index()
merged_df = pd.merge(train_df, grouped_df_first, on='id', suffixes=('', '_first'))

# Pick the second row of each id with cumcount(): unlike nth(1).reset_index(), it
# keeps `id` as a regular column in every pandas version and never adds a stray
# `index` column to the merged features.
grouped_df_second = train_df[train_df.groupby('id').cumcount() == 1]
merged_df = pd.merge(merged_df, grouped_df_second, on='id', suffixes=('', '_second'))

merged_df = merged_df.loc[:, ~merged_df.columns.duplicated()]

In [7]:
# Keep only the month_3 rows for training. `.copy()` makes train_df an independent
# frame, so later column assignments do not write into a slice of merged_df.
train_df = merged_df[merged_df['date'] == 'month_3'].copy()

In [8]:
# Encode every cluster column with ONE shared label -> code mapping.
# Refitting the encoder per column / per frame (as before) only yields matching
# codes when each column happens to contain exactly the same set of labels;
# otherwise train and test (and start vs. end clusters) get different codes.
train_cluster_cols = [
    'start_cluster', 'end_cluster',
    'start_cluster_first', 'start_cluster_second',
    'end_cluster_first', 'end_cluster_second',
]
test_cluster_cols = ['start_cluster', 'start_cluster_first', 'start_cluster_second']

label_encoder = LabelEncoder()
label_encoder.fit(pd.concat(
    [train_df[col] for col in train_cluster_cols]
    + [test_df[col] for col in test_cluster_cols],
    ignore_index=True,
))

for col in train_cluster_cols:
    train_df[col] = label_encoder.transform(train_df[col])

for col in test_cluster_cols:
    test_df[col] = label_encoder.transform(test_df[col])

# label -> integer code, valid for every encoded column above.
category_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))

In [9]:
# Display the cluster label -> integer code mapping.
category_mapping

{'{other}': 0,
 '{}': 1,
 '{α, β}': 2,
 '{α, γ}': 3,
 '{α, δ}': 4,
 '{α, ε, η}': 5,
 '{α, ε, θ}': 6,
 '{α, ε, ψ}': 7,
 '{α, ε}': 8,
 '{α, η}': 9,
 '{α, θ}': 10,
 '{α, λ}': 11,
 '{α, μ}': 12,
 '{α, π}': 13,
 '{α, ψ}': 14,
 '{α}': 15,
 '{λ}': 16}

Веса классов для взвешенного ROC AUC, которые идут в бустинг.

In [10]:
def weighted_roc_auc(y_true, y_pred, labels, weights_dict):
    """Weighted mean of the per-class one-vs-rest ROC AUC scores.

    Parameters
    ----------
    y_true : true class labels, forwarded to ``roc_auc_score``.
    y_pred : per-class scores, forwarded to ``roc_auc_score``.
    labels : class labels; also fixes the order in which the weights are read.
    weights_dict : mapping label -> unnormalised weight.

    Returns
    -------
    Sum of the per-class AUCs, each multiplied by its weight after the weights
    have been normalised to sum to 1.
    """
    raw = np.array([weights_dict[name] for name in labels])
    per_class_auc = roc_auc_score(
        y_true, y_pred, labels=labels, multi_class="ovr", average=None
    )
    normalised = raw / raw.sum()
    return sum(normalised * per_class_auc)

In [11]:
# Read the per-cluster weights and index them by cluster label.
cluster_weights = pd.read_excel("cluster_weights.xlsx")
cluster_weights = cluster_weights.set_index("cluster")

# cluster label -> unnormalised weight
weights_dict = cluster_weights["unnorm_weight"].to_dict()

In [12]:
# Class weights ordered by integer class code. The list is sized from the mapping
# instead of a hard-coded 17, so it follows the actual number of classes.
weits = [0] * len(category_mapping)

for label, code in category_mapping.items():
    weits[code] = weights_dict[label]

In [13]:
# Display the class weights, indexed by integer class code.
weits

[3, 1, 3, 3, 3, 1, 1, 3, 2, 2, 1, 3, 2, 1, 3, 2, 2]

Обучаем модель.

In [14]:
# Base categorical columns of the raw tables.
cat_cols = [
    "channel_code",
    "city",
    "city_type",
    "okved",
    "segment",
    "start_cluster",
    "index_city_code",
    "ogrn_month",
    "ogrn_year",
]

# Every base column plus its `_first` and `_second` copies, grouped by suffix.
modified_cat_cols = [
    f"{name}{suffix}"
    for suffix in ("", "_first", "_second")
    for name in cat_cols
]

print(modified_cat_cols)

['channel_code', 'city', 'city_type', 'okved', 'segment', 'start_cluster', 'index_city_code', 'ogrn_month', 'ogrn_year', 'channel_code_first', 'city_first', 'city_type_first', 'okved_first', 'segment_first', 'start_cluster_first', 'index_city_code_first', 'ogrn_month_first', 'ogrn_year_first', 'channel_code_second', 'city_second', 'city_type_second', 'okved_second', 'segment_second', 'start_cluster_second', 'index_city_code_second', 'ogrn_month_second', 'ogrn_year_second']


In [15]:
# Turn every categorical column (base, `_first` and `_second`) into strings,
# marking missing values as 'missing'.
# The missing mask is taken BEFORE the string cast: casting first turns NaN into
# the literal string 'nan', after which fillna('missing') has nothing to fill.
for frame in (train_df, test_df):
    raw_cats = frame[modified_cat_cols]
    frame[modified_cat_cols] = (
        raw_cats.astype(object)
        .where(raw_cats.notna(), 'missing')
        .astype('str')
    )

# The target is kept as a string as well.
train_df['end_cluster'] = train_df['end_cluster'].astype('str')

# The feature list is intentionally not built here: the earlier draft
# (drop of id / date / end_cluster only) still contained the end_cluster_first
# and end_cluster_second target copies.

In [16]:
# Model inputs: every column except identifiers, dates and the end_cluster
# columns (the target and its `_first` / `_second` copies).
# Index.drop keeps the frame's own column order; list(set(...)) ordered the
# features by string hash, which changes between interpreter runs and makes
# feature-subsampled training irreproducible.
features = train_df.columns.drop(
    ["id", "date", "end_cluster", "end_cluster_first", "end_cluster_second", "date_first", "date_second"]
).tolist()

In [17]:
# Display the number of model input columns.
len(features)

270

In [18]:
# Categorical model inputs: the base categorical columns plus their
# `_first` / `_second` copies (same list object as modified_cat_cols).
cat_features = modified_cat_cols

In [19]:
# Every feature that is not categorical, in the same order as `features`
# (a set difference would give a hash-dependent order).
numerical_features = [col for col in features if col not in cat_features]

In [20]:
# Target classes ordered from most to least frequent.
# Reading the labels from the value_counts() index works in every pandas version;
# `.reset_index()['index']` raises KeyError on pandas >= 2.0, where that column is
# named after the series instead of 'index'.
classes = train_df['end_cluster'].value_counts().index.tolist()

In [21]:
# Feature matrix. `.copy()` detaches it from train_df, because its columns are
# re-typed by assignment right afterwards.
X = train_df[features].copy()

In [22]:
# Give every categorical feature the pandas `category` dtype.
X = X.assign(**{name: pd.Categorical(X[name]) for name in cat_features})

In [23]:
# Cast all non-categorical features to float.
# NOTE(review): this assumes every column in numerical_features is convertible to
# float; a leftover text column would raise here — confirm against the data.
X[numerical_features] = X[numerical_features].astype('float')

In [24]:
# Target: end_cluster as a pandas categorical series.
y = train_df['end_cluster'].astype('category')

In [25]:
# Hold out 20% of the rows for validation; the fixed random_state makes the split
# repeatable.
# NOTE(review): the split is not stratified, so a rare class could be missing from
# y_train — consider stratify=y if that matters for num_class.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [26]:
# Report the container type of y_train
print("Data type of y_train:", type(y_train))

# Anything that is neither an ndarray nor a Series becomes a numpy array
y_train = y_train if isinstance(y_train, (np.ndarray, pd.Series)) else np.array(y_train)

# Cast the labels to int unless the dtype is already int
if y_train.dtype != 'int':
    y_train = y_train.astype('int')

Data type of y_train: <class 'pandas.core.series.Series'>


In [27]:
# Report the container type of y_val (the message used to say y_train)
print("Data type of y_val:", type(y_val))

# Convert y_val to a numpy array if it is neither an ndarray nor a Series
if not isinstance(y_val, (np.ndarray, pd.Series)):
    y_val = np.array(y_val)

# Cast the labels to int unless the dtype is already int
if y_val.dtype != 'int':
    y_val = y_val.astype('int')

Data type of y_train: <class 'pandas.core.series.Series'>


In [33]:
# Wrap the train / validation split in the Boosting helper; no params are passed,
# so its default LightGBM parameters are used for training.
boosting_model = Boosting(
    X_train=X_train,
    X_val=X_val,
    y_train=y_train,
    y_val=y_val,
    cat_features=cat_features,
)

Init...
Init Finished!


In [None]:
# Train the LightGBM model; the fitted booster is stored in boosting_model.model.
boosting_model.train()

Training...
Training with params: {'objective': 'multiclass', 'num_class': 17, 'metric': 'multi_logloss', 'verbosity': 10, 'boosting_type': 'gbdt', 'num_leaves': 31, 'learning_rate': 0.2, 'max_depth': 6, 'feature_fraction': 0.9, 'bagging_fraction': 0.8, 'bagging_freq': 5, 'num_threads': -1}
[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.835568
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.158925
[LightGBM] [Debug] init for col-wise cost 3.795047 seconds, init for row-wise cost 3.609013 seconds
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Dense Multi-Val Bin
[LightGBM] [Info] Total Bins 57246
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 270
[LightGBM] [Info] Start training from score -2.488915
[LightGBM] [Info] Start training from score -1.570157
[LightGBM] [Info] Start training from score -3

In [46]:
# Rows to predict. `.copy()` detaches the frame from test_df, because its columns
# are re-typed by assignment right afterwards.
test = test_df[test_df['date'] == 'month_6'].copy()

In [48]:
# Apply the same dtypes as the training matrix: `category` for the categorical
# features, float for everything else.
test = test.assign(**{name: pd.Categorical(test[name]) for name in cat_features})

test[numerical_features] = test[numerical_features].astype('float')

In [51]:
# Predict with the trained booster on the test features, using the same column
# list as for training.
ans = boosting_model.model.predict(test[features])

In [52]:
# Integer class code -> cluster label. Assumed to match the encoding of
# end_cluster used for training (it mirrors the displayed category_mapping).
# The former `data` dict was removed: it zipped class names with the ROWS of
# `ans` (one row per client), not with its per-class columns, and was never used.
column_mapping = {
    0: '{other}',
    1: '{}',
    2: '{α, β}',
    3: '{α, γ}',
    4: '{α, δ}',
    5: '{α, ε, η}',
    6: '{α, ε, θ}',
    7: '{α, ε, ψ}',
    8: '{α, ε}',
    9: '{α, η}',
    10: '{α, θ}',
    11: '{α, λ}',
    12: '{α, μ}',
    13: '{α, π}',
    14: '{α, ψ}',
    15: '{α}',
    16: '{λ}'
}

# One probability column per class is required; otherwise the rename below would
# silently leave columns unnamed.
assert ans.shape[1] == len(column_mapping), (
    f"expected {len(column_mapping)} class columns, got {ans.shape[1]}"
)

output = (
    pd.DataFrame(ans)
    .rename(columns=column_mapping)
    .assign(id=test['id'].tolist())
)

# Write the columns in the order required by the sample submission.
sample_submission_df = pd.read_csv("sample_submission.csv")
output[list(sample_submission_df.columns)].to_csv('ans_lgb.csv', index=False)