# Baseline

In [1]:
import numpy as np
import pandas as pd
# from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import copy
import math
import pickle
from sklearn.metrics import roc_auc_score
import optuna
import catboost as cb
import warnings
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
warnings.filterwarnings('ignore')
from tqdm.auto import tqdm

In [2]:
class Boosting:

    def __init__(self, X_train, X_val, y_train, y_val, cat_features, params = None):
        print("Init...")
        self.X_train = X_train
        self.X_val = X_val
        self.y_train = y_train
        self.y_val = y_val
        self.cat_features = cat_features
        self.params = params
        self.model = None
        self.best_params = None
        self.top_features = None
        self.train_pool = cb.Pool(data = X_train, label = y_train, cat_features = cat_features)
        self.val_pool = cb.Pool(data = X_val, label = y_val, cat_features = cat_features)
        print("Init Finished!")

    def train(self):
        if self.params is None:
            model = cb.CatBoostClassifier(
                # learning_rate = 0.303,
                # depth = 6,
                # l2_leaf_reg = 2.437,
                # random_seed = 42,
                # min_data_in_leaf = 30,
                # one_hot_max_size = 40,
                # colsample_bylevel = 0.079,
                loss_function = 'MultiClass',
                task_type = 'CPU',
                # iterations = 1000,
                use_best_model = True,
                verbose = 100,
                thread_count = -1,
                early_stopping_rounds = 100,
                eval_metric = 'AUC',
                class_weights=weits,
                # boosting_type = 'Plain',
                # bootstrap_type = 'MVS'
            )
        else:
            self.params["verbose"] = 100
            self.params["iterations"] = 1000
            model = cb.CatBoostClassifier(**self.params)

        model.fit(
            self.train_pool,
            eval_set = self.val_pool
        )
        self.model = model
        # y_train_pred = model.predict_proba(self.X_train)[:, 1]
        # y_val_pred = model.predict_proba(self.X_val)[:, 1]

        # roc_auc_tr = roc_auc_score(self.y_train, y_train_pred)
        # roc_auc_val = roc_auc_score(self.y_val, y_val_pred)

        # print("ROC AUC на обучающей выборке:", roc_auc_tr)
        # print("ROC AUC на валидационной выборке:", roc_auc_val)


    def optimize_hyperparams(self):

        def objective(trial):
            params = {
                "objective" : trial.suggest_categorical("objective", ["MultiClass"]),
                "learning_rate" : trial.suggest_loguniform("learning_rate", 1e-5, 1e0),
                "l2_leaf_reg" : trial.suggest_loguniform("l2_leaf_reg", 1e-2, 3e0),
                "colsample_bylevel" : trial.suggest_float("colsample_bylevel", 0.01, 0.1, log = True),
                "depth" : trial.suggest_int("depth", 2, 5),
                "boosting_type" : trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
                "bootstrap_type" : trial.suggest_categorical("bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]),
                "min_data_in_leaf" : trial.suggest_int("min_data_in_leaf", 2, 50),
                "one_hot_max_size" : trial.suggest_int("one_hot_max_size", 2, 50),
                "iterations" : trial.suggest_int("iterations", 500, 3500),
                "eval_metric" : "AUC"
            }

            if params["bootstrap_type"] == "Bayesian":
                params["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
            elif params["bootstrap_type"] == "Bernoulli":
                params["subsample"] = trial.suggest_float("subsample", 0.1, 1, log = True)

            model = cb.CatBoostClassifier(
                loss_function = 'Logloss',
                random_seed = 42,
                task_type = 'CPU',
                use_best_model = True,
                verbose = False,
                **params
            )

            model.fit(
                self.train_pool,
                eval_set = self.val_pool
            )

            y_pred = model.predict_proba(self.X_val)[:, 1]

            roc_auc = roc_auc_score(self.y_val, y_pred)

            return roc_auc

        study = optuna.create_study(pruner=optuna.pruners.MedianPruner(n_warmup_steps = 5), direction = "maximize")
        study.optimize(objective, n_trials = 10, timeout = 60)

        self.best_params = study.best_params

        print("Best params:", self.best_params)

    def load_model(self, file_path):
        with open(file_path, "rb") as f:
            self.model = pickle.load(f)

    def save_model(self, file_path):
        with open(file_path, "wb") as f:
            pickle.dump(self.model, f)

    def show_feats_imp(self):
        if self.model is None:
            raise ValueError("Model not found!")

        feature_importance = self.model.feature_importances_
        sorted_idx = np.argsort(feature_importance)

        plt.figure(figsize=(15, 10))
        plt.barh(range(len(sorted_idx)), feature_importance[sorted_idx], align='center')
        plt.yticks(range(len(sorted_idx)), np.array(self.model.feature_names_)[sorted_idx])
        plt.title("Feature Importance")
        plt.show()

        self.top_features = np.flip(np.array(self.model.feature_names_)[sorted_idx])
        print(self.top_features)

    def top_feats_selection(self):

        top = []
        roc_tr = []
        roc_val = []

        for col in tqdm(self.top_features):

            top.append(col)
            top_cat = list(set(self.cat_features) & set(top))

            train_pool = cb.Pool(data = self.X_train[top], label = self.y_train, cat_features = top_cat)
            val_pool = cb.Pool(data = self.X_val[top], label = self.y_val, cat_features = top_cat)

            if self.params is None:
                model = cb.CatBoostClassifier(
                    learning_rate = 0.303,
                    depth = 6,
                    l2_leaf_reg = 2.437,
                    random_seed = 42,
                    min_data_in_leaf = 30,
                    one_hot_max_size = 40,
                    colsample_bylevel = 0.079,
                    loss_function = 'MultiClass',
                    task_type = 'CPU',
                    iterations = 1000,
                    use_best_model = True,
                    verbose = 100,
                    thread_count = -1,
                    early_stopping_rounds = 100,
                    eval_metric = 'AUC',
                    class_weights=weits,
                    boosting_type = 'Plain',
                    bootstrap_type = 'MVS'
                )
                path = "no_optuna_top_features.xlsx"
            else:
                self.params["verbose"] = 0
                self.params["iterations"] = 500
                path = "optuna_top_features.xlsx"
                model = cb.CatBoostClassifier(**self.params)

            model.fit(
                train_pool,
                eval_set = val_pool
            )

            y_train_pred = model.predict_proba(self.X_train[top])[:, 1]
            y_val_pred = model.predict_proba(self.X_val[top])[:, 1]

            roc_auc_tr = roc_auc_score(self.y_train, y_train_pred)
            roc_auc_val = roc_auc_score(self.y_val, y_val_pred)

            roc_tr.append(roc_auc_tr)
            roc_val.append(roc_auc_val)

        plt.figure(figsize=(15, 10))
        plt.plot(range(len(self.top_features)), roc_tr, marker = 'o', label = 'Train')
        plt.plot(range(len(self.top_features)), roc_val, marker = 'o', label = 'Valid')
        plt.xlabel("Number of Top Features")
        plt.ylabel("ROC AUC")
        plt.title("ROC AUC on Top-K Features")
        plt.legend()
        plt.show()

        stats = pd.DataFrame({
            "TRAIN" : roc_tr,
            "VALID" : roc_val
        })

        stats.to_excel(path, index = False)

    def one_factor_roc(self):
        story = pd.DataFrame()

        for feature in tqdm(self.X_train.columns):
            if self.params is None:
                model = cb.CatBoostClassifier(
                    learning_rate = 0.303,
                    depth = 6,
                    l2_leaf_reg = 2.437,
                    random_seed = 42,
                    min_data_in_leaf = 30,
                    one_hot_max_size = 40,
                    colsample_bylevel = 0.079,
                    loss_function = 'MultiClass',
                    task_type = 'CPU',
                    iterations = 1000,
                    use_best_model = True,
                    verbose = 100,
                    thread_count = -1,
                    early_stopping_rounds = 100,
                    eval_metric = 'AUC',
                    class_weights=weits,
                    boosting_type = 'Plain',
                    bootstrap_type = 'MVS'
                )
                path = "no_optuna_one_factor_roc.xlsx"
            else:
                self.params["verbose"] = False
                self.params["iterations"] = 500
                path = "optuna_one_factor_roc.xlsx"
                model = cb.CatBoostClassifier(**self.params)

            if feature in self.cat_features:
                train_pool = cb.Pool(data = self.X_train[[feature]], label = self.y_train, cat_features = [feature])
                val_pool = cb.Pool(data = self.X_val[[feature]], label = self.y_val, cat_features = [feature])
            else:
                train_pool = cb.Pool(data = self.X_train[[feature]], label = self.y_train)
                val_pool = cb.Pool(data = self.X_val[[feature]], label = self.y_val)

            model.fit(
                train_pool,
                eval_set = val_pool
            )

            y_train_pred = model.predict_proba(self.X_train[[feature]])[:, 1]
            y_val_pred = model.predict_proba(self.X_val[[feature]])[:, 1]

            roc_auc_tr = roc_auc_score(self.y_train, y_train_pred)
            roc_auc_val = roc_auc_score(self.y_val, y_val_pred)

            story = story.append(pd.DataFrame({
                'features' : [feature],
                'train' : [roc_auc_tr],
                'valid' : [roc_auc_val]
            }), ignore_index = True)

        plt.figure(figsize=(10, 7))
        plt.bar(range(len(story['features'])), story['train'], align = 'center', label = 'Train')
        plt.bar(range(len(story['features'])), story['valid'], align = 'edge', label = 'Valid')
        plt.xlabel("Features")
        plt.ylabel("ROC-AUC")
        plt.title("One-Factor ROC-AUC")
        plt.xticks(range(len(story['features'])), story['features'], rotation = 45)
        plt.legend()
        plt.tight_layout()
        story.to_excel(path, index = False)

In [47]:
# !pip freeze | grep "numpy\|pandas\|lightgbm\|scikit-learn"

## Загрузка данных

In [3]:
train_df = pd.read_parquet("train_data.pqt")
test_df = pd.read_parquet("test_df.pqt")

заполянем 5 месяцем

In [4]:
test_df['start_cluster'] = test_df['start_cluster'].fillna(method='ffill')

вытягиваем в колбасу по id

In [5]:
grouped_df_first = test_df.groupby('id').first().reset_index()
merged_df = pd.merge(test_df, grouped_df_first, on='id', suffixes=('', '_first'))

grouped_df_second = test_df.groupby('id').nth(1).reset_index()
merged_df = pd.merge(merged_df, grouped_df_second, on='id', suffixes=('', '_second'))

merged_df = merged_df.loc[:, ~merged_df.columns.duplicated()]

In [6]:
test_df = merged_df[merged_df['date'] == 'month_6']

In [7]:
grouped_df_first = train_df.groupby('id').first().reset_index()
merged_df = pd.merge(train_df, grouped_df_first, on='id', suffixes=('', '_first'))

grouped_df_second = train_df.groupby('id').nth(1).reset_index()
merged_df = pd.merge(merged_df, grouped_df_second, on='id', suffixes=('', '_second'))

merged_df = merged_df.loc[:, ~merged_df.columns.duplicated()]

In [1]:
train_df = merged_df[merged_df['date'] == 'month_3']

NameError: name 'merged_df' is not defined

удаляем сильную корреляцию

In [9]:
corr_matrix = train_df.corr().abs()

# Получение верхнего треугольника матрицы корреляции (без диагонали)
upper_triangle = corr_matrix.where(
    np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))

# Нахождение колонок, где корреляция больше 0.9
to_drop = [column for column in upper_triangle.columns if any(upper_triangle[column] > 0.95)]

train_df =train_df.drop(to_drop, axis=1)


In [10]:
test_df =test_df.drop(to_drop, axis=1)

In [11]:
category_columns = ['start_cluster', 'channel_code', 'city', 'city_type', 
                    'index_city_code', 'ogrn_month', 'ogrn_year', 'okved', 'segment']

In [12]:
# counts = train_df['city_type'].value_counts()

# # Построение гистограммы
# plt.bar(counts.index, counts.values)
# plt.xlabel('Уникальные значения')
# plt.ylabel('Частота')
# plt.title('Гистограмма количества встречающихся раз категориальных значений')
# plt.show()

обработка текстов и нан

In [12]:
def clever_one_hot(df, col): #переписать в lable encoding?
    top_4 = df[col].value_counts().index[:4]
    df.loc[~df[col].isin(top_4), col] = 'other'
    one_hot_encoded = pd.get_dummies(df[col], prefix=col)
    df = df.drop(col, axis=1)
    return pd.concat([df, one_hot_encoded], axis=1)

def feature_prossesing(df):
    one_hot_encoded = pd.get_dummies(df['segment'], prefix='seg')
    df = df.drop('segment', axis=1)
    df = pd.concat([df, one_hot_encoded], axis=1)

    df['ogrn_year'] = df['ogrn_year'].fillna('_-1')
    df['ogrn_month'] = df['ogrn_month'].fillna('_-1')
    df['ogrn_year'] = df['ogrn_year'].apply(lambda x: int(x.split('_')[-1]))
    df['ogrn_month'] = df['ogrn_month'].apply(lambda x: int(x.split('_')[-1]))
    df['lasting'] = df['ogrn_year']*12 + df['ogrn_month']
    df['lasting'] = df['lasting'].apply(lambda x: x if x>=0 else -1)

    del df['ogrn_year']
    del df['ogrn_month']

    for i in ['channel_code', 'city_type']:
        df = clever_one_hot(df, i)

    label_encoder = LabelEncoder()
    df['start_cluster'] = label_encoder.fit_transform(df['start_cluster'])
    df['end_cluster'] = label_encoder.transform(df['end_cluster'])
    category_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))

    df = df.drop(df.select_dtypes(include=['object']).columns, axis=1)
    df = df.fillna(-1)

    return df, category_mapping   

In [13]:
df, category_mapping = feature_prossesing(train_df)

In [14]:
def clever_one_hot(df, col, df1): #переписать в lable encoding?
    top_4 = df1[col].value_counts().index[:4]
    df.loc[~df[col].isin(top_4), col] = 'other'
    one_hot_encoded = pd.get_dummies(df[col], prefix=col)
    df = df.drop(col, axis=1)
    return pd.concat([df, one_hot_encoded], axis=1)

def feature_prossesing(df, df_1):
    one_hot_encoded = pd.get_dummies(df['segment'], prefix='seg')
    df = df.drop('segment', axis=1)
    df = pd.concat([df, one_hot_encoded], axis=1)

    df['ogrn_year'] = df['ogrn_year'].fillna('_-1')
    df['ogrn_month'] = df['ogrn_month'].fillna('_-1')
    df['ogrn_year'] = df['ogrn_year'].apply(lambda x: int(x.split('_')[-1]))
    df['ogrn_month'] = df['ogrn_month'].apply(lambda x: int(x.split('_')[-1]))
    df['lasting'] = df['ogrn_year']*12 + df['ogrn_month']
    df['lasting'] = df['lasting'].apply(lambda x: x if x>=0 else -1)

    del df['ogrn_year']
    del df['ogrn_month']

    for i in ['channel_code', 'city_type']:
        df = clever_one_hot(df, i, df_1)

    label_encoder = LabelEncoder()
    df['start_cluster'] = label_encoder.fit_transform(df['start_cluster'])
    #df['end_cluster'] = label_encoder.transform(df['end_cluster'])
    category_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))

    df = df.drop(df.select_dtypes(include=['object']).columns, axis=1)
    df = df.fillna(-1)

    return df, category_mapping   

In [15]:
test, category_mapping = feature_prossesing(test_df, train_df)

In [16]:
category_mapping

{'{other}': 0,
 '{}': 1,
 '{α, β}': 2,
 '{α, γ}': 3,
 '{α, δ}': 4,
 '{α, ε, η}': 5,
 '{α, ε, θ}': 6,
 '{α, ε, ψ}': 7,
 '{α, ε}': 8,
 '{α, η}': 9,
 '{α, θ}': 10,
 '{α, λ}': 11,
 '{α, μ}': 12,
 '{α, ψ}': 13,
 '{α}': 14,
 '{λ}': 15}

веса для рок аука которые идут в бустинг

In [17]:
def weighted_roc_auc(y_true, y_pred, labels, weights_dict):
    unnorm_weights = np.array([weights_dict[label] for label in labels])
    weights = unnorm_weights / unnorm_weights.sum()
    classes_roc_auc = roc_auc_score(y_true, y_pred, labels=labels,
                                    multi_class="ovr", average=None)
    return sum(weights * classes_roc_auc)

In [18]:
cluster_weights = pd.read_excel("cluster_weights.xlsx").set_index("cluster")
weights_dict = cluster_weights["unnorm_weight"].to_dict()

In [19]:
weits = [0]*17

for k, v in category_mapping.items():
    weits[category_mapping[k]] = weights_dict[k]

In [20]:
weits

[3, 1, 3, 3, 3, 1, 1, 3, 2, 2, 1, 3, 2, 3, 2, 2, 0]

треним

In [21]:
y = df['end_cluster']
X = df.drop(['end_cluster'], axis=1)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=42)

In [22]:
boosting = Boosting(X_train, X_val, y_train, y_val, cat_features = [])


Init...
Init Finished!


In [47]:
boosting.train()

Learning rate set to 0.119794
0:	test: 0.8082294	best: 0.8082294 (0)	total: 2.09s	remaining: 34m 47s
100:	test: 0.9312565	best: 0.9332134 (94)	total: 6m 18s	remaining: 56m 9s
200:	test: 0.9329724	best: 0.9338155 (118)	total: 11m 58s	remaining: 47m 34s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.9338154824
bestIteration = 118

Shrink model to first 119 iterations.


сохраняем загружаем

In [48]:
boosting.save_model('model2.pkl')

In [23]:
boosting.load_model('model2.pkl')

получаем ответ на тесте

In [24]:
ans = boosting.model.predict_proba(test)

In [25]:
data = {} #{'id': test.id.to_list()}
for cls, prob in zip(category_mapping.keys(), ans):
    data[cls] = prob

column_mapping = {
    0: '{other}',
    1: '{}',
    2: '{α, β}',
    3: '{α, γ}',
    4: '{α, δ}',
    5: '{α, ε, η}',
    6: '{α, ε, θ}',
    7: '{α, ε, ψ}',
    8: '{α, ε}',
    9: '{α, η}',
    10: '{α, θ}',
    11: '{α, λ}',
    12: '{α, μ}',
    13: '{α, π}',
    14: '{α, ψ}',
    15: '{α}',
    16: '{λ}'
}

output = pd.DataFrame(ans)
output = output.rename(columns=column_mapping)
output = output.assign(id=test['id'].tolist())

sample_submission_df = pd.read_csv("sample_submission.csv")
output[list(sample_submission_df.columns)].to_csv('ans3.csv', index=False)


In [5]:
import pandas as pd
sample_submission_df = pd.read_csv("sample_submission.csv")
list(sample_submission_df.columns)

['id',
 '{other}',
 '{}',
 '{α, β}',
 '{α, γ}',
 '{α, δ}',
 '{α, ε, η}',
 '{α, ε, θ}',
 '{α, ε, ψ}',
 '{α, ε}',
 '{α, η}',
 '{α, θ}',
 '{α, λ}',
 '{α, μ}',
 '{α, π}',
 '{α, ψ}',
 '{α}',
 '{λ}']

In [1]:
import hashlib

def generate_md5(file_path):
    # Open the file in binary mode
    with open(file_path, "rb") as file:
        # Read the content of the file
        content = file.read()
        # Generate the MD5 hash of the content
        md5_hash = hashlib.md5(content).hexdigest()
        return md5_hash

# Call the function to generate the MD5 hash of a video file
# Replace "video.mp4" with the path to your video file
md5_hash = generate_md5(r"C:\Users\alina\Videos\2024-03-12 14-31-10.mkv")

# Print the MD5 hash
print(md5_hash)


e4ead55ff0a49c8ee3cc879f2470f4d9
