# Baseline

In [177]:
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import StratifiedKFold
from sklearn.utils.class_weight import compute_class_weight
import random
from sklearn.model_selection import RepeatedStratifiedKFold


def weighted_roc_auc(y_true, y_pred, labels, weights_dict):
    unnorm_weights = np.array([weights_dict[label] for label in labels])
    weights = unnorm_weights / unnorm_weights.sum()
    classes_roc_auc = roc_auc_score(y_true, y_pred, labels=labels,
                                    multi_class="ovr", average=None)
    return sum(weights * classes_roc_auc)

In [2]:
train_df = pd.read_parquet("data/train_data.pqt")
test_df = pd.read_parquet("data/test_data.pqt")

In [3]:
cat_cols = [
    "channel_code", "city", "city_type",
    "okved", "segment", "start_cluster",
    "index_city_code", "ogrn_month", "ogrn_year",
]

In [4]:
train_df[cat_cols] = train_df[cat_cols].astype("category")
test_df[cat_cols] = test_df[cat_cols].astype("category")

In [5]:
m = train_df.columns[train_df.dtypes == "category"]
train_df[m] = train_df[m].astype("str")

In [6]:
X = train_df.drop(["id", "date", "end_cluster"], axis=1)
X = X.fillna("#N/A")
y = train_df["end_cluster"]

In [7]:
train_df.drop("date", inplace=True, axis=1)

In [8]:
#for let in ["a", "b", "c", "deb_d", "deb_e", "deb_f", "deb_g", "deb_h"]:
#    train_df[f"{let}_ration_1m"] = train_df[f"sum_{let}_oper_1m"] / train_df[f"cnt_{let}_oper_1m"]
#    train_df[f"{let}_ration_3m"] = train_df[f"sum_{let}_oper_3m"] / train_df[f"cnt_{let}_oper_3m"]

In [9]:
from tqdm import tqdm
xs = []
ys = []

for idx, group in tqdm(train_df.groupby("id")):
    answer = group.end_cluster.tolist()[-1]
    group.drop("end_cluster", axis=1, inplace=True)
    
    features = pd.DataFrame(group.stack(dropna=False)).reset_index(drop=True, level=0).transpose()
    features = features.drop("id", axis=1)
    
    column_names=  features.columns.tolist()
    cnt_one = features.shape[1] //  3
    new_column_names = [str(ind//cnt_one) + "_" + column_name for ind, column_name in enumerate(column_names)]
    features.columns = new_column_names
    
    features = features.drop("2_start_cluster", axis=1)
        
    xs.append(features)
    ys.append(answer)

100%|████████████████████████████████████████████████████████████████████████| 200000/200000 [03:17<00:00, 1011.27it/s]


In [10]:
import gc
gc.collect()

0

In [11]:
xs_two_stages = []
ys_two_stages = []

for idx, group in tqdm(train_df.groupby("id")):
    answer = group.end_cluster.tolist()[-1]
    group.drop("end_cluster", axis=1, inplace=True)
    group = group.iloc[:2]
    features = pd.DataFrame(group.stack(dropna=False)).reset_index(drop=True, level=0).transpose()
    features = features.drop("id", axis=1)
    
    column_names = features.columns.tolist()
    cnt_one = features.shape[1] //  2
    new_column_names = [str(ind//cnt_one) + "_" + column_name for ind, column_name in enumerate(column_names)]
    features.columns = new_column_names
    
    features = features.drop("1_start_cluster", axis=1)
        
    xs_two_stages.append(features)
    ys_two_stages.append(answer)
    

100%|████████████████████████████████████████████████████████████████████████| 200000/200000 [03:19<00:00, 1003.27it/s]


## треним модельку на трех эатпах

In [194]:
X = pd.concat(xs)
y = ys.copy()


cat_cols = [
    "channel_code", "city", "city_type",
    "okved", "segment", "start_cluster",
    "index_city_code", "ogrn_month", "ogrn_year",
]
all_cat_cols = []

for i in range(3):
    for j in cat_cols:
        all_cat_cols.append(f"{i}_{j}")
all_cat_cols.remove("2_start_cluster")

In [197]:
cluster_weights = pd.read_excel("data/cluster_weights.xlsx").set_index("cluster")
weights_dict = cluster_weights["unnorm_weight"].to_dict()

In [198]:
y = np.array(y)

In [199]:
balanced_weights = compute_class_weight(class_weight="balanced", classes=np.unique(y), y=y)

In [200]:
import pickle
best_features = pickle.load(open("best_features.pkl", "rb"))

In [201]:
# cur_X = X[best_features]

In [202]:
# all_cat_cols = [i for i in all_cat_cols if i in best_features]

In [203]:
using_weights_dict = {}
csw = compute_class_weight(class_weight="balanced", classes=np.unique(y), y=y)

gamma = 1

for ind, i in enumerate(np.unique(y)):
    using_weights_dict[i] = weights_dict[i] * gamma + (1 - gamma) * csw[ind]

In [204]:
all_features = X.columns.tolist()

In [205]:
n_splits = 5
train_scores = []
val_scores = []
models = []
features_for_k_fold = []
cat_features_for_k_fold = []


# rmskf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
rmskf = RepeatedStratifiedKFold(n_splits=n_splits,  n_repeats=2, random_state=42)

for fold, (train_index, test_index) in enumerate(rmskf.split(X, y)):
    print(f'Fold: {fold}')
    random.shuffle(best_features)
    random.shuffle(all_features)

    COUNT_ONE_FOLD_FEATURES = 75
    cur_features = list(set(best_features[:COUNT_ONE_FOLD_FEATURES] + all_features[:200-COUNT_ONE_FOLD_FEATURES]))  
    
    cur_X = X.loc[:, cur_features].copy()
    
    features_for_k_fold.append(cur_features.copy())
    
    cur_cat_features = [i for i in all_cat_cols if i in features_for_k_fold[-1]]
    cat_features_for_k_fold.append(cur_cat_features.copy())

    X_train, X_val = cur_X.iloc[train_index], cur_X.iloc[test_index]
    y_train, y_val = y[train_index], y[test_index]
    
    train_pool = Pool(X_train, y_train, cat_features=cur_cat_features)
    valid_pool = Pool(X_val, y_val, cat_features=cur_cat_features)
    
    #gamma = fold / n_splits
    #w = np.array(list(weights_dict.values())) * (1 - gamma)
    #w += balanced_weights * (gamma)

    model = CatBoostClassifier(
        random_state=42,
        task_type="GPU",
        early_stopping_rounds = 50,
        class_weights=list(weights_dict.values()))
    
    model.fit(train_pool, eval_set=valid_pool, verbose=False)


    train_score = weighted_roc_auc(y_train, model.predict_proba(train_pool), model.classes_, using_weights_dict)
    valid_score = weighted_roc_auc(y_val, model.predict_proba(valid_pool), model.classes_, using_weights_dict)
    print("train score:", train_score)
    print("valid score:", valid_score)
    
    train_scores.append(train_score)
    val_scores.append(valid_score)
    models.append(model)

Fold: 0
train score: 0.9489340427971609
valid score: 0.9059745571287048
Fold: 1
train score: 0.9473771781498808
valid score: 0.8991948178826965
Fold: 2
train score: 0.9528863365650948
valid score: 0.9038675879329557
Fold: 3
train score: 0.9536108667805825
valid score: 0.8943353713362943
Fold: 4
train score: 0.9534173811975657
valid score: 0.8996862398655922
Fold: 5
train score: 0.9492160976756175
valid score: 0.8965030654444143
Fold: 6
train score: 0.9509609470719557
valid score: 0.9106365602896371
Fold: 7
train score: 0.9492211854257568
valid score: 0.9045872730838098
Fold: 8
train score: 0.9492959277506978
valid score: 0.8971698394983952
Fold: 9
train score: 0.9467150099991715
valid score: 0.8985726885925811


In [98]:
#train_pool = Pool(x_train, y_train, cat_features=all_cat_cols)
#valid_pool = Pool(x_val, y_val, cat_features=all_cat_cols)

#model = CatBoostClassifier(random_state=42,
#                           task_type="GPU",
#                           early_stopping_rounds = 150,
#                           class_weights=list(weights_dict.values()))
#model.fit(train_pool, eval_set=valid_pool)

In [95]:
#weighted_roc_auc(y_val, model.predict_proba(valid_pool), model.classes_, weights_dict)

## треним модельку на двух этапах

In [41]:
X = pd.concat(xs_two_stages)

y = ys_two_stages.copy()
x_train, x_val, y_train, y_val = train_test_split(X, y,
                                                  test_size=0.2,
                                                  random_state=42, 
                                                  stratify=y)


cat_cols = [
    "channel_code", "city", "city_type",
    "okved", "segment", "start_cluster",
    "index_city_code", "ogrn_month", "ogrn_year",
]
all_cat_cols2 = []

for i in range(2):
    for j in cat_cols:
        all_cat_cols2.append(f"{i}_{j}")

all_cat_cols2.remove("1_start_cluster")

In [42]:
train_pool = Pool(x_train, y_train, cat_features=all_cat_cols2)
valid_pool = Pool(x_val, y_val, cat_features=all_cat_cols2)

model2 = CatBoostClassifier(random_state=42,
                           task_type="GPU", 
                           class_weights=list(weights_dict.values()), # [weights_dict[i] for i in model.classes_], 
                           early_stopping_rounds = 50)
model2.fit(train_pool, eval_set=valid_pool)

Learning rate set to 0.175569
0:	learn: 1.3674723	test: 1.3686691	best: 1.3686691 (0)	total: 31ms	remaining: 30.9s
1:	learn: 1.1952516	test: 1.1958936	best: 1.1958936 (1)	total: 62.9ms	remaining: 31.4s
2:	learn: 1.0806240	test: 1.0808162	best: 1.0808162 (2)	total: 94.2ms	remaining: 31.3s
3:	learn: 1.0031998	test: 1.0034104	best: 1.0034104 (3)	total: 126ms	remaining: 31.3s
4:	learn: 0.9421761	test: 0.9427464	best: 0.9427464 (4)	total: 161ms	remaining: 32s
5:	learn: 0.8965048	test: 0.8972166	best: 0.8972166 (5)	total: 191ms	remaining: 31.6s
6:	learn: 0.8606656	test: 0.8616737	best: 0.8616737 (6)	total: 223ms	remaining: 31.6s
7:	learn: 0.8316112	test: 0.8330654	best: 0.8330654 (7)	total: 255ms	remaining: 31.6s
8:	learn: 0.8072425	test: 0.8090235	best: 0.8090235 (8)	total: 290ms	remaining: 31.9s
9:	learn: 0.7884613	test: 0.7909023	best: 0.7909023 (9)	total: 325ms	remaining: 32.2s
10:	learn: 0.7729905	test: 0.7759815	best: 0.7759815 (10)	total: 360ms	remaining: 32.4s
11:	learn: 0.7591956	te

<catboost.core.CatBoostClassifier at 0x19550e4e7d0>

377:	learn: 0.5992447	test: 0.6691476	best: 0.6689762 (367)	total: 13s	remaining: 21.3s
378:	learn: 0.5991101	test: 0.6691555	best: 0.6689762 (367)	total: 13s	remaining: 21.3s
379:	learn: 0.5989897	test: 0.6691517	best: 0.6689762 (367)	total: 13s	remaining: 21.2s
380:	learn: 0.5987595	test: 0.6690241	best: 0.6689762 (367)	total: 13.1s	remaining: 21.2s
381:	learn: 0.5983691	test: 0.6690089	best: 0.6689762 (367)	total: 13.1s	remaining: 21.2s
382:	learn: 0.5982209	test: 0.6689830	best: 0.6689762 (367)	total: 13.1s	remaining: 21.2s
383:	learn: 0.5981613	test: 0.6689691	best: 0.6689691 (383)	total: 13.2s	remaining: 21.1s
384:	learn: 0.5979200	test: 0.6689643	best: 0.6689643 (384)	total: 13.2s	remaining: 21.1s
385:	learn: 0.5976726	test: 0.6690157	best: 0.6689643 (384)	total: 13.2s	remaining: 21s
386:	learn: 0.5973080	test: 0.6689474	best: 0.6689474 (386)	total: 13.3s	remaining: 21s
387:	learn: 0.5971283	test: 0.6689120	best: 0.6689120 (387)	total: 13.3s	remaining: 21s
388:	learn: 0.5968511	

<catboost.core.CatBoostClassifier at 0x7f8a0bfb1b20>

## Прогноз на тестовой выборке

In [43]:
test_df = pd.read_parquet("data/test_data.pqt")
test_df = test_df.fillna("#N/A")

In [44]:
# test_df.pivot(index="id", columns="date", values="start_cluster").head(3)

In [45]:
# test_df["start_cluster"] = train_df["start_cluster"].mode()[0]
# test_df["start_cluster"] = test_df["start_cluster"].astype("category")

In [46]:
sample_submission_df = pd.read_csv("data/sample_submission.csv")

In [47]:
sample_submission_df.head()

Unnamed: 0,id,{other},{},"{α, β}","{α, γ}","{α, δ}","{α, ε, η}","{α, ε, θ}","{α, ε, ψ}","{α, ε}","{α, η}","{α, θ}","{α, λ}","{α, μ}","{α, π}","{α, ψ}",{α},{λ}
0,200000,0.2,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05
1,200001,0.2,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05
2,200002,0.2,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05
3,200003,0.2,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05
4,200004,0.2,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05


In [48]:
from tqdm import tqdm

dct_month = {"month_4": "month_1", "month_5":"month_2", "month_6":"month_3"}

xs_test = []
test_answers = []

idx_for_predict = []
data_for_predict = []

idx_for_predict_2 = []
data_for_predict_2 = []


cnt_months = []
for idx, group in tqdm(test_df.groupby("id")):
    #     group["date"] = group.date.map(lambda x: dct_month[x])    
    group.drop("date", axis=1, inplace=True)
    features = pd.DataFrame(group.stack(dropna=False)).reset_index(drop=True, level=0).transpose()
    
    #     features = features.drop("end_cluster", axis=1)

    cnt_months.append(features.shape[1])
    features = features.drop("id", axis=1)

    
    if features.shape[1] == 180:
        column_names = features.columns.tolist()
        cnt_one = features.shape[1] // 2
        new_column_names = [str(ind//cnt_one) + "_" + column_name for ind, column_name in enumerate(column_names)]
        features.columns = new_column_names
        
        features = features.drop("1_start_cluster", axis=1)

        data_for_predict_2.append(features)
        idx_for_predict_2.append(idx)
        continue

        
    column_names = features.columns.tolist()
    cnt_one = features.shape[1] //  3
    new_column_names = [str(ind//cnt_one) + "_" + column_name for ind, column_name in enumerate(column_names)]
    features.columns = new_column_names
    
    features = features.drop("2_start_cluster", axis=1)
    
    data_for_predict.append(features)
    idx_for_predict.append(idx)
    
    #     test_pool = Pool(features, cat_features=all_cat_cols)
    #     preds = model.predict_proba(features)
    #     test_answers.append(list(preds))

100%|█████████████████████████████████████████████████████████████████████████| 100000/100000 [02:50<00:00, 585.06it/s]


In [116]:
df_predict = pd.concat(data_for_predict)
#df_predict.drop(drop_features, axis=1, inplace=True)

In [186]:
# df_predict = df_predict[best_features]

In [217]:
pools = []

for features_for_pool, cat_features_for_pool in tqdm(zip(features_for_k_fold, cat_features_for_k_fold)):
    cur_df_predict = df_predict[features_for_pool]
    test_pool = Pool(cur_df_predict, cat_features=cat_features_for_pool)
    pools.append(test_pool)

10it [00:14,  1.41s/it]


In [218]:
pools = [pools[i] for i in [0, 2, 4, 5, 6, 7]]
models = [models[i] for i in [0, 2, 4, 5, 6, 7]]

In [219]:
# test_pool = Pool(df_predict, cat_features=all_cat_cols)
preds = np.stack([model.predict_proba(cur_pool) for model, cur_pool in zip(models, pools)]).mean(axis=0)
result_preds = pd.DataFrame(preds)
result_preds.columns = model.classes_
result_preds["id"] = idx_for_predict

In [220]:
df_predict2 = pd.concat(data_for_predict_2)
test_pool2 = Pool(df_predict2, cat_features=all_cat_cols2)
preds2 = model2.predict_proba(test_pool2)
result_preds2 = pd.DataFrame(preds2)
result_preds2.columns = model2.classes_
result_preds2["id"] = idx_for_predict_2

In [221]:
full_answer = pd.concat([result_preds, result_preds2]).sort_values("id")

In [222]:
full_answer.to_csv("da_zdravstvyet_sankt_piterburg_i_eto_gorod_nash.csv", index=False)

In [None]:
last_sub = pd.read_csv("baseline_submission.csv")

In [None]:
all_idxs = set(last_sub.id)
have_idxs = set(result_preds.id)
need_idxs = all_idxs - have_idxs

In [None]:
extra_df = last_sub[last_sub.id.isin(need_idxs)]

In [None]:
answer = pd.concat([result_preds, extra_df], axis=0).sort_values("id")

In [None]:
answer.to_csv("new_idea_sub.csv", index=False)