In [1]:
import warnings
warnings.simplefilter('ignore')

import gc
import re

import numpy as np
import pandas as pd
# Use the fully-qualified option keys. Short keys are resolved by pattern
# matching against every registered option, so they raise an OptionError as
# soon as more than one option name contains the pattern.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_colwidth', 200)
from tqdm.notebook import tqdm

import nltk
from nltk.tokenize import word_tokenize

from gensim.models.word2vec import Word2Vec

from sklearn.utils.class_weight import compute_class_weight
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import KFold, GroupKFold, StratifiedKFold

from catboost import CatBoostClassifier

# loading data

In [2]:
# Read both SEL log dumps, stack them, and order the rows per server by time.
sel_data2 = pd.read_csv('data/preliminary_sel_log_dataset_a.csv')
sel_data = pd.concat([pd.read_csv('data/preliminary_sel_log_dataset.csv'), sel_data2])
sel_data = (
    sel_data
    .assign(time=lambda frame: pd.to_datetime(frame['time']))
    .sort_values(by=['sn', 'time'])
    .reset_index(drop=True)
)

print(sel_data.shape)
sel_data.head(10)

(493527, 4)


Unnamed: 0,sn,time,msg,server_model
0,000d33b21436,2020-09-02 11:38:40,System Boot Initiated BIOS_Boot_Up | Initiated by warm reset | Asserted,SM40
1,000d33b21436,2020-09-02 15:46:23,System Boot Initiated BIOS_Boot_Up | Initiated by power up | Asserted,SM40
2,005c5a9218ba,2020-06-28 18:26:14,Memory Memory_Status | Correctable ECC | Asserted,SM99
3,005c5a9218ba,2020-06-28 18:26:15,Memory Memory_Status | Correctable ECC | Asserted,SM99
4,005c5a9218ba,2020-06-28 18:26:20,Memory Memory_Status | Correctable ECC | Asserted,SM99
5,005c5a9218ba,2020-06-28 18:26:25,Memory Memory_Status | Correctable ECC | Asserted,SM99
6,005c5a9218ba,2020-06-28 18:26:26,Memory Memory_Status | Correctable ECC | Asserted,SM99
7,005c5a9218ba,2020-06-28 18:26:30,Memory Memory_Status | Correctable ECC | Asserted,SM99
8,005c5a9218ba,2020-06-28 18:38:49,System ACPI Power State #0x7d | S4/S5: soft-off | Asserted,SM99
9,005c5a9218ba,2020-06-28 18:40:26,System ACPI Power State #0x7d | S0/G0: working | Asserted,SM99


In [3]:
# sel_data['msg'] = sel_data['msg'].astype(str).apply(lambda x: x.replace('_', ' '))

In [4]:
# Read both label files, stack them, and order the rows per server by fault time.
train_data2 = pd.read_csv('data/preliminary_train_label_dataset_s.csv')
train_data = pd.concat([pd.read_csv('data/preliminary_train_label_dataset.csv'), train_data2])
train_data = (
    train_data
    .assign(fault_time=lambda frame: pd.to_datetime(frame['fault_time']))
    .sort_values(by=['sn', 'fault_time'])
    .reset_index(drop=True)
)

print(train_data.shape)
train_data.head(10)

(16669, 3)


Unnamed: 0,sn,fault_time,label
0,SERVER_10001,2020-05-01 10:04:00,1
1,SERVER_10003,2020-03-28 09:48:00,2
2,SERVER_10008,2020-02-25 16:12:00,1
3,SERVER_10008,2020-03-11 18:04:00,2
4,SERVER_10009,2020-05-08 16:37:00,3
5,SERVER_10012,2020-07-13 03:32:00,3
6,SERVER_10017,2020-06-11 15:52:00,3
7,SERVER_10017,2020-06-11 15:52:00,3
8,SERVER_10018,2020-05-31 03:33:00,3
9,SERVER_10019,2020-01-29 22:38:00,3


In [5]:
# Submission rows: (sn, fault_time) pairs, with fault_time parsed to datetimes.
test_data = pd.read_csv('data/preliminary_submit_dataset_a.csv').assign(
    fault_time=lambda frame: pd.to_datetime(frame['fault_time'])
)

print(test_data.shape)
test_data.head(10)

(3011, 2)


Unnamed: 0,sn,fault_time
0,000d33b21436,2020-09-02 16:42:54
1,005c5a9218ba,2020-06-28 19:05:16
2,0079283bde6e,2020-04-26 21:32:44
3,007bdf23b62f,2020-06-16 18:40:39
4,00a577a8e54f,2020-04-07 07:16:55
5,00a85fb232bf,2020-05-27 03:24:09
6,00ae2639c426,2019-12-30 05:24:54
7,00b9c343ace4,2020-11-13 01:29:55
8,00bdcf2207d5,2020-01-04 13:39:40
9,00c76d7884f5,2020-07-16 21:22:54


# w2v model

In [6]:
# One "document" per server: all of its messages joined by newlines, lower-cased.
tmp = sel_data.groupby(['sn'], as_index=False)['msg'].agg(list)
tmp['text'] = tmp['msg'].apply(lambda msgs: "\n".join(msgs).lower())
sentences_list = tmp['text'].values.tolist()

# Whitespace-tokenise every document into the list-of-tokens form Word2Vec consumes.
sentences = [text.split() for text in sentences_list]

In [7]:
%%time

# Train word embeddings on the per-server token lists built above.
w2v_model = Word2Vec(
    sentences,
    size=32,       # embedding width; make_dataset emits 32 msg_w2v_* columns
    window=3,
    min_count=5,
    sg=0,
    hs=1,
    seed=2022,
)

CPU times: user 18.1 s, sys: 181 ms, total: 18.3 s
Wall time: 7.34 s


In [8]:
def get_w2v_mean(sentences):
    """Average the word2vec vectors of the whitespace tokens in ``sentences``.

    Parameters
    ----------
    sentences : str
        Text to embed; it is split on whitespace and tokens that are not in
        ``w2v_model.wv`` are ignored.

    Returns
    -------
    list
        A one-element list. The element is the mean vector of the known
        tokens, or a list of zeros of the embedding width when no token is
        known (including empty input).
    """
    keyed_vectors = w2v_model.wv
    vec = [keyed_vectors[w] for w in sentences.split() if w in keyed_vectors]
    if vec:
        return [np.mean(vec, axis=0)]
    # The fallback previously read `model.vector_size`; `model` is not defined
    # at this point, so text without any in-vocabulary token raised NameError.
    return [[0] * keyed_vectors.vector_size]

# tf-idf model

In [9]:
%%time

# Fit the TF-IDF vocabulary (1- to 3-grams) on the per-server documents.
X = tmp['text'].values.tolist()
tfv = TfidfVectorizer(max_features=50000, min_df=5, ngram_range=(1, 3)).fit(X)
tfv

CPU times: user 4.76 s, sys: 25.7 ms, total: 4.78 s
Wall time: 4.78 s


TfidfVectorizer(max_features=50000, min_df=5, ngram_range=(1, 3))

In [10]:
%%time

X_tfidf = tfv.transform(X)
# Seed the decomposition (same seed as the other models in this notebook).
# Without random_state the fitted components, and therefore every msg_tfv_*
# feature derived from them, can differ from one run to the next.
svd = TruncatedSVD(n_components=16, random_state=2022)
svd.fit(X_tfidf)

CPU times: user 9.14 s, sys: 6.01 s, total: 15.1 s
Wall time: 5.35 s


TruncatedSVD(n_components=16)

In [11]:
def get_tfidf_svd(sentences, n_components=16):
    """Return the mean SVD-reduced TF-IDF vector over ``sentences``.

    Each string is vectorised with the fitted ``tfv``, projected with the
    fitted ``svd``, and the projected rows are averaged column-wise.

    ``n_components`` is accepted but not used here; the output width is
    whatever the fitted ``svd`` produces.
    """
    reduced = svd.transform(tfv.transform(sentences))
    return np.mean(reduced, axis=0)

# other features

In [12]:
# Whole seconds since the Unix epoch. Dividing the time delta by one second is
# independent of the column's datetime64 unit; casting the raw values to int64
# and dividing by 10**9 is only correct when that unit is nanoseconds.
sel_data['time_ts'] = (sel_data['time'] - pd.Timestamp('1970-01-01')) // pd.Timedelta(seconds=1)
train_data['fault_time_ts'] = (train_data['fault_time'] - pd.Timestamp('1970-01-01')) // pd.Timedelta(seconds=1)

In [13]:
def safe_split(strs, n, sep='|'):
    """Return the ``n``-th piece of ``strs`` split on ``sep``.

    Yields an empty string when the split produces ``n`` pieces or fewer.
    Pieces are returned as-is (surrounding whitespace is not stripped).
    """
    pieces = strs.split(sep)
    return pieces[n] if n < len(pieces) else ''

# First three '|'-separated fields of each message, one column per field.
for _part in range(3):
    sel_data[f'msg_split_{_part}'] = sel_data['msg'].apply(lambda x, _n=_part: safe_split(x, _n))

# The leading whitespace-delimited token of the message is used as its category.
sel_data['category'] = sel_data['msg'].apply(lambda text: text.split(None, 1)[0])

In [14]:
# Category token -> integer code. The code is the position in this list, so the
# order must not change.
cate_map = {
    name: code
    for code, name in enumerate([
        'Memory',
        'System',
        'Processor',
        'Temperature',
        'Drive',
        'Power',
        'Unknown',
        'Microcontroller',
        'OS',
        'Watchdog2',
        'OEM',
        'Button',
        'Slot/Connector',
        'Microcontroller/Coprocessor',
        'Management',
        'Event',
        'Watchdog',
        'Slot',
        'Fan',
        'Critical',
        'device',
        'LAN',
        'Version',
        'Add-in',
        'Terminator',
        'Chassis',
        'reserved',
        'Physical',
        'Session',
        'Reserved',
        'Cable/Interconnect',
        'Cable',
        'Chip',
        'Battery',
    ])
}

# make dataset

In [15]:
# Preview the label frame, which now carries the fault_time_ts column.
train_data.head()

Unnamed: 0,sn,fault_time,label,fault_time_ts
0,SERVER_10001,2020-05-01 10:04:00,1,1588327440
1,SERVER_10003,2020-03-28 09:48:00,2,1585388880
2,SERVER_10008,2020-02-25 16:12:00,1,1582647120
3,SERVER_10008,2020-03-11 18:04:00,2,1583949840
4,SERVER_10009,2020-05-08 16:37:00,3,1588955820


In [16]:
# Preview the log frame with the derived time_ts, msg_split_* and category columns.
sel_data.head()

Unnamed: 0,sn,time,msg,server_model,time_ts,msg_split_0,msg_split_1,msg_split_2,category
0,000d33b21436,2020-09-02 11:38:40,System Boot Initiated BIOS_Boot_Up | Initiated by warm reset | Asserted,SM40,1599046720,System Boot Initiated BIOS_Boot_Up,Initiated by warm reset,Asserted,System
1,000d33b21436,2020-09-02 15:46:23,System Boot Initiated BIOS_Boot_Up | Initiated by power up | Asserted,SM40,1599061583,System Boot Initiated BIOS_Boot_Up,Initiated by power up,Asserted,System
2,005c5a9218ba,2020-06-28 18:26:14,Memory Memory_Status | Correctable ECC | Asserted,SM99,1593368774,Memory Memory_Status,Correctable ECC,Asserted,Memory
3,005c5a9218ba,2020-06-28 18:26:15,Memory Memory_Status | Correctable ECC | Asserted,SM99,1593368775,Memory Memory_Status,Correctable ECC,Asserted,Memory
4,005c5a9218ba,2020-06-28 18:26:20,Memory Memory_Status | Correctable ECC | Asserted,SM99,1593368780,Memory Memory_Status,Correctable ECC,Asserted,Memory


In [17]:
def make_dataset(dataset, data_type='train', last_n=40):
    """Build one feature record per row of ``dataset``.

    For every ``(sn, fault_time_ts)`` row, the logs of that server in the
    module-level ``sel_data`` with ``time_ts <= fault_time_ts`` are selected,
    the most recent ``last_n`` of them are kept, and count / uniqueness /
    timing / text-embedding features are computed from that window.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Needs the columns ``sn``, ``fault_time`` and ``fault_time_ts``, plus
        ``label`` when ``data_type == 'train'``.
    data_type : str
        ``'train'`` copies the row's ``label`` into the record; any other
        value omits it.
    last_n : int
        Number of most recent log lines to keep per row (default 40).

    Returns
    -------
    list of dict
        One record per input row, in input order, ready for ``pd.DataFrame``.

    Notes
    -----
    * ``last_category`` is the code of the *most frequent* category in the
      window (first entry of ``value_counts()``), not of the latest log line.
      Categories missing from ``cate_map`` get the code ``len(cate_map)``.
    * With exactly one log line there is no gap between logs, so the
      avg/max/min gap features are NaN and the std is 0.
    * Rows with no matching logs get all-zero features.
    """
    # Split the log table by server once; filtering the whole of sel_data for
    # every row of `dataset` repeats the same full scan len(dataset) times.
    wanted_sns = set(dataset['sn'])
    logs_by_sn = {
        key: grp
        for key, grp in sel_data[sel_data['sn'].isin(wanted_sns)].groupby('sn', sort=False)
    }
    no_logs = sel_data.iloc[0:0]

    # Widths of the zero vectors used when a row has no logs at all.
    w2v_dim = w2v_model.wv.vector_size
    tfv_dim = svd.n_components

    ret = list()

    for idx, row in tqdm(dataset.iterrows(), total=len(dataset)):
        sn = row['sn']
        fault_time = row['fault_time']
        ts = row['fault_time_ts']

        # Logs of this server up to the fault time, oldest first, last `last_n` only.
        df = logs_by_sn.get(sn, no_logs)
        df = df[df['time_ts'] <= ts].sort_values(by='time_ts').tail(last_n)

        logs_count = len(df)

        if logs_count > 0:
            msg_nunique = df['msg'].nunique()
            msg_category_nunique = df['category'].nunique()
            msg_split_0_nunique = df['msg_split_0'].nunique()
            msg_split_1_nunique = df['msg_split_1'].nunique()
            msg_split_2_nunique = df['msg_split_2'].nunique()
            top_category = df['category'].value_counts().index[0]
            last_category = cate_map.get(top_category, len(cate_map))

            ts_values = df['time_ts'].values
            seconds_span = ts_values[-1] - ts_values[0]

            # Gaps between consecutive log lines, in seconds.
            gaps = np.diff(ts_values).astype(float)
            if len(gaps) > 0:
                log_time_diffs_avg = np.mean(gaps)
                log_time_diffs_max = np.max(gaps)
                log_time_diffs_min = np.min(gaps)
                log_time_diffs_std = np.std(gaps)
            else:
                # Single log line: no gap exists, so the stats are left missing.
                log_time_diffs_avg = log_time_diffs_max = log_time_diffs_min = np.nan
                log_time_diffs_std = 0

            messages = df['msg'].values.tolist()
            all_msg = "\n".join(messages).lower()
            w2v_emb = get_w2v_mean(all_msg)[0]
            tfv_emb = get_tfidf_svd([m.lower() for m in messages])

        else:
            msg_nunique = 0
            msg_category_nunique = 0
            msg_split_0_nunique = 0
            msg_split_1_nunique = 0
            msg_split_2_nunique = 0
            last_category = 0
            seconds_span = 0
            log_time_diffs_avg = 0
            log_time_diffs_max = 0
            log_time_diffs_min = 0
            log_time_diffs_std = 0
            w2v_emb = [0] * w2v_dim
            tfv_emb = [0] * tfv_dim

        # format dataset
        data = {
            'sn': sn,
            'fault_time': fault_time,
            'logs_count': logs_count,
            'msg_nunique': msg_nunique,
            'msg_category_nunique': msg_category_nunique,
            'msg_split_0_nunique': msg_split_0_nunique,
            'msg_split_1_nunique': msg_split_1_nunique,
            'msg_split_2_nunique': msg_split_2_nunique,
            'last_category': last_category,
            'seconds_span': seconds_span,
            'log_time_diffs_avg': log_time_diffs_avg,
            'log_time_diffs_max': log_time_diffs_max,
            'log_time_diffs_min': log_time_diffs_min,
            'log_time_diffs_std': log_time_diffs_std,
        }

        for i, value in enumerate(w2v_emb):
            data[f'msg_w2v_{i}'] = value
        for i, value in enumerate(tfv_emb):
            data[f'msg_tfv_{i}'] = value

        if data_type == 'train':
            data['label'] = row['label']

        ret.append(data)

    return ret

In [18]:
# Build the per-fault feature records for the labelled rows and tabulate them.
train = make_dataset(train_data, data_type='train')
df_train = pd.DataFrame(train)

print(df_train.shape)
df_train.head()

0it [00:00, ?it/s]

(16669, 63)


Unnamed: 0,sn,fault_time,logs_count,msg_nunique,msg_category_nunique,msg_split_0_nunique,msg_split_1_nunique,msg_split_2_nunique,last_category,seconds_span,log_time_diffs_avg,log_time_diffs_max,log_time_diffs_min,log_time_diffs_std,msg_w2v_0,msg_w2v_1,msg_w2v_2,msg_w2v_3,msg_w2v_4,msg_w2v_5,msg_w2v_6,msg_w2v_7,msg_w2v_8,msg_w2v_9,msg_w2v_10,msg_w2v_11,msg_w2v_12,msg_w2v_13,msg_w2v_14,msg_w2v_15,msg_w2v_16,msg_w2v_17,msg_w2v_18,msg_w2v_19,msg_w2v_20,msg_w2v_21,msg_w2v_22,msg_w2v_23,msg_w2v_24,msg_w2v_25,msg_w2v_26,msg_w2v_27,msg_w2v_28,msg_w2v_29,msg_w2v_30,msg_w2v_31,msg_tfv_0,msg_tfv_1,msg_tfv_2,msg_tfv_3,msg_tfv_4,msg_tfv_5,msg_tfv_6,msg_tfv_7,msg_tfv_8,msg_tfv_9,msg_tfv_10,msg_tfv_11,msg_tfv_12,msg_tfv_13,msg_tfv_14,msg_tfv_15,label
0,SERVER_10001,2020-05-01 10:04:00,9,9,3,5,5,2,2,660,82.5,332.0,0.0,131.264047,-0.217888,1.208633,0.296688,-0.115229,0.84609,0.345754,0.448314,-0.306502,-0.967685,0.745146,-0.446742,-0.336726,-0.008031,0.041893,-0.174827,-0.58011,1.273012,0.033927,-0.149504,0.21758,-0.505119,0.401726,-0.777705,-0.289106,-0.80365,0.072503,0.476702,0.253577,0.265927,-0.458841,0.368485,-0.19769,0.151897,-0.159091,-0.045852,-0.137117,0.037583,0.017973,-0.026245,-0.061376,0.196765,-0.066078,0.158043,0.151925,0.005211,-0.047311,0.026295,0.000554,1
1,SERVER_10003,2020-03-28 09:48:00,40,1,1,1,1,1,0,57,1.461538,2.0,1.0,0.498519,-1.034287,0.749801,1.368028,-0.536806,-0.623691,0.082483,-0.31975,-0.565647,0.469923,1.197061,-0.41819,-0.411176,-0.592987,0.084794,0.395256,1.052139,0.979792,0.413143,-0.453026,1.233265,-0.092708,-0.23216,0.003912,-0.096487,-0.903889,0.105122,1.662945,0.001492,-0.579058,-1.299976,-0.143333,-0.637295,0.214217,0.163643,0.088812,-0.070356,0.039149,0.040125,-0.02225,-0.118726,-0.116474,-0.099837,-0.033302,-0.022175,0.025543,-0.096495,0.051933,0.109686,2
2,SERVER_10008,2020-02-25 16:12:00,5,3,2,2,3,1,2,38,9.5,33.0,0.0,13.720423,-0.016514,0.670285,0.409925,-0.762,0.733827,0.010211,-0.028148,-0.468673,0.32882,1.567746,-0.257818,0.116204,-0.717844,0.262964,0.337419,0.361013,1.478814,0.128773,-0.862744,2.005613,-1.026987,0.169311,-0.113072,-0.088615,-0.603793,0.128889,0.885037,-0.31499,0.368098,-0.563105,0.754832,-0.719132,0.133858,-0.04579,-0.007329,-0.141986,0.267737,0.133624,-0.034839,-0.001326,0.053113,0.078324,0.029695,-0.057792,-0.021669,-0.016241,-0.008756,-0.116243,1
3,SERVER_10008,2020-03-11 18:04:00,9,4,3,3,4,1,2,1299319,162414.875,1245629.0,0.0,409792.273236,-0.13224,0.729082,0.417491,-0.513734,0.804473,0.100944,0.027263,-0.38227,0.23883,1.4428,-0.262077,0.040022,-0.605642,0.095073,0.167375,-0.058379,1.44148,-0.101074,-0.521423,1.743551,-1.076107,-0.029283,-0.369161,0.153274,-0.911411,0.172905,0.949135,-0.368793,0.322986,-0.704995,0.878306,-0.650898,0.161708,-0.08611,-0.016968,-0.117727,0.249139,0.10745,0.008809,6.2e-05,0.062044,0.056138,0.023123,-0.041846,-0.018372,0.012661,-0.018019,-0.064874,2
4,SERVER_10009,2020-05-08 16:37:00,4,4,1,2,1,2,4,21,7.0,21.0,0.0,9.899495,0.588096,-0.892219,0.037189,-0.259585,-0.332723,-0.754736,-0.248926,-2.155067,0.164876,0.188227,-1.126425,-0.445908,0.239997,-0.756503,-1.060651,0.413973,-0.300656,-0.184463,0.117836,0.563673,-1.046141,1.066021,-0.791089,-1.751097,0.059783,0.368844,0.577923,-0.260079,-0.188612,-0.29341,-0.394862,-1.252688,0.011797,-0.006899,-0.000904,-0.01032,-0.005548,0.011192,-0.003254,-0.023901,-0.0119,0.035432,-0.011843,0.019424,0.025515,0.209779,0.406583,-0.035321,3


In [19]:
# Keep only training rows that had at least one log line before the fault.
has_logs = df_train['logs_count'].gt(0)
df_train = df_train.loc[has_logs].copy()
df_train.shape

(16571, 63)

In [20]:
# Whole seconds since the Unix epoch. Dividing the time delta by one second is
# independent of the column's datetime64 unit; casting the raw values to int64
# and dividing by 10**9 is only correct when that unit is nanoseconds.
test_data['fault_time_ts'] = (test_data['fault_time'] - pd.Timestamp('1970-01-01')) // pd.Timedelta(seconds=1)

# Same feature pipeline as for training, without the label column.
test = make_dataset(test_data, data_type='test')

df_test = pd.DataFrame(test)
print(df_test.shape)
df_test.head()

0it [00:00, ?it/s]

(3011, 62)


Unnamed: 0,sn,fault_time,logs_count,msg_nunique,msg_category_nunique,msg_split_0_nunique,msg_split_1_nunique,msg_split_2_nunique,last_category,seconds_span,log_time_diffs_avg,log_time_diffs_max,log_time_diffs_min,log_time_diffs_std,msg_w2v_0,msg_w2v_1,msg_w2v_2,msg_w2v_3,msg_w2v_4,msg_w2v_5,msg_w2v_6,msg_w2v_7,msg_w2v_8,msg_w2v_9,msg_w2v_10,msg_w2v_11,msg_w2v_12,msg_w2v_13,msg_w2v_14,msg_w2v_15,msg_w2v_16,msg_w2v_17,msg_w2v_18,msg_w2v_19,msg_w2v_20,msg_w2v_21,msg_w2v_22,msg_w2v_23,msg_w2v_24,msg_w2v_25,msg_w2v_26,msg_w2v_27,msg_w2v_28,msg_w2v_29,msg_w2v_30,msg_w2v_31,msg_tfv_0,msg_tfv_1,msg_tfv_2,msg_tfv_3,msg_tfv_4,msg_tfv_5,msg_tfv_6,msg_tfv_7,msg_tfv_8,msg_tfv_9,msg_tfv_10,msg_tfv_11,msg_tfv_12,msg_tfv_13,msg_tfv_14,msg_tfv_15
0,000d33b21436,2020-09-02 16:42:54,2,2,1,1,2,1,1,14863,14863.0,14863.0,14863.0,0.0,-0.753873,1.057238,0.477809,0.793333,1.108565,0.702695,0.263291,-0.057415,-0.05404,0.999615,-0.133941,-0.325981,-0.206978,-0.666128,-0.493825,-1.899342,1.274713,-1.129249,1.121789,0.487662,-1.351117,-1.047395,-1.838649,1.314916,-2.43545,0.423018,1.206109,-0.777733,0.231042,-1.548762,1.323158,-0.291478,0.365154,-0.356821,-0.03016,0.148515,0.194264,-0.112578,0.220841,-0.002503,0.120392,-0.108643,-0.130593,0.010103,0.021307,0.292689,-0.111781,0.379372
1,005c5a9218ba,2020-06-28 19:05:16,10,4,2,3,4,1,0,867,96.333333,739.0,1.0,229.054579,-0.571692,1.047588,0.587682,-0.1076,0.064156,0.435103,0.090147,-0.238696,0.141751,1.123357,-0.529058,0.032353,-0.27471,-0.091024,0.221388,-0.329722,0.926039,-0.377926,0.033083,0.915815,-0.626002,-0.61102,-0.925958,0.27036,-1.092039,0.269048,1.482984,-0.244683,-0.19484,-1.62639,0.616099,-0.438819,0.237742,0.013483,0.062172,-0.031701,0.038269,-0.009773,-0.012846,-0.148332,-0.104783,-0.235336,-0.089584,0.165009,-0.355942,0.191743,-0.084205,-0.042085
2,0079283bde6e,2020-04-26 21:32:44,1,1,1,1,1,1,5,0,,,,0.0,0.792417,0.201313,2.012014,-0.315816,0.971143,0.120178,0.531646,-0.878379,-0.959093,1.054533,-1.644524,-0.334169,-0.007647,-0.60646,-0.359124,-0.745971,1.100567,-1.17399,0.250237,0.827774,-0.489288,0.214281,-0.213678,-0.002126,0.267316,0.461106,1.508034,0.003735,-0.125584,-0.911711,0.460587,0.755296,0.052623,-0.044621,-0.018091,-0.061272,-0.093788,0.022315,0.004164,-0.140509,-0.038805,0.227,-0.077624,0.083432,0.002896,0.014371,-0.079419,0.040265
3,007bdf23b62f,2020-06-16 18:40:39,19,5,3,4,5,1,0,2477,137.611111,760.0,0.0,232.859552,-0.559256,0.860653,0.670222,-0.420873,-0.226519,0.577539,0.13489,-0.446906,0.224866,1.337413,-0.452966,-0.095188,-0.500265,0.071089,0.228479,-0.082805,0.769691,-0.347532,0.120344,0.876847,-0.34207,-0.445987,-0.72218,0.265345,-1.263551,0.395806,1.826025,-0.354625,-0.422129,-1.514747,0.179367,-0.473458,0.479957,0.359418,-0.236787,0.06404,-0.070917,-0.0242,-0.016113,0.031406,0.088692,-0.00685,0.021154,-0.017733,-0.002959,0.023708,-0.014601,-0.007373
4,00a577a8e54f,2020-04-07 07:16:55,6,6,3,5,5,1,8,563,112.6,369.0,0.0,134.117262,-0.140184,0.108272,0.456683,-0.129057,0.099876,-0.03686,-0.47906,-0.088692,0.261125,1.217673,0.59571,-0.088336,-0.503732,-0.190407,-0.650788,-0.261678,0.927499,-0.985841,-0.09499,0.34635,-1.282823,-0.745438,-0.261161,0.363802,-0.872187,0.634037,1.489976,-0.407625,0.098704,-0.865069,-0.579159,0.139504,0.106396,-0.035529,0.036857,0.088155,0.053849,-0.097694,-0.078716,-0.070665,-0.138531,0.062157,0.22191,-0.055554,-0.006756,0.04075,-0.023474,-0.002602


In [21]:
# How many submission rows had no log lines at all before the fault time?
df_test.loc[df_test['logs_count'].eq(0)].shape

(0, 62)

# catboost model

In [22]:
# 'balanced' weights per label value, keyed by label for CatBoost's class_weights.
classes = np.unique(df_train['label'])
weights = compute_class_weight(class_weight='balanced', classes=classes, y=df_train['label'])
class_weights = {cls: weight for cls, weight in zip(classes, weights)}

class_weights

{0: 2.8086440677966102,
 1: 1.2249408633944412,
 2: 0.4468503937007874,
 3: 1.6957634056487925}

In [23]:
TARGET = 'label'
FOLDS = 10
NUM_CLASSES = df_train[TARGET].nunique()
# Every column except the row identifiers and the target is a model feature.
use_features = [col for col in df_train.columns if col not in ('sn', 'fault_time', TARGET)]

def run_ctb(df_train, df_test, use_features, task_type='GPU'):
    """Train one CatBoost classifier per GroupKFold split and average test predictions.

    Folds are grouped by ``sn``, so all rows of one server fall into the same
    fold. Each fold's validation part is used both as the early-stopping
    ``eval_set`` and to fill the out-of-fold predictions.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training frame; must contain ``use_features``, ``TARGET`` and ``sn``.
    df_test : pandas.DataFrame
        Frame to predict; must contain ``use_features``.
    use_features : list of str
        Feature column names.
    task_type : str
        Forwarded to CatBoost's ``task_type``. Defaults to ``'GPU'`` (the value
        that used to be hard-coded); pass ``'CPU'`` on machines without a GPU.

    Returns
    -------
    (y_pred, oof_pred) : tuple of numpy.ndarray
        ``y_pred`` has shape ``(len(df_test), NUM_CLASSES)`` and holds the mean
        of the per-fold test probabilities; ``oof_pred`` has shape
        ``(len(df_train), NUM_CLASSES)`` and holds each row's probabilities
        from the fold in which it was held out.
    """
    oof_pred = np.zeros((len(df_train), NUM_CLASSES))
    y_pred = np.zeros((len(df_test), NUM_CLASSES))

    # Identical for every fold, so built once.
    params = {
        'task_type': task_type,
        'bootstrap_type': 'Bernoulli',
        'learning_rate': 0.1,
        'eval_metric': 'MultiClass',
        'loss_function': 'MultiClass',
        'classes_count': NUM_CLASSES,
        'iterations': 1000,
        'random_seed': 2022,
        'depth': 8,
        'subsample': 0.8,
        'leaf_estimation_iterations': 8,
        'reg_lambda': 0.5,
        'class_weights': class_weights,
        'early_stopping_rounds': 100
    }
    x_test = df_test[use_features]

    folds = GroupKFold(n_splits=FOLDS)
    for fold, (tr_ind, val_ind) in enumerate(folds.split(df_train, df_train[TARGET], df_train['sn'])):
        print(f'Fold {fold + 1}')
        # Positional indexing: df_train's index is not guaranteed to be 0..n-1.
        x_train, x_val = df_train[use_features].iloc[tr_ind], df_train[use_features].iloc[val_ind]
        y_train, y_val = df_train[TARGET].iloc[tr_ind], df_train[TARGET].iloc[val_ind]

        model = CatBoostClassifier(**params)

        model.fit(x_train,
                  y_train,
                  eval_set=(x_val, y_val),
                  verbose=100)
        oof_pred[val_ind] = model.predict_proba(x_val)
        y_pred += model.predict_proba(x_test) / folds.n_splits

        score = f1_score(y_val, oof_pred[val_ind].argmax(axis=1), average='macro')
        print(f'F1 score: {score}')

        print("Features importance...")
        feat_imp = pd.DataFrame({'imp': model.feature_importances_, 'feature': use_features})
        print(feat_imp.sort_values(by='imp').reset_index(drop=True))

        del x_train, x_val, y_train, y_val
        gc.collect()

    return y_pred, oof_pred

In [24]:
y_pred, oof_pred = run_ctb(df_train, df_test, use_features)

Fold 1
0:	learn: 1.2312418	test: 1.2270049	best: 1.2270049 (0)	total: 17.6ms	remaining: 17.6s
100:	learn: 0.3336227	test: 0.6496774	best: 0.6328305 (64)	total: 1.44s	remaining: 12.9s
bestTest = 0.6328305086
bestIteration = 64
Shrink model to first 65 iterations.
F1 score: 0.6964031326876516
Features importance...
         imp               feature
0   0.127643            msg_w2v_16
1   0.135408             msg_w2v_0
2   0.417604            msg_w2v_21
3   0.435707            msg_w2v_25
4   0.444410            msg_w2v_20
5   0.470017            msg_w2v_18
6   0.514341             msg_tfv_6
7   0.516179            msg_w2v_27
8   0.527085    log_time_diffs_min
9   0.543440            msg_tfv_13
10  0.573166            msg_w2v_11
11  0.578881            msg_w2v_23
12  0.581546            msg_tfv_14
13  0.588343             msg_w2v_5
14  0.688612             msg_w2v_8
15  0.754191    log_time_diffs_avg
16  0.777324            msg_w2v_28
17  0.856202            msg_w2v_10
18  0.865254        

Fold 5
0:	learn: 1.2251478	test: 1.2391057	best: 1.2391057 (0)	total: 16.6ms	remaining: 16.6s
100:	learn: 0.3319104	test: 0.6965695	best: 0.6820318 (46)	total: 1.42s	remaining: 12.7s
bestTest = 0.682031847
bestIteration = 46
Shrink model to first 47 iterations.
F1 score: 0.6790108826502732
Features importance...
         imp               feature
0   0.068313            msg_w2v_28
1   0.129755             msg_tfv_2
2   0.208915            msg_w2v_25
3   0.241929            msg_w2v_18
4   0.270418            msg_w2v_11
5   0.363373            msg_w2v_21
6   0.363728            msg_w2v_23
7   0.474590    log_time_diffs_std
8   0.546135    log_time_diffs_min
9   0.572601            msg_w2v_16
10  0.582243            msg_tfv_10
11  0.640874             msg_w2v_0
12  0.733339             msg_tfv_0
13  0.733739             msg_tfv_9
14  0.792419             msg_w2v_9
15  0.848386            msg_w2v_20
16  0.848428             msg_w2v_3
17  0.860503            msg_w2v_27
18  0.911110         

Fold 9
0:	learn: 1.2214506	test: 1.2387952	best: 1.2387952 (0)	total: 15.6ms	remaining: 15.6s
100:	learn: 0.3275246	test: 0.6828768	best: 0.6705853 (53)	total: 1.44s	remaining: 12.8s
bestTest = 0.6705853292
bestIteration = 53
Shrink model to first 54 iterations.
F1 score: 0.6717562142507827
Features importance...
         imp               feature
0   0.035971             msg_w2v_5
1   0.083185            msg_w2v_21
2   0.270260             msg_w2v_8
3   0.278124            msg_w2v_27
4   0.282855            msg_w2v_28
5   0.420640            msg_tfv_13
6   0.454020            msg_w2v_11
7   0.535261             msg_tfv_2
8   0.558401             msg_w2v_9
9   0.570413            msg_w2v_10
10  0.579360            msg_w2v_18
11  0.614733    log_time_diffs_max
12  0.695368            msg_tfv_15
13  0.708675    log_time_diffs_min
14  0.739540            msg_w2v_25
15  0.775678             msg_w2v_7
16  0.800470            msg_w2v_16
17  0.861683            msg_w2v_23
18  0.866741    log_

In [25]:
# Ground truth and out-of-fold predictions in the [sn, fault_time, label] layout
# that macro_f1 expects.
target_df = df_train[['sn', 'fault_time', 'label']].copy()
oof_df = target_df.assign(label=oof_pred.argmax(axis=1))

def macro_f1(target_df: pd.DataFrame, submit_df: pd.DataFrame) -> float:
    """
    Compute the class-weighted macro-F1 score.

    The per-class F1 of labels 0..3 is weighted by 3/7, 2/7, 1/7 and 1/7
    respectively and summed.

    :param target_df: ground truth, columns [sn, fault_time, label]
    :param submit_df: predictions, columns [sn, fault_time, label]
    :return: weighted sum of the per-class F1 scores

    Rows are matched with a left merge on (sn, fault_time); a target row with
    no prediction counts as a miss for its class. A key that appears several
    times in both frames yields one merged row per combination.
    """
    weights = [3/7, 2/7, 1/7, 1/7]

    overall_df = target_df.merge(submit_df, how='left', on=['sn', 'fault_time'], suffixes=('_gt', '_pr'))
    # fillna returns a new object; the result used to be discarded, leaving the
    # missing predictions as NaN. Store it so they carry the sentinel -1.
    overall_df['label_pr'] = overall_df['label_pr'].fillna(-1)

    label_gt = overall_df['label_gt']
    label_pr = overall_df['label_pr']

    macro_F1 = 0.
    for i, weight in enumerate(weights):
        is_gt = label_gt == i
        is_pr = label_pr == i
        TP = int((is_gt & is_pr).sum())
        FP = int((~is_gt & is_pr).sum())
        FN = int((is_gt & ~is_pr).sum())
        precision = TP / (TP + FP) if (TP + FP) > 0 else 0
        recall = TP / (TP + FN) if (TP + FN) > 0 else 0
        F1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
        macro_F1 += weight * F1
    return macro_F1


# Weighted macro-F1 of the out-of-fold predictions against the training labels.
macro_f1(target_df, oof_df)

0.5859183210179724

In [26]:
# Predicted class = arg-max of the fold-averaged probabilities.
sub = df_test[['sn', 'fault_time']].assign(label=y_pred.argmax(axis=1))
display(sub.head())
sub['label'].value_counts()

Unnamed: 0,sn,fault_time,label
0,000d33b21436,2020-09-02 16:42:54,3
1,005c5a9218ba,2020-06-28 19:05:16,2
2,0079283bde6e,2020-04-26 21:32:44,3
3,007bdf23b62f,2020-06-16 18:40:39,2
4,00a577a8e54f,2020-04-07 07:16:55,2


2    1618
3     571
1     525
0     297
Name: label, dtype: int64

In [27]:
# Write the submission (sn, fault_time, label) without the index column.
sub.to_csv('baseline2_gkf_sn.csv', index=False)