In [None]:
import os
import pandas as pd
import numpy as np

import xgboost as xgb
import matplotlib.pyplot as plt

# from sklearn.model_selection import GridSearchCV
from utils.F import *

# Load Data

In [None]:
# Global random seed; handed to XGBoost through the 'seed' entry of the params dicts
seed = 55_688

In [None]:
# Read Data
data_folder = '/home/wmnlab/Documents/sheng-ru/HO-Prediction/data/vv2'
# os.listdir() returns entries in arbitrary order, so sort them: sample indices used
# later for debugging then stay stable across runs and machines.
# NOTE(review): indices noted from earlier, unsorted runs may point at different samples now.
# 'record.csv' is filtered out instead of being removed afterwards, so a missing
# record.csv no longer raises ValueError.
data_list = [
    os.path.join(data_folder, file)
    for file in sorted(os.listdir(data_folder))
    if file != 'record.csv'
]
test_data_list1 = [x for x in data_list if ('2023-11-01' in x and '#02' in x)]  # Airport MRT, same day
test_data_list2 = [x for x in data_list if '2023-11-02' in x]  # Airport MRT
test_data_list3 = [x for x in data_list if '2023-11-09' in x]  # Brown line
test_data_list4 = test_data_list1 + test_data_list2 + test_data_list3  # union of all test splits
# Everything that is not held out for testing is used for training.
_held_out = set(test_data_list4)
train_data_list = [x for x in data_list if x not in _held_out]

time_seq_len = 10
pred_time = 3

features = ['num_of_neis', 'RSRP','RSRQ','RSRP1','RSRQ1','nr-RSRP','nr-RSRQ','nr-RSRP1','nr-RSRQ1',
            'E-UTRAN-eventA3','eventA5','NR-eventA3','eventB1-NR-r15',
            'LTE_HO','MN_HO','MN_HO_to_eNB','SN_setup','SN_Rel','SN_HO', 
            'RLF_II', 'RLF_III','SCG_RLF']
ffill_cols = ['RSRP1', 'RSRQ1']
two_hot_vec_cols = ['E-UTRAN-eventA3','eventA5','NR-eventA3','eventB1-NR-r15',
            'LTE_HO','MN_HO','MN_HO_to_eNB','SN_setup','SN_Rel','SN_HO','RLF_II','RLF_III','SCG_RLF']
# presumably each entry is [source column, source column, merged column name] — confirm in ts_array_create
merged_cols = [['LTE_HO', 'MN_HO_to_eNB', 'LTE_HO'], ['RLF_II', 'RLF_III', 'RLF']]


def _build_split(file_list):
    """Run ts_array_create on ``file_list`` with the notebook-wide settings.

    Returns the four values produced by ts_array_create (X, y_cls, y_fst, record)
    followed by X reshaped to 2-D (first axis kept, the remaining axes flattened),
    which is the form passed to xgb.DMatrix.
    """
    X, y_cls, y_fst, record = ts_array_create(
        file_list, time_seq_len, pred_time, features, ffill_cols, two_hot_vec_cols, merged_cols
    )
    return X, y_cls, y_fst, record, X.reshape(X.shape[0], -1)


X_train, y_cls_train, y_fst_train, record_train, X_train_2d = _build_split(train_data_list)
X_test1, y_cls_test1, y_fst_test1, record_test1, X_test1_2d = _build_split(test_data_list1)
X_test2, y_cls_test2, y_fst_test2, record_test2, X_test2_2d = _build_split(test_data_list2)
X_test3, y_cls_test3, y_fst_test3, record_test3, X_test3_2d = _build_split(test_data_list3)
X_test4, y_cls_test4, y_fst_test4, record_test4, X_test4_2d = _build_split(test_data_list4)

In [None]:
# Sanity check: display the shapes of the training inputs and of the classification labels
X_train.shape, y_cls_train.shape

In [None]:
# Count RLF number in every split
rlf_num_train = count_rlf(train_data_list)
rlf_num_test1 = count_rlf(test_data_list1)
rlf_num_test2 = count_rlf(test_data_list2)
rlf_num_test3 = count_rlf(test_data_list3)
rlf_num_test4 = count_rlf(test_data_list4)
# The combined test split (data4) was computed but missing from the report; it is now printed too.
print(
    f'RLF # in training data: {rlf_num_train}\n'
    f'RLF # in testing data1: {rlf_num_test1}\n'
    f'RLF # in testing data2: {rlf_num_test2}\n'
    f'RLF # in testing data3: {rlf_num_test3}\n'
    f'RLF # in testing data4: {rlf_num_test4}\n'
)

In [None]:
# Convert every split (flattened inputs + classification labels) to XGBoost's DMatrix format
dtrain, dtest1, dtest2, dtest3, dtest4 = (
    xgb.DMatrix(inputs_2d, label=labels)
    for inputs_2d, labels in (
        (X_train_2d, y_cls_train),
        (X_test1_2d, y_cls_test1),
        (X_test2_2d, y_cls_test2),
        (X_test3_2d, y_cls_test3),
        (X_test4_2d, y_cls_test4),
    )
)

# Train

In [None]:
# xgb parameters (baseline run)
params = {
    # learning task
    'objective': 'binary:logistic',
    'eval_metric': 'error',
    'seed': seed,
    # tree booster
    'learning_rate': 0.05,
    'max_depth': 8,
    'subsample': 1,
    'colsample_bytree': 0.8,
    # regularisation
    'lambda': 1.0,
    'alpha': 0.01,
    # execution
    'tree_method': 'hist',
    'device': 'cuda:0',
}

In [None]:
# Model Create: train on dtrain while reporting the metric on train and the three test splits
num_rounds = 1000
watchlist = list(zip(
    (dtrain, dtest1, dtest2, dtest3),
    ('train', 'test1', 'test2', 'test3'),
))
model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_rounds,
    evals=watchlist,
    early_stopping_rounds=200,
    verbose_eval=True,
)

In [None]:
# Metric calculate: evaluate the trained model on the training split and on every test split
for _dmat, _labels in (
    (dtrain, y_cls_train),
    (dtest1, y_cls_test1),
    (dtest2, y_cls_test2),
    (dtest3, y_cls_test3),
    (dtest4, y_cls_test4),
):
    performance(model, _dmat, _labels)

In [None]:
# save model
save_path = '../model/rlf_cls_xgb.json'
# Create the target directory first so saving does not fail on a fresh checkout.
os.makedirs(os.path.dirname(save_path), exist_ok=True)
# Booster.save_model returns None, so its result is not kept in a variable.
model.save_model(save_path)

# how to load
# model2 = xgb.Booster()
# model2.load_model(save_path)


# Tuning

In [None]:
# xgb parameters (manual tuning run)
params = {
    # learning task
    'objective': 'binary:logistic',
    'eval_metric': 'error',
    'seed': seed,
    # tree booster
    'learning_rate': 0.1,
    'max_depth': 8,
    'subsample': 0.8,         # row sampling (on data)
    'colsample_bytree': 0.8,  # column sampling (on feature)
    # regularisation
    'lambda': 0.01,  # L2
    'alpha': 0.01,   # L1
    # execution
    'tree_method': 'hist',
    'device': 'cuda:0',
}

# Model Create
num_rounds = 200
# Alternative watchlists tried earlier: train only, or train + the combined dtest4 split.
watchlist = list(zip(
    (dtrain, dtest1, dtest2, dtest3),
    ('train', 'test1', 'test2', 'test3'),
))
model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_rounds,
    evals=watchlist,
    early_stopping_rounds=20,
    verbose_eval=True,
)

# Metric calculate on every split
for _dmat, _labels in (
    (dtrain, y_cls_train),
    (dtest1, y_cls_test1),
    (dtest2, y_cls_test2),
    (dtest3, y_cls_test3),
    (dtest4, y_cls_test4),
):
    performance(model, _dmat, _labels)

## Grid Search

In [None]:
from itertools import product

watchlist = [(dtrain, 'train'), (dtest1, 'test1'), (dtest2, 'test2'), (dtest3, 'test3')]

# Define parameters range
learning_rate_values = [0.05, 0.1, 0.2]
max_depth_values = [4, 5, 6, 7, 8]
subsample_values = [0.8, 0.9, 1.0]
colsample_bytree_values = [0.8, 0.9, 1.0]
alphas = [0.01,0.1,1]
lambdas = [0.01,0.1,1]
num_rounds_values = [50,100,200,300]
# Materialised as a list so tqdm knows the total and can show real progress.
param_grid = list(product(learning_rate_values, max_depth_values, subsample_values,
                          colsample_bytree_values, alphas, lambdas))

# Settings shared by every run of the search.
base_params = {'objective': 'binary:logistic', 'eval_metric': 'error',  'seed': seed,'tree_method': 'hist','device': 'cuda:0'}

# NOTE(review): the file name says "no_early_stopping" but early_stopping_rounds=20 is
# passed below (XGBoost monitors the last entry of `evals`, i.e. test3) — confirm which is intended.
savefile = '../info/xgb_record_no_early_stopping.csv'
with open(savefile, 'w') as f:
    # Header: 7 hyper-parameter columns, then the train metrics, then the test (dtest4) metrics.
    # The second metric group previously started with a duplicated "ACC(train)".
    print('lr, max_d, s_sample, cols_bytree, alpha, lambda, n,ACC(train), AUC(train), AUCPR(train), P(train), R(train), F1(train), ACC(test), AUC(test), AUCPR(test), P(test), R(test), F1(test)',file=f)
    for lr, d, s, cbt, a, lam in tqdm(param_grid):
        # Built once per combination and reused for every num_rounds value.
        # The dict used to be cleared inside the inner loop, so all runs after the
        # first one silently trained with an empty params dict (XGBoost defaults).
        params = {
            **base_params,
            'learning_rate': lr,
            'max_depth': d,
            'subsample': s,
            'colsample_bytree': cbt,
            'alpha': a,
            'lambda': lam,
        }
        for num_rounds in num_rounds_values:
            model = xgb.train(params, dtrain, num_rounds, evals=watchlist,  early_stopping_rounds=20, verbose_eval=False)

            record = [lr, d, s, cbt, a, lam, num_rounds]
            record += list(performance(model, dtrain, y_cls_train))
            # Per-split evaluations; their return values are not written to the CSV.
            performance(model, dtest1, y_cls_test1)
            performance(model, dtest2, y_cls_test2)
            performance(model, dtest3, y_cls_test3)
            record += list(performance(model, dtest4, y_cls_test4))

            print(','.join(str(x) for x in record), end='\n', file=f)
            

In [None]:
# xgb parameters with both regularisation terms switched off
params = {
    # learning task
    'objective': 'binary:logistic',
    'eval_metric': 'error',
    'seed': seed,
    # tree booster
    'learning_rate': 0.1,
    'max_depth': 8,
    'subsample': 0.8,         # row sampling (on data)
    'colsample_bytree': 0.8,  # column sampling (on feature)
    # regularisation
    'lambda': 0,  # L2
    'alpha': 0,   # L1
    # execution
    'tree_method': 'hist',
    'device': 'cuda:0',
}

In [None]:
# Empties the params dict in place. After this cell `params` is {}, so re-run a
# params cell before calling xgb.train again, otherwise it receives an empty dict.
params.clear()

# Debugging

In [None]:
# feature columns
# NOTE(review): differs from `features` only by the merged names (no MN_HO_to_eNB, RLF instead of RLF_II/RLF_III).
cols = [
    # neighbour count and signal measurements
    'num_of_neis',
    'RSRP', 'RSRQ', 'RSRP1', 'RSRQ1',
    'nr-RSRP', 'nr-RSRQ', 'nr-RSRP1', 'nr-RSRQ1',
    # measurement-report events
    'E-UTRAN-eventA3', 'eventA5', 'NR-eventA3', 'eventB1-NR-r15',
    # handover / failure events
    'LTE_HO', 'MN_HO', 'SN_setup', 'SN_Rel', 'SN_HO',
    'RLF', 'SCG_RLF',
]


In [None]:
# Split the combined test set into the four confusion-matrix groups and show each group's size
TP, FP, TN, FN = get_pred_result_ind(model, dtest4, y_cls_test4, X_test4)
tuple(map(len, (TP, FP, TN, FN)))

In [None]:
# For every false-positive sample, look ahead up to `pred_time` samples and record the
# cause of the first RLF found (if any) in the last row of that later sample.
FP_cause = []
for i in FP:
    for offset in range(1, pred_time + 1):
        df, ff = find_original_input(i+offset, record_test4, time_seq_len, ffill_cols)
        series = df.iloc[-1]
        if (series['RLF_II'] or series['RLF_III']):
            FP_cause.append(series['RLF_cause'])
            break

# Tally the causes. Only the three known cause names are counted; anything else is ignored.
# The loop no longer uses `type` as a variable name, which shadowed the builtin for the
# rest of the kernel session, and `count` is no longer reused as the look-ahead counter.
count = {'handoverFailure':0, 'otherFailure':0, 'reconfigurationFailure':0}
for c in FP_cause:
    if c in count:
        count[c] += 1
count

In [None]:
# Specify which split to analyse
dtest = dtest4
y_cls, y_fst, X_test = y_cls_test4, y_fst_test4, X_test4

# Threshold the predicted probabilities at 0.5 to get hard labels
y_pred_proba = model.predict(dtest)
y_pred = (y_pred_proba > 0.5).astype(int)

# NOTE(review): the FP_*/FN_* names are swapped relative to the usual definitions.
#   FP_* holds samples with label == 1 that were predicted 0 (missed positives),
#   FN_* holds samples with label != 1 that were predicted 1.
# The names are kept unchanged because later cells depend on them.
FP_input, FP_input_ind, FP_time = [], [], []
FN_input, FN_input_ind = [], []
TP_input, TP_input_ind, TP_time = [], [], []

for i, (pred, label, t, x) in enumerate(zip(y_pred, y_cls, y_fst, X_test)):
    correct = pred == label
    if correct and label != 1:
        # correctly predicted non-positive sample: nothing is collected
        continue

    x = pd.DataFrame(x, columns=features)
    if correct:
        # label == 1 and predicted 1
        TP_time.append(t)
        TP_input.append(x)
        TP_input_ind.append(i)
    elif label == 1:
        # label == 1 but predicted 0 (stored under the FP_* names, see note above)
        FP_time.append(t)
        FP_input.append(x)
        FP_input_ind.append(i)
    else:
        # label != 1 but predicted 1 (stored under the FN_* names, see note above)
        FN_input.append(x)
        FN_input_ind.append(i)

len(TP_input), len(FP_input), len(FN_input)

In [None]:
# Export settings: which record to look samples up in, where to write, which samples to export
file_record = record_test4
excel_file = '../info/TP_data.xlsx'
input_ind = TP_input_ind

# Write one worksheet per sample index, named after the index
with pd.ExcelWriter(excel_file, engine='xlsxwriter') as writer:
    for ind in tqdm(input_ind):
        df, tar_file = find_original_input(ind, file_record, time_seq_len)
        # The second return value goes into the first row of a 'filename' column; other rows stay empty
        filename_column = [tar_file]
        filename_column += [None] * (len(df) - 1)
        df['filename'] = filename_column
        df.to_excel(writer, sheet_name=str(ind), index=False)

In [None]:
# Look up sample 316 in `file_record` for manual inspection; the second return value is discarded.
# NOTE(review): 316 is a hand-picked index — confirm it still points at the intended sample.
df, _ = find_original_input(316, file_record, time_seq_len)

In [None]:
from pprint import pprint

# Preview the first 20 true-positive sample indices, then their first 20 `y_fst` values
for preview in (TP_input_ind, TP_time):
    pprint(preview[:20])

In [None]:
# Predict a single flattened test sample (index 621); DMatrix needs a 2-D input, so add a leading axis
x = X_test4_2d[621].reshape(1, -1)
X = xgb.DMatrix(x)
model.predict(X)

In [None]:
# Number of samples collected in FN_input divided by 1490.
# NOTE(review): 1490 is a hard-coded denominator — presumably a sample count from one particular run;
# TODO confirm what it represents and derive it from the data instead.
len(FN_input)/1490

In [None]:
# Failed CDF: cumulative share of the times in FP_time, normalised by the number of
# FP_time entries plus the number of FN_input samples
sorted_data = np.sort(FP_time)
n_points = len(sorted_data)
total_errors = n_points + len(FN_input)
cumulative_distribution = np.arange(1, n_points + 1) / total_errors

plt.ylim([0,1])
plt.plot(sorted_data, cumulative_distribution, color='b', linestyle='-', marker='o')
plt.xlabel('Time away from RLF (second)')
plt.title('CDF of the false prediced Data')
plt.grid(True)
plt.show()

In [None]:
# feature importance
importance = model.get_score(importance_type='gain')

# The model was trained on X_train flattened with reshape(N, -1), so a flat column index
# decomposes as  index = time_step * n_feat + feature_position  with n_feat = X_train.shape[-1].
# The old code used two different widths (len(features) for the modulus, a hard-coded 34
# for the division), which mislabels columns and can make distinct columns share one key.
n_feat = X_train.shape[-1]
if len(features) == n_feat:
    feat_names = features
else:
    # NOTE(review): `features` does not match the per-step width of X_train; fall back to
    # positional names rather than attach wrong labels — supply the real column names here.
    feat_names = [f'col{j}' for j in range(n_feat)]

sorted_importance = {}
for k, v in importance.items():
    # Keys look like 'f<flat index>' because the int(k[1:]) parse below relies on that form
    num = int(k[1:])
    step, pos = divmod(num, n_feat)
    # Label = feature name + number of steps counted back from the end of the window
    sorted_importance[f'{feat_names[pos]} {time_seq_len-step}'] = v

# List of (label, gain) pairs, highest gain first
sorted_importance = sorted(sorted_importance.items(), key=lambda x: x[1], reverse=True)

In [None]:
# Plot Feature Importance
top_features = 20

# Take the top entries and reverse them so the largest gain ends up at the top of the barh chart
top_items = sorted_importance[:top_features][::-1]
labels = [name for name, _gain in top_items]
data = [round(gain) for _name, gain in top_items]

bars = plt.barh(labels, data)
plt.bar_label(bars)

# Title and axis labels
plt.title('Feature importance')
plt.xlabel('F score (gain)')
plt.ylabel('Features')

plt.grid()

# Show the figure
plt.show()