In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Load the availability measurements from the cp949-encoded CSV, order them by
# measurement time, remove rows with missing values and renumber the index.
data = (
    pd.read_csv("avail_data.csv", parse_dates=['측정시간'], encoding='cp949')
    .sort_values('측정시간')
    .dropna()
    .reset_index(drop=True)
)
data

Unnamed: 0,측정시간,온오프라인여부,상향파워1,상향파워2,상향SNR,하향파워,하향SNR,셀번호
0,2024-04-01 00:20:00,online,-4.0,36.0,30.0,-6.0,38.0,YSJBF3-2
1,2024-04-01 00:20:00,online,1.0,52.0,30.0,5.0,38.0,YSHS0015
2,2024-04-01 00:20:00,online,-1.0,47.0,30.0,0.0,38.0,YSYW0009
3,2024-04-01 00:20:00,online,0.0,26.0,34.0,5.0,42.0,YSHS0043
4,2024-04-01 00:20:00,online,0.0,41.0,30.0,0.0,36.0,YSYW0063B
...,...,...,...,...,...,...,...,...
235578,2024-05-01 23:00:00,online,3.0,38.0,26.0,-3.0,38.0,YSDGHFC0054
235579,2024-05-01 23:00:00,w-online,0.0,40.0,32.0,2.0,43.0,YSWS0154
235580,2024-05-01 23:00:00,online,0.0,50.0,33.0,2.0,42.0,YSWS0051
235581,2024-05-01 23:00:00,w-online,0.0,41.0,31.0,-2.0,35.0,YSWS6-2


In [2]:
# Load the fault history (cp949-encoded CSV), discard exact duplicate records
# and renumber the index.
outlier = pd.read_csv("장애내역.csv", encoding='cp949').drop_duplicates().reset_index(drop=True)
# Strip the "+09:00" suffix so the timestamps parse as naive datetimes.
# regex=False is explicit because "+" is a regex quantifier: on pandas versions
# where str.replace treats the pattern as a regular expression by default,
# "+09:00" is an invalid pattern.
outlier['측정시간'] = outlier['측정시간'].str.replace("+09:00", "", regex=False)
outlier['측정시간'] = pd.to_datetime(outlier['측정시간'])
outlier

Unnamed: 0,셀번호,측정시간,장애여부
0,YSWS4-5,2024-04-01 10:34:43,MAJOR
1,YSSB1-6,2024-04-01 11:13:33,MAJOR
2,YSWS0244,2024-04-01 15:22:45,CRITICAL
3,YSWSG3-4B,2024-04-02 10:54:25,MAJOR
4,YSJB5-25,2024-04-02 13:41:30,MAJOR
...,...,...,...
181,YSYW0030,2024-04-30 09:33:27,MAJOR
182,YSYW0030,2024-04-30 10:01:47,CRITICAL
183,YSWS8-5,2024-04-30 14:31:26,MAJOR
184,YSYW0009,2024-04-30 14:44:35,MAJOR


In [3]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Remove the 상향파워1 column. errors='ignore' keeps this cell re-runnable:
# `data` is rebound here, so on a second execution the column is already gone
# and a plain drop would raise KeyError.
data = data.drop(columns=['상향파워1'], errors='ignore')

def find_nearest_time(df, target_time, cell_number):
    """Return the last row of ``df`` for ``cell_number`` at or before ``target_time``.

    "Last" means last in row order among the matching rows, so ``df`` must
    already be ordered by '측정시간' ascending for the result to be the nearest
    earlier measurement; the ordering is not checked here.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns '셀번호' and '측정시간'.
    target_time : value comparable with df['측정시간']
        Upper bound (inclusive) for the measurement time.
    cell_number : value comparable with df['셀번호']
        Cell identifier to look up.

    Returns
    -------
    pandas.Series or None
        A copy of the matching row (its ``name`` is the row's index label in
        ``df``), or ``None`` when the cell is absent or has no row at or
        before ``target_time``.
    """
    # One combined mask replaces the per-call unique() scan plus two
    # successive filters; an unknown cell simply yields an empty selection.
    candidates = df[(df['셀번호'] == cell_number) & (df['측정시간'] <= target_time)]
    if candidates.empty:
        return None
    # Return a copy so callers can add or overwrite fields on the row without
    # touching (or warning about) the source frame.
    return candidates.iloc[-1].copy()

# For every fault record, look up the last measurement of the same cell taken
# at or before the fault time and collect it as a positive (장애여부 = 1) row.
merged = []
for _, fault in outlier.iterrows():
    nearest_row = find_nearest_time(data, fault['측정시간'], fault['셀번호'])
    if nearest_row is None:
        # No measurement for this cell at or before the fault time.
        continue
    # Label a private copy instead of mutating the row handed back by the
    # lookup, which may be tied to `data` (chained-assignment warning).
    labelled_row = nearest_row.copy()
    labelled_row['장애여부'] = 1
    merged.append(labelled_row)

# Each collected Series keeps its row label from `data`, so merged_df.index
# identifies the matched measurement rows. Several fault records can resolve
# to the same measurement; keep one copy of each.
merged_df = pd.DataFrame(merged)
merged_df = merged_df.drop_duplicates()
merged_df

Unnamed: 0,측정시간,온오프라인여부,상향파워2,상향SNR,하향파워,하향SNR,셀번호,장애여부
2164,2024-04-01 09:00:00,online,42.0,30.0,-4.0,39.0,YSWS4-5,1
2686,2024-04-01 10:10:00,online,33.0,31.0,-6.0,35.0,YSSB1-6,1
11427,2024-04-02 10:50:00,online,35.0,33.0,6.0,42.0,YSWSG3-4B,1
11350,2024-04-02 10:30:00,online,30.0,34.0,3.0,36.0,YSJB5-25,1
20268,2024-04-03 08:50:00,online,40.0,30.0,9.0,42.0,YSJSC2003,1
...,...,...,...,...,...,...,...,...
226336,2024-04-30 09:00:00,w-online,54.0,26.0,16.0,41.0,YSYW0030,1
226497,2024-04-30 10:00:00,online,55.0,30.0,0.0,39.0,YSYW0030,1
209645,2024-04-27 21:45:00,online,48.0,31.0,-4.0,39.0,YSWS8-5,1
227588,2024-04-30 14:35:00,w-online,42.0,30.0,-7.0,38.0,YSYW0009,1


In [4]:
# Start with every measurement labelled as normal (0), then flag the rows
# whose index labels appear in merged_df (the measurements matched to faults).
data['장애여부'] = 0
# A single .loc[rows, column] assignment writes into `data` itself. The former
# data['장애여부'].loc[...] = 1 was chained assignment, which modifies a
# temporary object and leaves `data` unchanged under pandas Copy-on-Write.
data.loc[merged_df.index, '장애여부'] = 1
# data['온오프라인여부'].value_counts()

In [5]:
# Add calendar features taken from the measurement timestamp:
# day of month, hour, minute and weekday number.
timestamp_parts = data['측정시간'].dt
for feature_name, feature_values in (
    ('일', timestamp_parts.day),
    ('시', timestamp_parts.hour),
    ('분', timestamp_parts.minute),
    ('요일', timestamp_parts.weekday),
):
    data[feature_name] = feature_values

# Keep a handle on the timestamp column.
ob_time = data['측정시간']
data

Unnamed: 0,측정시간,온오프라인여부,상향파워2,상향SNR,하향파워,하향SNR,셀번호,장애여부,일,시,분,요일
0,2024-04-01 00:20:00,online,36.0,30.0,-6.0,38.0,YSJBF3-2,0,1,0,20,0
1,2024-04-01 00:20:00,online,52.0,30.0,5.0,38.0,YSHS0015,0,1,0,20,0
2,2024-04-01 00:20:00,online,47.0,30.0,0.0,38.0,YSYW0009,0,1,0,20,0
3,2024-04-01 00:20:00,online,26.0,34.0,5.0,42.0,YSHS0043,0,1,0,20,0
4,2024-04-01 00:20:00,online,41.0,30.0,0.0,36.0,YSYW0063B,0,1,0,20,0
...,...,...,...,...,...,...,...,...,...,...,...,...
235578,2024-05-01 23:00:00,online,38.0,26.0,-3.0,38.0,YSDGHFC0054,0,1,23,0,2
235579,2024-05-01 23:00:00,w-online,40.0,32.0,2.0,43.0,YSWS0154,0,1,23,0,2
235580,2024-05-01 23:00:00,online,50.0,33.0,2.0,42.0,YSWS0051,0,1,23,0,2
235581,2024-05-01 23:00:00,w-online,41.0,31.0,-2.0,35.0,YSWS6-2,0,1,23,0,2


In [6]:
from sklearn.preprocessing import LabelEncoder

le1 = LabelEncoder()
le2 = LabelEncoder()

# The 온오프라인여부 column is removed just below, so its encoded values are
# never used. le1 is still fitted on it (its learned classes stay available),
# but the codes are no longer written into a column that is about to be dropped.
le1.fit(data['온오프라인여부'])
# Replace the cell identifiers with integer codes; le2 keeps the mapping.
data['셀번호'] = le2.fit_transform(data['셀번호'])

data = data.drop(columns=['온오프라인여부'])

data

Unnamed: 0,측정시간,상향파워2,상향SNR,하향파워,하향SNR,셀번호,장애여부,일,시,분,요일
0,2024-04-01 00:20:00,36.0,30.0,-6.0,38.0,42,0,1,0,20,0
1,2024-04-01 00:20:00,52.0,30.0,5.0,38.0,8,0,1,0,20,0
2,2024-04-01 00:20:00,47.0,30.0,0.0,38.0,92,0,1,0,20,0
3,2024-04-01 00:20:00,26.0,34.0,5.0,42.0,16,0,1,0,20,0
4,2024-04-01 00:20:00,41.0,30.0,0.0,36.0,96,0,1,0,20,0
...,...,...,...,...,...,...,...,...,...,...,...
235578,2024-05-01 23:00:00,38.0,26.0,-3.0,38.0,2,0,1,23,0,2
235579,2024-05-01 23:00:00,40.0,32.0,2.0,43.0,64,0,1,23,0,2
235580,2024-05-01 23:00:00,50.0,33.0,2.0,42.0,62,0,1,23,0,2
235581,2024-05-01 23:00:00,41.0,31.0,-2.0,35.0,88,0,1,23,0,2


In [7]:
def add_lag_features(df, group_col, target_cols, lag_num):
    """Return a copy of ``df`` with per-group lagged copies of ``target_cols``.

    For every column ``c`` in ``target_cols`` and every ``k`` in
    ``1..lag_num`` a column named ``f'{c}_lag{k}'`` is added. It holds the
    value of ``c`` found ``k`` rows earlier within the same ``group_col``
    group, in the row order of ``df``. Rows with fewer than ``k``
    predecessors in their group get NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    group_col : str
        Column whose values define the groups that are shifted independently.
    target_cols : iterable of str
        Columns to create lagged copies of.
    lag_num : int
        Largest lag to generate; values below 1 add no columns.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the lag columns appended, grouped by source
        column and ordered by increasing lag.
    """
    temp_df = df.copy()
    # Build the group-by once instead of once per (column, lag) pair; adding
    # lag columns never changes the grouping keys or the row order.
    grouped = temp_df.groupby(group_col)
    for col in target_cols:
        grouped_col = grouped[col]
        for lag in range(1, lag_num + 1):
            temp_df[f'{col}_lag{lag}'] = grouped_col.shift(lag)
    return temp_df

# Number of earlier readings to attach per signal, and the signal columns
# that receive lagged copies.
lag_num = 5
target_cols = ['상향파워2', '상향SNR', '하향파워', '하향SNR']

# Lag within each cell, then drop every row that has a missing value (this
# includes rows without a full set of lag values) and renumber the index.
data_with_lags = (
    add_lag_features(data, '셀번호', target_cols, lag_num)
    .dropna()
    .reset_index(drop=True)
)
data_with_lags

Unnamed: 0,측정시간,상향파워2,상향SNR,하향파워,하향SNR,셀번호,장애여부,일,시,분,...,하향파워_lag1,하향파워_lag2,하향파워_lag3,하향파워_lag4,하향파워_lag5,하향SNR_lag1,하향SNR_lag2,하향SNR_lag3,하향SNR_lag4,하향SNR_lag5
0,2024-04-01 00:20:00,42.0,29.0,-2.0,36.0,28,0,1,0,20,...,7.0,-2.0,-2.0,10.0,2.0,37.0,37.0,37.0,38.0,38.0
1,2024-04-01 00:20:00,35.0,34.0,-1.0,37.0,2,0,1,0,20,...,0.0,-2.0,1.0,1.0,-2.0,37.0,35.0,37.0,37.0,36.0
2,2024-04-01 00:20:00,43.0,26.0,3.0,37.0,28,0,1,0,20,...,-2.0,7.0,-2.0,-2.0,10.0,36.0,37.0,37.0,37.0,38.0
3,2024-04-01 00:20:00,46.0,28.0,-4.0,38.0,2,0,1,0,20,...,-1.0,0.0,-2.0,1.0,1.0,37.0,37.0,35.0,37.0,37.0
4,2024-04-01 00:20:00,45.0,28.0,4.0,38.0,2,0,1,0,20,...,-4.0,-1.0,0.0,-2.0,1.0,38.0,37.0,37.0,35.0,37.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
235086,2024-05-01 23:00:00,38.0,26.0,-3.0,38.0,2,0,1,23,0,...,3.0,-1.0,0.0,-5.0,-9.0,38.0,38.0,37.0,36.0,38.0
235087,2024-05-01 23:00:00,40.0,32.0,2.0,43.0,64,0,1,23,0,...,-5.0,2.0,23.0,5.0,4.0,41.0,43.0,44.0,41.0,41.0
235088,2024-05-01 23:00:00,50.0,33.0,2.0,42.0,62,0,1,23,0,...,1.0,2.0,9.0,1.0,0.0,41.0,41.0,41.0,41.0,40.0
235089,2024-05-01 23:00:00,41.0,31.0,-2.0,35.0,88,0,1,23,0,...,-1.0,-2.0,-1.0,-1.0,-3.0,36.0,35.0,36.0,36.0,39.0


In [10]:
# Show the original cell identifiers learned by le2; the position of an
# identifier in this array is the integer code stored in data['셀번호'].
le2.classes_

array(['YSDG10-1', 'YSDGHFC0053', 'YSDGHFC0054', 'YSDGHFC0069',
       'YSDGRFOG0065', 'YSHS0011', 'YSHS0013', 'YSHS0014', 'YSHS0015',
       'YSHS0024', 'YSHS0035', 'YSHS0036', 'YSHS0037', 'YSHS0038',
       'YSHS0039', 'YSHS0042', 'YSHS0043', 'YSHS0045', 'YSHS0046',
       'YSHS0047', 'YSHS0051', 'YSHS0053', 'YSHS0054', 'YSHS0055',
       'YSHS0058', 'YSHS0065', 'YSHS0069-1', 'YSHS0069-2', 'YSHS0070',
       'YSHS0075', 'YSHS0079', 'YSHS0083', 'YSHS0092', 'YSHSH5005',
       'YSHSHFC0007', 'YSHSHFC0007-2', 'YSHSHFC0008', 'YSHSHFC0009',
       'YSJB0075', 'YSJB1-9', 'YSJB5-25', 'YSJB5-27', 'YSJBF3-2',
       'YSJBG6-1F', 'YSJBH910', 'YSJSC2003', 'YSJSC2003-1', 'YSJSC2004',
       'YSJSC3004', 'YSJSC3010', 'YSJSC4007', 'YSJSC5001', 'YSMMH4004',
       'YSMMH5006', 'YSMMH6003', 'YSMYRFOG0038', 'YSSB1-11', 'YSSB1-6',
       'YSSB2-15', 'YSSB2-4', 'YSSB2-7', 'YSWS0049', 'YSWS0051',
       'YSWS0087', 'YSWS0154', 'YSWS0156', 'YSWS0189', 'YSWS0217',
       'YSWS0217-2', 'YSWS0225', 'YSWS022

In [18]:
# Save the lag-feature table as a cp949-encoded CSV without the index column.
# index=False is the documented boolean form of the previous index=0.
data_with_lags.to_csv("data_with_lags.csv", index=False, encoding='cp949')

In [15]:
# Select the measurements of 2024-04-15 (start inclusive, end exclusive) and
# print how many of them are labelled normal (0) versus fault (1).
window_start = '2024-04-15 00:00:00'
window_end = '2024-04-16 00:00:00'
measurement_time = data_with_lags['측정시간']
in_window = (measurement_time >= window_start) & (measurement_time < window_end)
filtered_data = data_with_lags[in_window]
print(filtered_data['장애여부'].value_counts())
filtered_data

장애여부
0    7063
1      10
Name: count, dtype: int64


Unnamed: 0,측정시간,상향파워2,상향SNR,하향파워,하향SNR,셀번호,장애여부,일,시,분,...,하향파워_lag1,하향파워_lag2,하향파워_lag3,하향파워_lag4,하향파워_lag5,하향SNR_lag1,하향SNR_lag2,하향SNR_lag3,하향SNR_lag4,하향SNR_lag5
124989,2024-04-15 00:15:00,41.0,34.0,2.0,34.0,25,0,15,0,15,...,1.0,-10.0,-7.0,1.0,3.0,35.0,33.0,35.0,34.0,34.0
124990,2024-04-15 00:15:00,31.0,29.0,4.0,30.0,2,0,15,0,15,...,7.0,4.0,0.0,3.0,-2.0,34.0,38.0,32.0,38.0,31.0
124991,2024-04-15 00:15:00,42.0,31.0,12.0,43.0,77,0,15,0,15,...,11.0,12.0,11.0,11.0,9.0,42.0,43.0,42.0,43.0,42.0
124992,2024-04-15 00:15:00,44.0,34.0,1.0,39.0,43,0,15,0,15,...,3.0,1.0,1.0,7.0,3.0,37.0,29.0,39.0,40.0,34.0
124993,2024-04-15 00:15:00,37.0,31.0,-6.0,33.0,84,0,15,0,15,...,-6.0,-6.0,-6.0,-6.0,-7.0,40.0,34.0,34.0,34.0,40.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
132057,2024-04-15 23:05:00,32.0,23.0,6.0,39.0,18,0,15,23,5,...,5.0,-4.0,0.0,5.0,2.0,39.0,36.0,39.0,38.0,39.0
132058,2024-04-15 23:05:00,45.0,39.0,-2.0,37.0,50,0,15,23,5,...,-2.0,-2.0,-2.0,2.0,1.0,35.0,37.0,35.0,29.0,38.0
132059,2024-04-15 23:05:00,40.0,28.0,1.0,30.0,2,0,15,23,5,...,7.0,4.0,6.0,11.0,4.0,38.0,39.0,36.0,38.0,38.0
132060,2024-04-15 23:05:00,41.0,36.0,-5.0,34.0,25,0,15,23,5,...,-6.0,2.0,-4.0,-6.0,2.0,35.0,34.0,36.0,34.0,34.0


In [16]:
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc, confusion_matrix
from tqdm.auto import tqdm

# Split the selected day into the label and the feature matrix; the raw
# timestamp is not passed to the model.
y = filtered_data['장애여부']
X = filtered_data.drop(columns=['장애여부', '측정시간'])

# Load the model stored in xgb_model.json and take the second probability
# column (reported below as 장애확률) as the score.
model = XGBClassifier(learning_rate=0.1, n_estimators=1000)
model.load_model("xgb_model.json")
y_proba = model.predict_proba(X)[:, 1]

# Scores at or above this threshold are flagged as faults.
TH = 0.033

# Features plus timestamp, true label, thresholded prediction and score.
all_data = X.assign(**{
    '측정시간': filtered_data['측정시간'],
    '실제장애여부': y,
    '장애여부': (y_proba >= TH).astype(int),
    '장애확률': y_proba,
})
# Show the 50 highest-scoring rows.
all_data.sort_values('장애확률', ascending=False).head(50)

Unnamed: 0,상향파워2,상향SNR,하향파워,하향SNR,셀번호,일,시,분,요일,상향파워2_lag1,...,하향파워_lag5,하향SNR_lag1,하향SNR_lag2,하향SNR_lag3,하향SNR_lag4,하향SNR_lag5,측정시간,실제장애여부,장애여부,장애확률
126663,51.0,25.0,-7.0,40.0,83,15,6,5,0,35.0,...,-7.0,41.0,42.0,44.0,41.0,41.0,2024-04-15 06:05:00,1,1,0.436118
127284,51.0,33.0,-2.0,43.0,75,15,8,15,0,57.0,...,-8.0,44.0,42.0,42.0,44.0,36.0,2024-04-15 08:15:00,1,1,0.396137
131653,53.0,36.0,-1.0,40.0,0,15,22,5,0,0.0,...,10.0,0.0,0.0,0.0,41.0,40.0,2024-04-15 22:05:00,1,1,0.388888
129467,57.0,30.0,2.0,43.0,75,15,14,5,0,57.0,...,6.0,43.0,43.0,41.0,42.0,41.0,2024-04-15 14:05:00,1,1,0.339118
129227,54.0,35.0,0.0,43.0,75,15,13,15,0,52.0,...,0.0,41.0,45.0,40.0,41.0,43.0,2024-04-15 13:15:00,0,1,0.180122
125308,51.0,35.0,2.0,43.0,75,15,1,5,0,54.0,...,0.0,40.0,43.0,39.0,44.0,41.0,2024-04-15 01:05:00,0,1,0.163571
125289,54.0,35.0,0.0,41.0,75,15,1,5,0,50.0,...,-1.0,43.0,43.0,40.0,43.0,45.0,2024-04-15 01:05:00,0,1,0.146383
129491,51.0,33.0,2.0,43.0,75,15,14,15,0,57.0,...,2.0,43.0,43.0,43.0,41.0,42.0,2024-04-15 14:15:00,0,1,0.124913
129306,54.0,36.0,2.0,42.0,75,15,13,25,0,54.0,...,9.0,41.0,45.0,42.0,41.0,42.0,2024-04-15 13:25:00,0,1,0.093573
129314,54.0,35.0,0.0,43.0,75,15,13,25,0,54.0,...,3.0,41.0,42.0,41.0,45.0,42.0,2024-04-15 13:25:00,1,1,0.090556
