In [37]:
import pandas as pd
import numpy as np
import warnings
# NOTE(review): this suppresses every warning category for the rest of the session,
# so library deprecation/assignment warnings will not be shown — confirm this is intended.
warnings.filterwarnings('ignore')

# Load the measurement data, parse the timestamp column, and order rows chronologically
# with a fresh 0..n-1 index (later cells rely on this ordering and on these index labels).
data = (
    pd.read_csv("avail_data.csv", parse_dates=['측정시간'], encoding='cp949')
    .sort_values('측정시간')
    .reset_index(drop=True)
)
data

Unnamed: 0,측정시간,온오프라인여부,상향파워2,상향SNR,하향파워,하향SNR,셀번호,total_num,cum_num,cum_rate
0,2024-04-01 00:20:00,onlie,48.666667,31.333333,-2.333333,38.666667,YSHS0079,1,1,1.000000
1,2024-04-01 00:20:00,onlie,44.000000,25.000000,0.000000,37.500000,YSHSHFC0008,1,1,1.000000
2,2024-04-01 00:20:00,onlie,40.500000,31.000000,7.000000,42.500000,YSWS0154,1,1,1.000000
3,2024-04-01 00:20:00,onlie,43.000000,30.400000,-0.200000,36.200000,YSYW0063B,1,1,1.000000
4,2024-04-01 00:20:00,onlie,43.000000,35.000000,-1.000000,40.000000,YSHS0035,1,0,0.000000
...,...,...,...,...,...,...,...,...,...,...
95740,2024-05-01 23:00:00,onlie,32.000000,27.000000,-10.000000,39.500000,YSJSC2004,2405,1380,0.573805
95741,2024-05-01 23:00:00,onlie,35.000000,32.000000,13.000000,38.000000,YSHS0013,1329,341,0.256584
95742,2024-05-01 23:00:00,onlie,50.000000,31.000000,-3.000000,37.000000,YSSB2-7,1434,341,0.237796
95743,2024-05-01 23:00:00,onlie,43.750000,20.500000,5.250000,42.500000,YSWS0217,2177,1084,0.497933


In [38]:
# Load data (2): the incident history.
# NOTE(review): duplicates are removed while the 'index' column is still present; if that
# column is unique per row this step removes nothing — confirm whether that is intended.
outlier = pd.read_csv("장애내역_수정.csv", encoding='cp949').drop_duplicates().reset_index(drop=True)

# Incident timestamps carry a trailing "+09:00" offset that the measurement data does not
# have, so it is stripped. regex=False makes this a literal substring replacement on every
# pandas version; interpreted as a regex, the leading '+' would be an invalid pattern.
outlier['측정시간'] = outlier['측정시간'].str.replace("+09:00", "", regex=False)

# Convert the timestamp column from string to datetime.
outlier['측정시간'] = pd.to_datetime(outlier['측정시간'])
outlier = outlier.drop(columns=['index'])
outlier

Unnamed: 0,셀번호,측정시간,장애여부
0,YSWS4-5,2024-04-01 10:34:43,MAJOR
1,YSSB1-6,2024-04-01 11:13:33,MAJOR
2,YSWS0244,2024-04-01 15:22:45,CRITICAL
3,YSWSG3-4B,2024-04-02 10:54:25,MAJOR
4,YSJB5-25,2024-04-02 13:41:30,MAJOR
...,...,...,...
181,YSYW0030,2024-04-30 09:33:27,MAJOR
182,YSYW0030,2024-04-30 10:01:47,CRITICAL
183,YSWS8-5,2024-04-30 14:31:26,MAJOR
184,YSYW0009,2024-04-30 14:44:35,MAJOR


In [39]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# 장애내역 데이터는 시간단위가 초 단위로 측정이 되었지만 settop 데이터는 5분단위임 따라서 settop데이터에서 가장 가까운 앞쪽 시간을 찾아 장애 발생지점으로 선정

def find_nearest_time(df, target_time, cell_number, max_gap=None):
    """Return the latest measurement row of one cell at or before an incident time.

    Incident times are recorded to the second while measurements are sampled on a
    coarser grid, so the incident is attributed to the closest preceding measurement
    of the same cell.

    Parameters
    ----------
    df : pandas.DataFrame
        Measurements; must contain the columns '측정시간' (timestamp) and '셀번호' (cell id).
    target_time : timestamp-like
        Incident time; only rows with '측정시간' <= target_time are considered.
    cell_number : str
        Cell id to match against '셀번호'.
    max_gap : timedelta-like or None, optional
        Largest allowed distance between the incident and the matched measurement
        (anything accepted by ``pd.Timedelta``, e.g. ``'1h'``). The default ``None``
        applies no limit, which is the original behavior.

    Returns
    -------
    pandas.Series or None
        The matched row (its ``name`` is the row's index label in ``df``), or ``None``
        when the cell has no measurement at or before ``target_time`` or the closest
        one is further away than ``max_gap``.
    """
    # A single combined mask; an unknown cell id simply yields no candidates, so no
    # separate membership scan over the whole column is needed.
    candidates = df[(df['셀번호'] == cell_number) & (df['측정시간'] <= target_time)]
    if candidates.empty:
        return None

    # Stable sort: already-sorted input keeps its row order (so ties resolve to the same
    # last row as before), while unsorted input no longer returns an arbitrary row.
    nearest = candidates.sort_values('측정시간', kind='stable').iloc[-1]

    # Optionally reject matches that are too old to plausibly describe the incident.
    if max_gap is not None and target_time - nearest['측정시간'] > pd.Timedelta(max_gap):
        return None

    return nearest

merged = []

# Walk through every incident and look up the measurement row it maps to.
for _, incident in outlier.iterrows():
    matched = find_nearest_time(data, incident['측정시간'], incident['셀번호'])

    # Incidents with no usable measurement are skipped.
    if matched is None:
        continue

    # Tag the matched measurement as a failure (1) and keep it.
    matched['장애여부'] = 1
    merged.append(matched)

# Turn the collected rows into a frame; several incidents can map to the same
# measurement, so identical rows are collapsed.
merged_df = pd.DataFrame(merged).drop_duplicates()
merged_df

Unnamed: 0,측정시간,온오프라인여부,상향파워2,상향SNR,하향파워,하향SNR,셀번호,total_num,cum_num,cum_rate,장애여부
785,2024-04-01 09:00:00,onlie,42.000000,30.000000,-4.000000,39.000000,YSWS4-5,4,0,0.000000,1
1018,2024-04-01 10:10:00,onlie,33.000000,31.000000,-6.000000,35.000000,YSSB1-6,5,0,0.000000,1
4371,2024-04-02 10:50:00,onlie,35.000000,33.000000,6.000000,42.000000,YSWSG3-4B,40,29,0.725000,1
4335,2024-04-02 10:30:00,onlie,30.000000,34.000000,-13.000000,34.500000,YSJB5-25,51,26,0.509804,1
7631,2024-04-03 08:50:00,onlie,40.000000,30.000000,9.000000,42.000000,YSJSC2003,88,14,0.159091,1
...,...,...,...,...,...,...,...,...,...,...,...
91742,2024-04-30 09:00:00,onlie,54.000000,26.000000,16.000000,41.000000,YSYW0030,795,260,0.327044,1
91843,2024-04-30 10:00:00,onlie,55.000000,30.000000,0.000000,39.000000,YSYW0030,798,260,0.325815,1
85052,2024-04-27 21:45:00,onlie,48.000000,31.000000,-4.000000,39.000000,YSWS8-5,20,0,0.000000,1
92329,2024-04-30 14:35:00,onlie,42.888889,29.000000,-2.888889,36.666667,YSYW0009,2310,1133,0.490476,1


In [40]:
# Label convention: 1 = failure, 0 = normal. Start with every row marked normal.
data['장애여부'] = 0

# merged_df rows were taken from data, so they carry data's index labels; flag those rows.
# A single .loc[rows, column] assignment writes into data itself. The previous chained form
# (data[col].loc[rows] = 1) assigns through an intermediate Series and is not guaranteed to
# update data (it does nothing under pandas Copy-on-Write).
data.loc[merged_df.index, '장애여부'] = 1

In [41]:
# Split the timestamp into day, hour, minute and weekday features.
# Year is omitted because it does not vary; month is omitted because May only
# contributes a single day (May 1st).
for feature_name, dt_attribute in (('일', 'day'), ('시', 'hour'), ('분', 'minute'), ('요일', 'weekday')):
    data[feature_name] = getattr(data['측정시간'].dt, dt_attribute)
data

Unnamed: 0,측정시간,온오프라인여부,상향파워2,상향SNR,하향파워,하향SNR,셀번호,total_num,cum_num,cum_rate,장애여부,일,시,분,요일
0,2024-04-01 00:20:00,onlie,48.666667,31.333333,-2.333333,38.666667,YSHS0079,1,1,1.000000,0,1,0,20,0
1,2024-04-01 00:20:00,onlie,44.000000,25.000000,0.000000,37.500000,YSHSHFC0008,1,1,1.000000,0,1,0,20,0
2,2024-04-01 00:20:00,onlie,40.500000,31.000000,7.000000,42.500000,YSWS0154,1,1,1.000000,0,1,0,20,0
3,2024-04-01 00:20:00,onlie,43.000000,30.400000,-0.200000,36.200000,YSYW0063B,1,1,1.000000,0,1,0,20,0
4,2024-04-01 00:20:00,onlie,43.000000,35.000000,-1.000000,40.000000,YSHS0035,1,0,0.000000,0,1,0,20,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95740,2024-05-01 23:00:00,onlie,32.000000,27.000000,-10.000000,39.500000,YSJSC2004,2405,1380,0.573805,0,1,23,0,2
95741,2024-05-01 23:00:00,onlie,35.000000,32.000000,13.000000,38.000000,YSHS0013,1329,341,0.256584,0,1,23,0,2
95742,2024-05-01 23:00:00,onlie,50.000000,31.000000,-3.000000,37.000000,YSSB2-7,1434,341,0.237796,0,1,23,0,2
95743,2024-05-01 23:00:00,onlie,43.750000,20.500000,5.250000,42.500000,YSWS0217,2177,1084,0.497933,0,1,23,0,2


### 온오프라인여부, 셀번호 미사용으로 인한 주석처리 ###
미사용 이유

1. 온오프라인
<br>기존 온오프라인 사용여부 열의 여러 값들(w-online 등등)을 모두 online으로 봐도 무방하다는 가이드라인이 존재했음
<br>전처리 시 on/off 이분화를 진행함
<br>하지만 명칭이 다르기 때문에 100% 일치하지는 않는 값임
<br>피쳐 포함, 미포함 분석을 해보았는데 결과는 비슷하게 나와 분석에서 제외하기로 결정

2. 셀번호
<br>장애가 있는 셋탑박스의 데이터만 사용하여 모델을 학습
<br>하지만 실제 장애 예측은 전체 셋탑박스에 대해 이루어져야 함 즉, 모델이 장애가 있는 셋탑박스뿐만 아니라 장애가 없는 셋탑박스도 예측할 수 있어야 함
<br>셀번호를 모델 학습에 포함시키면 오류가 발생할 수 있음
<br>또한 셀번호는 각 셋탑박스를 구분하는 고유 식별자이므로, 특정 셋탑박스에 대한 학습 데이터가 모델에 과도하게 맞춰지는(과적합) 현상을 일으킬 수 있음
<br>마찬가지로 피쳐 포함, 미포함 분석을 해보았는데 결과는 비슷하게 나와 분석에서 제외하기로 결정

In [42]:
# Label encoding of the two string columns is intentionally disabled (kept for reference):
# from sklearn.preprocessing import LabelEncoder
# le1 = LabelEncoder()
# le2 = LabelEncoder()
# data['온오프라인여부'] = le1.fit_transform(data['온오프라인여부'])
# data['셀번호'] = le2.fit_transform(data['셀번호'])

# Only cum_rate is used, so total_num and cum_num are removed; the online/offline
# column is removed as well.
data = data.drop(columns=['total_num', 'cum_num', '온오프라인여부'])

data

Unnamed: 0,측정시간,상향파워2,상향SNR,하향파워,하향SNR,셀번호,cum_rate,장애여부,일,시,분,요일
0,2024-04-01 00:20:00,48.666667,31.333333,-2.333333,38.666667,YSHS0079,1.000000,0,1,0,20,0
1,2024-04-01 00:20:00,44.000000,25.000000,0.000000,37.500000,YSHSHFC0008,1.000000,0,1,0,20,0
2,2024-04-01 00:20:00,40.500000,31.000000,7.000000,42.500000,YSWS0154,1.000000,0,1,0,20,0
3,2024-04-01 00:20:00,43.000000,30.400000,-0.200000,36.200000,YSYW0063B,1.000000,0,1,0,20,0
4,2024-04-01 00:20:00,43.000000,35.000000,-1.000000,40.000000,YSHS0035,0.000000,0,1,0,20,0
...,...,...,...,...,...,...,...,...,...,...,...,...
95740,2024-05-01 23:00:00,32.000000,27.000000,-10.000000,39.500000,YSJSC2004,0.573805,0,1,23,0,2
95741,2024-05-01 23:00:00,35.000000,32.000000,13.000000,38.000000,YSHS0013,0.256584,0,1,23,0,2
95742,2024-05-01 23:00:00,50.000000,31.000000,-3.000000,37.000000,YSSB2-7,0.237796,0,1,23,0,2
95743,2024-05-01 23:00:00,43.750000,20.500000,5.250000,42.500000,YSWS0217,0.497933,0,1,23,0,2


In [43]:
# 특정 시간기준 이전 시점의 데이터와 비교하는 lag 추가
# t시점 데이터에서 lag가 1인 경우 t-1 시점의 데이터
def add_lag_features(df, group_col, target_cols, lag_num):
    """Return a copy of ``df`` extended with per-group lagged versions of selected columns.

    For every column in ``target_cols`` and every lag k in 1..``lag_num`` a new column
    ``{col}_lag{k}`` is added holding the value found k rows earlier within the same
    ``group_col`` group (NaN where a group has fewer than k preceding rows). Lags are
    counted in rows, in the frame's current order. The input frame is not modified.
    """
    lagged = df.copy()

    # Enumerate every (column, lag) combination, column-major, lags ascending.
    combos = ((col, step) for col in target_cols for step in range(1, lag_num + 1))

    for col, step in combos:
        # Shift inside each group so values never leak from one group into another.
        lagged[f'{col}_lag{step}'] = lagged.groupby(group_col)[col].shift(step)

    return lagged

# Signal columns that receive lag features, and how many lags to create for each.
target_cols = ['상향파워2', '상향SNR', '하향파워', '하향SNR']
lag_num = 5

# Add per-cell lag features, then drop every row that still contains a missing value
# and renumber the remaining rows.
data_with_lags = (
    add_lag_features(data, '셀번호', target_cols, lag_num)
    .dropna()
    .reset_index(drop=True)
)
data_with_lags

Unnamed: 0,측정시간,상향파워2,상향SNR,하향파워,하향SNR,셀번호,cum_rate,장애여부,일,시,...,하향파워_lag1,하향파워_lag2,하향파워_lag3,하향파워_lag4,하향파워_lag5,하향SNR_lag1,하향SNR_lag2,하향SNR_lag3,하향SNR_lag4,하향SNR_lag5
0,2024-04-01 01:30:00,43.500000,30.500000,5.500000,42.000000,YSWS0217,0.500000,0,1,1,...,10.000000,10.000000,13.000000,-3.000000,0.666667,41.000000,41.000000,45.000000,41.000000,42.333333
1,2024-04-01 01:30:00,43.000000,35.000000,2.000000,32.000000,YSJBG6-1F,0.666667,0,1,1,...,2.333333,1.000000,2.333333,-2.000000,-1.250000,35.333333,39.000000,34.333333,34.000000,36.000000
2,2024-04-01 01:30:00,53.000000,39.000000,-6.000000,32.000000,YSJSC4007,0.333333,0,1,1,...,-6.000000,3.000000,2.000000,-3.000000,-3.000000,31.000000,38.000000,38.000000,35.500000,35.500000
3,2024-04-01 01:30:00,41.454545,28.818182,-0.272727,37.181818,YSDGHFC0054,1.000000,0,1,1,...,-1.400000,-1.846154,-2.466667,-2.000000,-1.727273,36.866667,36.615385,36.200000,36.700000,36.636364
4,2024-04-01 01:30:00,46.000000,29.333333,-8.000000,33.666667,YSHS0065,1.000000,0,1,1,...,2.000000,-4.333333,-1.000000,0.666667,1.200000,34.500000,34.000000,34.833333,35.333333,34.800000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95248,2024-05-01 23:00:00,32.000000,27.000000,-10.000000,39.500000,YSJSC2004,0.573805,0,1,23,...,-3.000000,1.000000,6.000000,6.000000,1.000000,38.000000,39.909091,41.000000,40.000000,41.000000
95249,2024-05-01 23:00:00,35.000000,32.000000,13.000000,38.000000,YSHS0013,0.256584,0,1,23,...,9.500000,13.000000,12.000000,12.000000,12.000000,37.000000,38.000000,38.000000,38.000000,38.000000
95250,2024-05-01 23:00:00,50.000000,31.000000,-3.000000,37.000000,YSSB2-7,0.237796,0,1,23,...,-1.000000,-2.000000,-1.000000,0.000000,-12.000000,35.000000,34.000000,35.000000,35.000000,36.000000
95251,2024-05-01 23:00:00,43.750000,20.500000,5.250000,42.500000,YSWS0217,0.497933,0,1,23,...,8.000000,7.000000,2.000000,1.000000,6.000000,43.000000,43.000000,43.000000,42.000000,43.000000


In [44]:
# Persist the lag-augmented frame. `index` is a boolean option, so pass False explicitly
# (rather than 0) to keep the row index out of the file.
data_with_lags.to_csv("data_with_lags.csv", index=False, encoding='cp949')

In [45]:
# Pick the day used to inspect the model: April 18th had the most failures,
# so the model output is checked on that day's rows (midnight inclusive, next midnight exclusive).
measured_at = data_with_lags['측정시간']
on_target_day = (measured_at >= '2024-04-18 00:00:00') & (measured_at < '2024-04-19 00:00:00')
filtered_data = data_with_lags[on_target_day]
print(filtered_data['장애여부'].value_counts())
filtered_data

장애여부
0    2323
1      34
Name: count, dtype: int64


Unnamed: 0,측정시간,상향파워2,상향SNR,하향파워,하향SNR,셀번호,cum_rate,장애여부,일,시,...,하향파워_lag1,하향파워_lag2,하향파워_lag3,하향파워_lag4,하향파워_lag5,하향SNR_lag1,하향SNR_lag2,하향SNR_lag3,하향SNR_lag4,하향SNR_lag5
58108,2024-04-18 00:15:00,29.00,32.0,11.00,39.00,YSJSC3010,0.414226,0,18,0,...,11.000000,11.0,2.000000,0.000000,-4.000000,39.000000,39.0,41.000000,40.000000,37.500000
58109,2024-04-18 00:15:00,43.00,33.0,1.00,32.00,YSJBG6-1F,0.899796,0,18,0,...,1.666667,2.0,1.666667,2.000000,1.666667,35.333333,35.5,36.666667,37.500000,37.666667
58110,2024-04-18 00:15:00,34.75,27.0,1.75,38.25,YSHS0046,0.590677,0,18,0,...,-5.000000,0.5,2.500000,0.333333,0.333333,37.000000,38.0,38.500000,38.333333,38.333333
58111,2024-04-18 00:15:00,29.00,22.0,6.00,35.00,YSJB5-25,0.270073,0,18,0,...,7.000000,7.0,6.000000,6.000000,6.000000,35.000000,36.0,35.000000,35.000000,35.000000
58112,2024-04-18 00:15:00,44.00,32.0,5.00,36.00,YSHS0038,0.223022,0,18,0,...,2.000000,2.0,6.000000,6.000000,12.333333,38.000000,38.0,37.000000,37.000000,39.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60460,2024-04-18 23:05:00,52.00,32.0,2.00,39.00,YSMYRFOG0038,0.179310,0,18,23,...,2.000000,2.0,2.000000,2.000000,3.000000,38.000000,38.0,38.000000,38.000000,37.000000
60461,2024-04-18 23:05:00,54.00,29.0,3.00,39.00,YSHS0070,0.790646,0,18,23,...,-2.000000,-3.0,-2.000000,7.000000,-5.000000,38.000000,38.0,38.000000,38.000000,38.000000
60462,2024-04-18 23:05:00,40.00,36.0,1.00,38.00,YSJSC4007,0.379759,0,18,23,...,1.000000,4.0,-4.000000,-5.000000,-3.000000,38.000000,40.0,37.000000,37.000000,36.000000
60463,2024-04-18 23:05:00,38.00,30.0,-1.00,36.00,YSWS0265,0.310127,0,18,23,...,-1.000000,-6.0,-1.000000,-6.000000,-7.000000,36.000000,39.0,35.000000,39.000000,33.000000


In [46]:
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc, confusion_matrix
from tqdm.auto import tqdm

# Target and feature matrix: the cell id is not a model feature, and the timestamp is
# already represented by the day/hour/minute/weekday columns, so both are left out.
y = filtered_data['장애여부']
X = filtered_data.drop(columns=['장애여부', '측정시간', '셀번호'])

# Load the previously trained model from disk.
# NOTE(review): the constructor hyperparameters are presumably superseded by the
# loaded model file — confirm.
model = XGBClassifier(learning_rate=0.1, n_estimators=1000)
model.load_model("xgb_model.json")

# Predicted probability of the failure class (second column of predict_proba).
y_proba = model.predict_proba(X)[:, 1]

# Decision threshold applied to the failure probability.
TH = 0.033

# Rebuild a readable result table: features, the timestamp and cell id that were removed
# for prediction, the true label, the thresholded prediction, and the raw probability.
all_data = X.assign(**{
    '측정시간': filtered_data['측정시간'],
    '셀번호': filtered_data['셀번호'],
    '실제장애여부': y,
    '장애여부': (y_proba >= TH).astype(int),
    '장애확률': y_proba,
})

# Show the 50 rows with the highest predicted failure probability.
all_data.sort_values('장애확률', ascending=False).head(50)

Unnamed: 0,상향파워2,상향SNR,하향파워,하향SNR,cum_rate,일,시,분,요일,상향파워2_lag1,...,하향SNR_lag1,하향SNR_lag2,하향SNR_lag3,하향SNR_lag4,하향SNR_lag5,측정시간,셀번호,실제장애여부,장애여부,장애확률
58439,44.833333,35.666667,0.5,40.333333,0.037037,18,2,15,3,40.0,...,38.0,41.0,40.0,41.0,41.0,2024-04-18 02:15:00,YSHS0092,1,1,0.639631
58433,51.0,36.0,3.0,35.0,0.010309,18,2,15,3,51.0,...,35.0,35.0,35.0,35.0,35.0,2024-04-18 02:15:00,YSHS0036,1,1,0.565448
58465,34.5,34.0,6.5,38.5,0.072727,18,2,25,3,37.0,...,38.0,40.0,38.0,38.0,39.0,2024-04-18 02:25:00,YSHS0053,1,1,0.525027
58447,42.0,36.0,3.0,39.0,0.061927,18,2,15,3,43.0,...,40.0,40.0,40.0,39.0,40.0,2024-04-18 02:15:00,YSHS0055,1,1,0.48881
58510,39.5,29.833333,-8.0,29.0,0.637097,18,2,45,3,42.0,...,40.0,39.0,39.0,38.0,39.0,2024-04-18 02:45:00,YSHS0079,1,1,0.457402
58673,38.0,36.0,-3.0,35.0,0.0,18,3,45,3,38.0,...,34.0,35.0,34.0,35.0,35.0,2024-04-18 03:45:00,YSHS0024,1,1,0.447215
58441,37.25,33.75,11.75,39.5,0.22315,18,2,15,3,51.0,...,38.0,36.0,37.0,36.0,38.0,2024-04-18 02:15:00,YSHS0038,1,1,0.439895
58474,35.5,35.0,5.5,39.5,0.312312,18,2,25,3,34.0,...,40.0,40.0,39.5,38.0,38.0,2024-04-18 02:25:00,YSHS0083,1,1,0.409625
58457,38.0,34.0,-4.0,34.0,0.0,18,2,25,3,39.0,...,35.0,35.0,34.0,35.0,35.0,2024-04-18 02:25:00,YSHS0024,1,1,0.395315
58411,50.0,35.0,1.0,37.0,0.352941,18,2,5,3,48.5,...,40.5,38.0,35.0,37.0,39.0,2024-04-18 02:05:00,YSHS0014,1,1,0.287467
