In [82]:
import os
import json
import datetime
import logging
import warnings

import pandas as pd
import numpy as np
from tqdm import tqdm
import cudf
from numba import cuda
from collections import Counter
import cupy as cp


import config as CONFIG
import utils

warnings.filterwarnings("ignore")
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

# load data

In [4]:
mcelog = pd.read_csv(os.path.join(CONFIG.PATH_RAW, "ali/mcelog.csv"))
failure = pd.read_csv(os.path.join(CONFIG.PATH_RAW, "ali/trouble_tickets.csv"))
inventory = pd.read_csv(os.path.join(CONFIG.PATH_RAW, "ali/inventory.csv"))

In [70]:
mcelog.head(5)

NameError: name 'mcelog' is not defined

In [6]:
def format_date(date):
    y = int(date.split(" ")[0].split("-")[0])
    m = int(date.split(" ")[0].split("-")[1])
    d = date.split(" ")[0].split("-")[2]

    yy = "2020"
    if m <= 3:
        yy = "2019"
    m = (m + 9) % 12
    m = 12 if m == 0 else m
    mm = "0" + str(m) if m <= 9 else str(m)

    return f"{yy}-{mm}-{d} {date.split(' ')[1]}"

In [7]:
mcelog["error_time"] = mcelog["error_time"].apply(format_date)
failure["failed_time"] = failure["failed_time"].apply(format_date)

In [11]:
mcelog.head(5)

Unnamed: 0,sid,memoryid,rankid,bankid,row,col,error_type,error_time
0,Server_1,10,1,13,3334,808,2,2020-03-14 06:40:59
1,Server_1,10,1,13,3334,808,2,2020-03-19 04:07:39
2,Server_1,10,1,13,3334,808,2,2020-03-26 00:32:58
3,Server_1,10,1,13,3334,808,2,2020-04-06 18:24:57
4,Server_1,10,1,13,3334,808,2,2020-04-07 17:54:17


In [8]:
failure.head(5)

Unnamed: 0,sid,failure_type,failed_time
0,Server_10003,2,2020-04-17 11:11:07
1,Server_10034,3,2020-05-11 07:35:07
2,Server_10074,3,2020-05-11 07:37:47
3,Server_10108,2,2020-04-13 03:51:43
4,Server_10152,2,2020-03-14 10:23:07


# process

## split to deal, reduce memory

In [9]:
sid_list = mcelog["sid"].drop_duplicates().to_list()
print(len(sid_list))
chunk_size = len(sid_list) // 8 
chunk_list = [sid_list[i * chunk_size : (i + 1) * chunk_size] for i in range(9)]

chunk_one = chunk_list[0]

mcelog_one = mcelog[mcelog["sid"].isin(chunk_one)]

30502


In [10]:
mcelog_one.head()

Unnamed: 0,sid,memoryid,rankid,bankid,row,col,error_type,error_time
0,Server_1,10,1,13,3334,808,2,2020-03-14 06:40:59
1,Server_1,10,1,13,3334,808,2,2020-03-19 04:07:39
2,Server_1,10,1,13,3334,808,2,2020-03-26 00:32:58
3,Server_1,10,1,13,3334,808,2,2020-04-06 18:24:57
4,Server_1,10,1,13,3334,808,2,2020-04-07 17:54:17


In [11]:
len(mcelog_one)

2001085

In [18]:
mcelog_one.to_csv(os.path.join(CONFIG.PATH_PROCESSED, "mcelog_one.csv"))

# read one

In [12]:
mcelog_one = cudf.read_csv(os.path.join(CONFIG.PATH_PROCESSED, "mcelog_one.csv"))

In [16]:
inventory = cudf.read_csv(os.path.join(CONFIG.PATH_RAW, "ali/inventory.csv"))

In [25]:
failure = cudf.read_csv(os.path.join(CONFIG.PATH_RAW, "ali/trouble_tickets.csv"))

## merge

In [14]:
mcelog_one = mcelog_one.sort_values(["sid", "error_time"])

In [22]:
features = mcelog_one

features = cudf.merge(features, inventory, on=["sid"], how="left")

In [34]:
features

Unnamed: 0.1,Unnamed: 0,sid,memoryid,rankid,bankid,row,col,error_type,error_time,server_manufacturer,DRAM_model,DIMM_number
0,519626,Server_1003,22,1,14,91060,424,2,2019-11-15 01:04:16,M2,C1,12
1,519627,Server_1003,22,1,15,51314,464,2,2019-11-15 17:31:27,M2,C1,12
2,519628,Server_1003,22,1,14,91060,424,2,2019-11-16 00:50:31,M2,C1,12
3,519629,Server_1003,22,1,14,91060,304,2,2019-11-16 00:50:31,M2,C1,12
4,519630,Server_1003,22,1,14,91060,288,2,2019-11-16 00:50:31,M2,C1,12
...,...,...,...,...,...,...,...,...,...,...,...,...
2001080,519261,Server_998,0,1,9,61045,8,2,2020-02-08 15:30:21,M2,C1,12
2001081,519262,Server_999,20,1,4,65426,1008,1,2020-03-20 21:38:54,M2,C1,12
2001082,519263,Server_999,20,1,4,65426,1008,1,2020-04-10 13:51:22,M2,C1,12
2001083,519264,Server_999,20,1,4,65426,1008,1,2020-04-12 01:03:30,M2,C1,12


In [26]:
failure_ue = failure[failure["failure_type"]==1]

In [27]:
failure_ue

Unnamed: 0,sid,failure_type,failed_time
8,Server_10250,1,0001-01-14 18:16:39
9,Server_10286,1,0001-01-10 05:13:34
17,Server_10508,1,0001-06-24 05:03:00
20,Server_10519,1,0001-06-05 23:35:14
21,Server_10521,1,0001-08-30 22:47:44
...,...,...,...
2994,Server_957,1,0001-05-13 07:38:59
2996,Server_9597,1,0001-07-08 10:13:56
2997,Server_9620,1,0001-04-13 09:37:41
3007,Server_9825,1,0001-02-13 16:58:03


In [28]:
features_tag = cudf.merge(features, failure_ue, on=["sid"], how="left")
features_tag["failed_time"].fillna("2023-01-01 00:00:00", inplace=True)

In [30]:
features_tag.head(5)

Unnamed: 0.1,Unnamed: 0,sid,memoryid,rankid,bankid,row,col,error_type,error_time,server_manufacturer,DRAM_model,DIMM_number,failure_type,failed_time
0,530325,Server_1036,4,1,12,37362,984,1,2020-02-27 13:21:47,M2,A1,12,,2023-01-01 00:00:00
1,530326,Server_1036,4,1,12,37362,984,1,2020-02-27 13:24:01,M2,A1,12,,2023-01-01 00:00:00
2,530327,Server_1036,4,1,12,37362,984,1,2020-02-27 13:26:47,M2,A1,12,,2023-01-01 00:00:00
3,530328,Server_1036,4,1,12,37362,984,1,2020-02-27 13:31:47,M2,A1,12,,2023-01-01 00:00:00
4,530329,Server_1036,4,1,12,37362,984,1,2020-02-27 13:36:47,M2,A1,12,,2023-01-01 00:00:00


In [32]:
features_tag["failure_gap"] = (cudf.to_datetime(features_tag["failed_time"]) - cudf.to_datetime(features_tag["error_time"])).dt.days

In [63]:
features_tag.head()

Unnamed: 0.1,Unnamed: 0,sid,memoryid,rankid,bankid,row,col,error_type,error_time,server_manufacturer,DRAM_model,DIMM_number,failure_type,failed_time,failure_gap
0,530325,Server_1036,4,1,12,37362,984,1,2020-02-27 13:21:47,M2,A1,12,,2023-01-01 00:00:00,1038
1,530326,Server_1036,4,1,12,37362,984,1,2020-02-27 13:24:01,M2,A1,12,,2023-01-01 00:00:00,1038
2,530327,Server_1036,4,1,12,37362,984,1,2020-02-27 13:26:47,M2,A1,12,,2023-01-01 00:00:00,1038
3,530328,Server_1036,4,1,12,37362,984,1,2020-02-27 13:31:47,M2,A1,12,,2023-01-01 00:00:00,1038
4,530329,Server_1036,4,1,12,37362,984,1,2020-02-27 13:36:47,M2,A1,12,,2023-01-01 00:00:00,1038


In [37]:
observation_window = 5 # days, 120h
prediction_window = 30 # days

row_threshold = 5
row_oberservation_window = 168 

col_threshold = 5
col_oberservation_window = 168 

In [156]:

features_tag_df = features_tag.to_pandas()

column = "row"

def most_common_value(arr):
    counter = Counter(arr)
    count_above_threshold = sum(1 for value_count in counter.values() if value_count > threshold)


def update_new_column(column_list, new_column_list):
    most_common_value(column_list)
    for i in range(1, len(column_list)):
        new_column_list[i] = most_common_value(column_list[:i+1])
    

for host in tqdm(hosts):
    column_list = features_tag_df[features_tag_df["sid"]==host][column].to_list()
    new_column_list = [0] * len(column_list)
    update_new_column(column_list, new_column_list)



 24%|████████████████████▍                                                                | 918/3812 [17:01<53:38,  1.11s/it]


KeyboardInterrupt: 

In [72]:
import pandas as pd

# 创建示例 DataFrame
data = {'A': [1, 2, 1, 3, 2, 1, 1, 3, 2, 3, 3, 3, 3, 3]}
df = pd.DataFrame(data)

# 自定义函数，用于计算出现次数最多的值
def most_common_value(arr):
    counts = arr.value_counts()
    if counts.empty:
        return None
    # return counts.idxmax()
    return (counts>2).sum()

# 使用 expanding() 和 apply() 计算每行之前的数据中出现次数最多的值
df['B'] = df['A'].expanding().apply(most_common_value)

print(df)


    A    B
0   1  0.0
1   2  0.0
2   1  0.0
3   3  0.0
4   2  0.0
5   1  1.0
6   1  1.0
7   3  1.0
8   2  2.0
9   3  3.0
10  3  3.0
11  3  3.0
12  3  3.0
13  3  3.0


In [159]:

column_list = [1,2,3,3,2,1,4,4,3,4,2,2]

a = [0]*len(column_list)
for i in range(0, len(column_list)):
    counter = Counter(column_list[0:i+1])
    count_above_threshold = sum(1 for value_count in counter.values() if value_count >= 3)
    a[i] = count_above_threshold
    
print(a)

[0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 3]


In [161]:
mcelog_1 = pd.read_csv(os.path.join(CONFIG.PATH_PROCESSED, "mcelog_1.csv"))


In [163]:
mcelog_1["sid"].value_counts()

Server_1841    152576
Server_1040     59183
Server_1665     42184
Server_1224     26268
Server_1079     21492
                ...  
Server_1078         1
Server_1539         1
Server_1799         1
Server_1800         1
Server_1537         1
Name: sid, Length: 953, dtype: int64

In [165]:
import multiprocessing
multiprocessing.cpu_count()

16

In [166]:

import pandas as pd

df1 = pd.DataFrame({"A": [1, 2], "B": [3, 4]})
df2 = pd.DataFrame({"A": [1, 2], "B": [3, 4]})
df3 = pd.DataFrame({"A": [1, 2], "B": [3, 4]})

# 假设你有一个包含DataFrame的列表
dataframe_list = [df1, df2, df3]  # 将你的DataFrame放入列表中

# 使用pd.concat()按列合并所有DataFrame
merged_dataframe = pd.concat(dataframe_list, axis=0, ignore_index=True)

# 打印合并后的DataFrame
print(merged_dataframe)


   A  B
0  1  3
1  2  4
2  1  3
3  2  4
4  1  3
5  2  4


In [13]:
import cupy as cp
import numpy as np
import time

t1 = time.time()
# 创建n个随机数据列表
n = 100
data_lists = [np.random.randint(0, 10, size=1500) for _ in range(n)]

# 将数据列表移动到GPU上
data_lists_gpu = [cp.asarray(data) for data in data_lists]

# 初始化字典来存储每个列表中每个值的频率
value_frequencies = [{} for _ in range(n)]

# 计算每个列表中每个值的频率
for i in range(len(data_lists_gpu)):
    unique_values_gpu = cp.unique(data_lists_gpu[0:i+1])
#     unique_values_cpu = cp.asnumpy(unique_values_gpu)
#     frequencies_cpu = cp.asnumpy(frequencies_gpu)
    
#     for value, freq in zip(unique_values_cpu, frequencies_cpu):
#         value_frequencies[i][value] = freq

# # 打印每个列表中每个值的频率
# for i, freq_dict in enumerate(value_frequencies):
#     print(f"List {i} Frequencies:")
#     for value, freq in freq_dict.items():
#         print(f"Value: {value}, Frequency: {freq}")

t2 = time.time()

print(t2-t1)

0.09595084190368652


In [14]:
t1 = time.time()
for data_list in data_lists:
    a = len(set(data_list))
    
t2 = time.time()

print(t2-t1)

0.009045124053955078


In [26]:
import cupy as cp
import numpy as np
import time

t1 = time.time()
# 创建n个随机数据列表
n = 10000
data_lists = np.random.randint(0, 10, size=n)

data_lists_gpu = cp.asarray(data_lists)

for i in range(len(data_lists_gpu)):
    unique = cp.unique(data_lists_gpu[0:i+1])
    # print(unique)

t2 = time.time()

print(t2-t1)

4.149005889892578


In [29]:
import cupy as cp
import numpy as np
import time

t1 = time.time()
# 创建n个随机数据列表
n = 10000
data_lists = np.random.randint(0, 10, size=n)

for i in range(len(data_lists)):
    unique = len(data_lists[0:i+1])
    # print(unique)

t2 = time.time()

print(t2-t1)

0.0036466121673583984


In [31]:
[set()  for _ in range(5)]

[set(), set(), set(), set(), set()]

In [33]:
[[0] * 6] * 2

[[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0]]

In [38]:
a = set()
a.add(1)
a.add(2)
a.add(1)
print(len(a))

2


In [148]:
import cudf

# 创建一个示例的cuDF DataFrame，其中包含一个 datetime 列
data = {'日期时间': ['2023-09-03 08:00:00', '2023-09-03 10:30:00', '2023-09-03 12:45:00'],
        '数值列': [1, 2, 3]}
df = cudf.DataFrame(data)

# 将 '日期时间' 列转换为 datetime 类型
df['日期时间'] = cudf.to_datetime(df['日期时间'])

# 将 '日期时间' 列设置为索引
df = df.set_index('日期时间')

df["count"] = df.rolling("3h").apply("count")

# 打印结果
print(df)


                     数值列  count
日期时间                           
2023-09-03 08:00:00    1      1
2023-09-03 10:30:00    2      2
2023-09-03 12:45:00    3      2


In [156]:
import cudf

# 创建一个示例的cuDF DataFrame，其中包含一个 datetime 列
data = cudf.read_csv("hostone.csv")
df = data

# 将 '日期时间' 列转换为 datetime 类型
df['日期时间'] = cudf.to_datetime(df['error_time'])
df = df.fillna(-1)
# 将 '日期时间' 列设置为索引
df = df.set_index('日期时间')

print(df)

# 定义要删除的列名包含的字符串
string_to_remove = "Unname"

# 使用列表解析来选择要保留的列，并使用 drop() 方法删除其他列
columns_to_keep = [col_name for col_name in df.columns if string_to_remove not in col_name]
df = df[columns_to_keep]

df.to_csv("hostone1.csv")

# df = df[["sid"]]

df["count"] = df["sid"].rolling("3h").apply("count")

# 打印结果
print(df)


                     Unnamed: 0.1  Unnamed: 0          sid  memoryid  rankid  \
日期时间                                                                           
2019-12-19 04:23:00         24092       24092  Server_1000         8       0   

                     bankid    row  col  error_type           error_time  ...  \
日期时间                                                                      ...   
2019-12-19 04:23:00       2  29037  320           1  2019-12-19 04:23:00  ...   

                    e_row_16  e_row_32  e_col_2  e_col_4  e_col_8  e_col_16  \
日期时间                                                                          
2019-12-19 04:23:00        0         0        0        0        0         0   

                     e_col_32  e_col_64  row_cnt  col_cnt  
日期时间                                                       
2019-12-19 04:23:00         0         0        0        0  

[1 rows x 28 columns]
                             sid  memoryid  rankid  bankid    row  col  \


In [165]:
import cudf

# 创建一个示例的cuDF DataFrame，其中包含一个 datetime 列
df = pd.read_csv("df_cudf.csv")
df_cudf = cudf.DataFrame(df)
# 打印结果
print(len(df_cudf))


631187


In [170]:
df_cudf[["sid","row","col", "count", "error_time", "ce_storm_10", "ce_storm_20", "ce_storm_30"]].head(2000).to_csv("test.csv")

In [101]:
def add_ce_storm_features(df_cudf=None):
    df_cudf["error_time"] = cudf.to_datetime(df_cudf["error_time"])
    df_cudf = df_cudf.set_index("error_time")
    # logging.info(df_cudf.head(5))
    df_cudf = df_cudf.fillna(-1)
    df_cudf["count"] = df_cudf.rolling("3h").apply("count")
    for ce_cnt in [10, 20, 24, 30, 40, 50]:
        df_cudf[f"ce_storm_{ce_cnt}"] = df_cudf["count"] >= ce_cnt
    df_cudf = df_cudf.reset_index()
    return df_cudf

data = {'error_time': ['2023-09-03 08:00:00'],
        '数值列': [1]}
df = cudf.DataFrame(data)
# df_cudf["日期时间"] = cudf.to_datetime(df_cudf["日期时间"])
# df_cudf = df_cudf.set_index("日期时间")
# # logging.info(df_cudf.head(5))
# df_cudf = df_cudf.fillna(-1)
# df_cudf["count"] = df_cudf.rolling("3h").apply("count")
# # for ce_cnt in [10, 20, 24, 30, 40, 50]:
# #     df_cudf[f"ce_storm_{ce_cnt}"] = df_cudf["count"] >= ce_cnt
# # # df_cudf = df_cudf.drop(columns=["count"])
# print(df_cudf)
# df_cudf = df_cudf.reset_index()

df_cudf = add_ce_storm_features(df)

print(df_cudf)

           error_time  数值列  count  ce_storm_10  ce_storm_20  ce_storm_24  \
0 2023-09-03 08:00:00    1      1        False        False        False   

   ce_storm_30  ce_storm_40  ce_storm_50  
0        False        False        False  


In [135]:
import cudf

# 创建一个示例的cuDF DataFrame，包含两个时间字段
data = {'error_time': ['2023-09-03 08:00:00', '2023-09-03 10:30:00', '2023-09-03 12:45:00'],
        'new_time': ['2023-09-03 09:15:00', '2023-09-03 09:00:00', '2023-09-03 14:30:00']}
df = cudf.DataFrame(data)

# 将时间字段转换为datetime类型
df['error_time'] = cudf.to_datetime(df['error_time'])
df['new_time'] = cudf.to_datetime(df['new_time'])

# 计算两个时间字段的差值
df['diff'] = df['new_time'] - df['error_time']
df["diff_hour"] = (df['diff'].dt.seconds) / 3600
df["diff_day"] = df['diff'].dt.days
df["diff_day"] = df["diff_day"].apply(lambda x: 1 if x>=0 else -1)
# df["diff_day"] = df["diff_day"].astype(int)
df["diff_hour"] = df["diff_day"] * df["diff_hour"]
df["ce_count"] = len(df[(df['diff_hour'] >= 1) & (df['diff_hour'] <= 2)])
# 提取差值中的小时部分

# 打印结果
print(df)


           error_time            new_time              diff  diff_hour  \
0 2023-09-03 08:00:00 2023-09-03 09:15:00   0 days 01:15:00       1.25   
1 2023-09-03 10:30:00 2023-09-03 09:00:00 -1 days +22:30:00     -22.50   
2 2023-09-03 12:45:00 2023-09-03 14:30:00   0 days 01:45:00       1.75   

   diff_day  ce_count  
0         1         2  
1        -1         2  
2         1         2  


In [130]:
import cudf

# 创建一个示例的 cuDF DataFrame，包含两个日期时间列
data = {'开始时间': ['2023-09-03 08:00:00', '2023-09-03 10:30:00'],
        '结束时间': ['2023-09-03 09:15:00', '2023-09-03 10:00:00']}
df = cudf.DataFrame(data)

# 将时间字段转换为 datetime 类型
df['开始时间'] = cudf.to_datetime(df['开始时间'])
df['结束时间'] = cudf.to_datetime(df['结束时间'])

# 计算两个时间字段的差值，并提取时间差中的总秒数
df['时间差'] = df['结束时间'] - df['开始时间']

df["aas"] = df['时间差'].dt.seconds

# 打印结果
print(df)


                 开始时间                结束时间               时间差    aas
0 2023-09-03 08:00:00 2023-09-03 09:15:00   0 days 01:15:00   4500
1 2023-09-03 10:30:00 2023-09-03 10:00:00 -1 days +23:30:00  84600


In [134]:
import cudf

# 创建一个示例的 cuDF DataFrame，包含布尔值列
data = {'布尔列': [True, False, True, False, True]}
df = cudf.DataFrame(data)

# 统计 '布尔列' 中 True 的个数
true_count = df['布尔列'].sum()

# 打印结果
print("True 的个数:", true_count)


True 的个数: 3


In [7]:
dataset_idx = [1,2,3,4]

In [9]:
dataset_idx[10:]

[]

In [35]:
import pandas as pd

train_set = "/home/rapids/notebooks/workspace/dram/data/processed/train_set.csv"

df = pd.read_csv(train_set)

In [36]:
len(df)

3812058

In [37]:
aaa = df.isna().any()
aaa.to_csv("aaa.csv")

In [43]:
# df.head()

aaa = df.isna().any()
aaa.to_csv("aaa.csv")

bbb = df.fillna(-1)

In [44]:
aaa = bbb.isna().any()
aaa.to_csv("aaa.csv")

In [46]:
df.iloc[:, 14:].head()

Unnamed: 0,DIMM_number,e_row_2,e_row_4,e_row_8,e_row_16,e_row_32,e_col_2,e_col_4,e_col_8,e_col_16,...,server_manufacturer_M1,DRAM_model_A1,DRAM_model_A2,DRAM_model_B1,DRAM_model_B3,DRAM_model_C2,DRAM_model_B2,DRAM_model_C1,server_manufacturer_M2,server_manufacturer_M3
0,12,0,0,0,0,0,0,0,0,0,...,1.0,0.0,1.0,0.0,0.0,0.0,,,,
1,12,0,0,0,0,0,0,0,0,0,...,1.0,0.0,1.0,0.0,0.0,0.0,,,,
2,12,0,0,0,0,0,0,0,0,0,...,1.0,0.0,1.0,0.0,0.0,0.0,,,,
3,12,0,0,0,0,0,0,0,0,0,...,1.0,0.0,1.0,0.0,0.0,0.0,,,,
4,12,0,0,0,0,0,0,0,0,0,...,1.0,0.0,1.0,0.0,0.0,0.0,,,,


In [40]:
asda = df.fillna(-1)

In [41]:
asda.head()

Unnamed: 0,Unnamed: 0.1,error_time,sid,memoryid,rankid,bankid,row,col,error_type,failure_type,...,server_manufacturer_M1,DRAM_model_A1,DRAM_model_A2,DRAM_model_B1,DRAM_model_B3,DRAM_model_C2,DRAM_model_B2,DRAM_model_C1,server_manufacturer_M2,server_manufacturer_M3
0,7,2020-02-15 17:27:29,Server_11438,22,0,7,83915,440,2,-1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0
1,29,2020-03-08 15:32:33,Server_11438,22,0,7,83915,440,2,-1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0
2,34,2020-03-13 15:06:25,Server_11438,22,0,7,83915,440,2,-1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0
3,38,2020-03-17 14:45:32,Server_11438,22,0,7,83915,440,2,-1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0
4,47,2020-03-26 13:58:31,Server_11438,22,0,7,83915,440,2,-1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0


In [5]:

import xgboost as xgb
import pandas as pd

In [10]:

booster = xgb.Booster()
model = booster.load_model("/home/rapids/notebooks/workspace/dram/data/model/version1.model")


In [6]:
test_set = "/home/rapids/notebooks/workspace/dram/data/processed/train_set.csv"

test_df = pd.read_csv(test_set)

In [8]:
test_df = test_df.fillna(-1)
test_y = test_df["failed"]
test_x = test_df.iloc[:, 14:]

In [9]:

from sklearn.metrics import accuracy_score, precision_score, recall_score

In [16]:
d_test_x = xgb.DMatrix(test_x)
predictions = booster.predict(d_test_x)


In [17]:
predictions

array([1.5209501e-03, 1.3324211e-03, 1.3324211e-03, ..., 4.0103674e-05,
       4.0103674e-05, 4.0103674e-05], dtype=float32)

In [19]:
precision = precision_score(test_y, predictions)
recall = recall_score(test_y, predictions)

print("precision {}".format(precision))
print("recall {}".format(recall))

ValueError: Classification metrics can't handle a mix of binary and continuous targets

In [20]:


train_dataset = "/home/rapids/notebooks/workspace/dram/data/processed/train_set.csv"
test_dataset = "/home/rapids/notebooks/workspace/dram/data/processed/test_set.csv"

train_set= pd.read_csv(train_dataset)
test_set= pd.read_csv(test_dataset)

In [48]:
train_set = train_set.fillna(-1)
train_x = train_set.iloc[:, 14:64]
train_y = train_set["failed"]

test_set = test_set.fillna(-1)
test_x = test_set.iloc[:, 14:64]
test_y = test_set["failed"]

In [49]:
train_x.columns

Index(['DIMM_number', 'e_row_2', 'e_row_4', 'e_row_8', 'e_row_16', 'e_row_32',
       'e_col_2', 'e_col_4', 'e_col_8', 'e_col_16', 'e_col_32', 'e_col_64',
       'row_cnt', 'col_cnt', 'count', 'ce_storm_10', 'ce_storm_20',
       'ce_storm_24', 'ce_storm_30', 'ce_storm_40', 'ce_storm_50',
       'e_bank_2_2', 'e_bank_2_4', 'e_bank_2_8', 'e_bank_2_16', 'e_bank_2_32',
       'e_bank_2_64', 'e_bank_4_2', 'e_bank_4_4', 'e_bank_4_8', 'e_bank_4_16',
       'e_bank_4_32', 'e_bank_4_64', 'e_bank_8_2', 'e_bank_8_4', 'e_bank_8_8',
       'e_bank_8_16', 'e_bank_8_32', 'e_bank_8_64', 'e_bank_16_2',
       'e_bank_16_4', 'e_bank_16_8', 'e_bank_16_16', 'e_bank_16_32',
       'e_bank_16_64', 'e_bank_32_2', 'e_bank_32_4', 'e_bank_32_8',
       'e_bank_32_16', 'e_bank_32_32'],
      dtype='object')

In [47]:
test_x.columns

Index(['DIMM_number', 'e_row_2', 'e_row_4', 'e_row_8', 'e_row_16', 'e_row_32',
       'e_col_2', 'e_col_4', 'e_col_8', 'e_col_16', 'e_col_32', 'e_col_64',
       'row_cnt', 'col_cnt', 'count', 'ce_storm_10', 'ce_storm_20',
       'ce_storm_24', 'ce_storm_30', 'ce_storm_40', 'ce_storm_50',
       'e_bank_2_2', 'e_bank_2_4', 'e_bank_2_8', 'e_bank_2_16', 'e_bank_2_32',
       'e_bank_2_64', 'e_bank_4_2', 'e_bank_4_4', 'e_bank_4_8', 'e_bank_4_16',
       'e_bank_4_32', 'e_bank_4_64', 'e_bank_8_2', 'e_bank_8_4', 'e_bank_8_8',
       'e_bank_8_16', 'e_bank_8_32', 'e_bank_8_64', 'e_bank_16_2',
       'e_bank_16_4', 'e_bank_16_8', 'e_bank_16_16', 'e_bank_16_32',
       'e_bank_16_64', 'e_bank_32_2', 'e_bank_32_4', 'e_bank_32_8',
       'e_bank_32_16', 'e_bank_32_32', 'server_manufacturer_M3'],
      dtype='object')

In [50]:

model = xgb.XGBClassifier()
model.fit(train_x, train_y)
preds = model.predict(test_x)

In [55]:
len(preds)

450606

In [56]:
preds

array([0, 0, 0, ..., 0, 0, 0])

In [52]:
precision = precision_score(test_y, preds)
recall = recall_score(test_y, preds)

print("precision {}".format(precision))
print("recall {}".format(recall))

precision 0.11211211211211211
recall 0.00788343774195819


In [57]:

test_validate = test_set.iloc[:, :14]
test_validate["pred"] = preds

In [58]:
test_validate

Unnamed: 0.2,Unnamed: 0.1,error_time,sid,memoryid,rankid,bankid,row,col,error_type,failure_type,failed_time,time_diff,failed,Unnamed: 0,pred
0,7,2019-12-31 03:12:14,Server_29304,6,1,7,71353,552,2,-1.0,2023-01-01 00:00:00,1096 days 20:47:46,0,7,0
1,29,2020-02-13 16:53:51,Server_29304,6,1,7,71353,552,2,-1.0,2023-01-01 00:00:00,1052 days 07:06:09,0,29,0
2,34,2020-02-19 15:31:24,Server_29304,6,1,7,71353,552,2,-1.0,2023-01-01 00:00:00,1046 days 08:28:36,0,34,0
3,38,2020-03-06 11:51:33,Server_29304,6,1,7,71353,552,2,-1.0,2023-01-01 00:00:00,1030 days 12:08:27,0,38,0
4,47,2020-03-29 06:35:29,Server_29304,6,1,7,71353,552,2,-1.0,2023-01-01 00:00:00,1007 days 17:24:31,0,47,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
450601,3723584,2020-05-01 19:24:06,Server_30016,18,0,5,28548,1016,2,-1.0,2023-01-01 00:00:00,974 days 04:35:54,0,3723584,0
450602,3723591,2020-05-01 19:24:11,Server_30016,18,0,5,28555,1016,2,-1.0,2023-01-01 00:00:00,974 days 04:35:49,0,3723591,0
450603,3723604,2020-05-01 19:24:16,Server_30016,18,0,5,28577,1016,2,-1.0,2023-01-01 00:00:00,974 days 04:35:44,0,3723604,0
450604,3723629,2020-05-01 19:24:37,Server_30016,18,0,5,28647,1016,2,-1.0,2023-01-01 00:00:00,974 days 04:35:23,0,3723629,0


In [61]:
fin_reals = []
fin_preds = []
for host in test_validate["sid"].drop_duplicates().to_list():
    df = test_validate[test_validate["sid"]==host]
    tmp = df[df["pred"] == 1]
    if (len(tmp)) > 0:
        fin_preds.append(1)
    else:
        fin_preds.append(0)
        
    if len(test_validate["failed_time"].drop_duplicates().to_list()) == 1:
        fin_reals.append(0)
    else:
        fin_reals.append(1)

In [62]:
precision = precision_score(fin_reals, fin_preds)
recall = recall_score(fin_reals, fin_preds)

print("precision {}".format(precision))
print("recall {}".format(recall))

precision 1.0
recall 0.05046728971962617


In [69]:
train_set = train_set.fillna(-1)
train_x = train_set[["count","e_row_4","e_row_8", "row_cnt"]]
train_y = train_set["failed"]

test_set = test_set.fillna(-1)
test_x = test_set[["count","e_row_4","e_row_8", "row_cnt"]]
test_y = test_set["failed"]

model = xgb.XGBClassifier()
model.fit(train_x, train_y)
preds = model.predict(test_x)

test_validate = test_set.iloc[:, :14]
test_validate["pred"] = preds

fin_reals = []
fin_preds = []
for host in test_validate["sid"].drop_duplicates().to_list():
    df = test_validate[test_validate["sid"]==host]
    tmp = df[df["pred"] == 1]
    if (len(tmp)) > 0:
        fin_preds.append(1)
    else:
        fin_preds.append(0)
        
    if len(test_validate["failed_time"].drop_duplicates().to_list()) == 1:
        fin_reals.append(0)
    else:
        fin_reals.append(1)
        
precision = precision_score(fin_reals, fin_preds)
recall = recall_score(fin_reals, fin_preds)

print("precision {}".format(precision))
print("recall {}".format(recall))

precision 1.0
recall 0.019626168224299065


In [66]:
train_x = train_set.iloc[:, 14:64]
train_y = train_set["failed"]

numerical_features = pd.concat([train_x, train_y], axis=1)

correlations = numerical_features.corr()["failed"]
correlations.drop(["failed"], inplace=True)
correlations = correlations.apply(lambda x: abs(x))
correlations = correlations.sort_values(ascending=False)

correlations

count           0.185697
e_row_4         0.155286
e_row_8         0.144651
e_row_16        0.128100
row_cnt         0.125334
e_col_2         0.121336
e_row_2         0.116975
e_bank_2_2      0.113426
e_row_32        0.110588
e_col_4         0.109335
col_cnt         0.107293
e_bank_32_32    0.104681
e_bank_32_8     0.104462
e_bank_32_16    0.104381
e_bank_2_4      0.103207
e_bank_32_4     0.098115
e_bank_16_32    0.096141
e_bank_16_64    0.094541
e_bank_16_16    0.094174
e_bank_16_8     0.094003
e_bank_32_2     0.091889
e_col_64        0.089478
e_bank_8_64     0.088857
e_col_32        0.087711
e_bank_16_4     0.087659
e_bank_8_32     0.087479
e_bank_2_64     0.086454
e_bank_4_64     0.085898
e_bank_2_32     0.084159
e_bank_4_32     0.082753
e_bank_8_16     0.082515
e_col_16        0.082463
e_bank_8_8      0.081401
e_col_8         0.081197
e_bank_16_2     0.081177
e_bank_2_16     0.077503
e_bank_4_16     0.076027
e_bank_2_8      0.075824
e_bank_8_4      0.075091
e_bank_4_8      0.073690
