In [71]:
import pandas as pd
from sklearn.model_selection import train_test_split
import ast
import numpy as np

In [72]:
def count_vector(event_list, n):
    """Build a length-``n`` occurrence-count vector from a list of event IDs.

    Each ID is expected to be one leading character followed by a 1-based
    integer (e.g. 'E1', 'E2'); event 'Ek' is counted in slot ``k - 1``.
    IDs whose slot falls outside ``[0, n)`` are ignored.

    Args:
        event_list: Iterable of event ID strings.
        n: Length of the returned vector.

    Returns:
        A float NumPy array of length ``n`` holding the per-event counts.
    """
    counts = np.zeros(n)
    # Strip the leading character and shift to 0-based: 'E1' -> 0, 'E2' -> 1, ...
    for slot in (int(event_id[1:]) - 1 for event_id in event_list):
        if slot < 0 or slot >= n:
            continue  # event number outside the vector; skip it
        counts[slot] += 1
    return counts

In [76]:
def train_test_split_for_data(path: str, test_size: float, random_state: int = 42):
    """Load the labeled event data and split it into train/test sets per class.

    The CSV at ``path`` must contain a 'Final Label' column and a
    'New Event ID' column holding the string form of a Python list of event
    IDs (e.g. "['E5', 'E22']"). Every row becomes a count vector with one
    ``feature_k`` column per event number from 1 up to the largest event
    number found in the file. Any CSV column other than the bookkeeping ones
    dropped below is kept as an additional feature.

    The success rows and the failure rows are split separately with the same
    ``test_size`` so both classes keep the same train/test proportion, and
    the parts are then concatenated (success rows first, failure rows second).

    Args:
        path: Location of the CSV file.
        test_size: Passed to ``train_test_split`` for each class.
        random_state: Seed used for both per-class shuffles. The default, 42,
            is the value that was previously hard-coded.

    Returns:
        ``(x_train, x_test, y_train, y_test)``. The ``x_*`` values are
        DataFrames of features; the ``y_*`` values are float Series named
        'label' where 0 means 'Success' and 1 means any other label.

    Raises:
        KeyError: If 'Final Label' or 'New Event ID' is missing from the CSV.
        ValueError: If the file contains no events at all, or if
            ``train_test_split`` rejects a class (for instance because it
            has no rows).
    """
    data_df = pd.read_csv(path).reset_index(drop=True)

    # Binary target: 0 for 'Success', 1 for everything else. Kept as float so
    # the returned labels have the same dtype as before.
    is_success = data_df['Final Label'] == 'Success'
    labels = (~is_success).astype(float).rename('label')

    # 'New Event ID' stores each event list as text; literal_eval parses it
    # without executing arbitrary code.
    events = data_df['New Event ID'].apply(ast.literal_eval)

    # The largest event number across the whole file fixes the vector length.
    max_n = max(int(e[1:]) for sublist in events for e in sublist)
    feature_names = [f'feature_{i + 1}' for i in range(max_n)]
    count_vector_df = pd.DataFrame(
        [count_vector(event_list, max_n) for event_list in events],
        index=data_df.index,
        columns=feature_names,
    )

    # Remove the raw/bookkeeping columns. errors='ignore' keeps this working
    # when optional ones such as 'Unnamed: 0' (only present if the CSV was
    # written together with its index) or 'BlockId' are absent.
    kept_columns = data_df.drop(
        columns=['Final Label', 'Unnamed: 0', 'New Event ID', 'Events', 'BlockId', 'label'],
        errors='ignore',
    )
    features = pd.concat([kept_columns, count_vector_df], axis=1)

    # Split each class on its own: success (0) first, then failure (1).
    per_class_splits = []
    for class_value in (0, 1):
        in_class = labels == class_value
        per_class_splits.append(
            train_test_split(
                features.loc[in_class],
                labels.loc[in_class],
                test_size=test_size,
                shuffle=True,
                random_state=random_state,
            )
        )

    # zip(*...) regroups the pieces as (x_train parts), (x_test parts), ...
    x_train, x_test, y_train, y_test = (
        pd.concat(list(pieces), ignore_index=True)
        for pieces in zip(*per_class_splits)
    )
    return x_train, x_test, y_train, y_test

In [77]:
# Hold out 20% of each class for testing, then preview the training features.
x_train, x_test, y_train, y_test = train_test_split_for_data(
    path='../Data/HDFS_v1/Processed_data/processed_labeled_data.csv',
    test_size=0.2,
)
x_train.head()

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,...,feature_39,feature_40,feature_41,feature_42,feature_43,feature_44,feature_45,feature_46,feature_47,feature_48
0,3.0,1.0,3.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,3.0,1.0,3.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3.0,1.0,3.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3.0,1.0,3.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3.0,1.0,3.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
