In [1]:
import os
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from models.model_logrobust import logrobust_model


In [2]:
# Function to read and parse log files
def read_logs(log_dir):
    log = []
    for file in os.listdir(log_dir):
        with open(os.path.join(log_dir, file), 'r') as f:
            log.extend(f.readlines())
    return log

# Function to preprocess logs
def preprocess_logs(logs):
    processed_logs = []
    for log in logs:
        # Remove timestamps and other non-textual information
        log = re.sub(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d{3}', '', log)
        log = re.sub(r'\[.*?\]', '', log)
        log = re.sub(r'org.apache.hadoop.*? ', '', log)
        log = re.sub(r'\s+', ' ', log).strip()
        processed_logs.append(log)
    return processed_logs

# Function to read anomaly labels
def read_labels(label_file):
    labels_df = pd.read_csv(label_file)
    return labels_df

In [3]:
files = os.listdir('Hadoop')
# only consider directories
dirs = [f for f in files if os.path.isdir(os.path.join('Hadoop', f))]

logs = {}
for app in dirs:
    logs[app] = read_logs(os.path.join('Hadoop', app))

# Preprocess logs
processed_logs = {}
for app in logs:
    processed_logs[app] = preprocess_logs(logs[app])

# Read labels
labels_df = read_labels('./Hadoop/anamoly_label.csv')


In [4]:
# Create a dataframe with logs and labels matched with Application key from logs dict and the Application column in labels_df
data = []

for app in processed_logs:
    for log in processed_logs[app]:
        data.append([log, app])

data_df = pd.DataFrame(data, columns=['log', 'app'])

# Merge data_df with labels_df
data_df = data_df.merge(labels_df, left_on='app', right_on='Application', how='left')
data_df = data_df.drop(columns=['Application','app'])

In [23]:
# Split data into training and testing sets
X = data_df['log']
y = data_df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Convert X_train and X_test into pandas dataframes
X_train_df = pd.DataFrame(X_train, columns=['log'])
X_test_df = pd.DataFrame(X_test, columns=['log'])

In [24]:
X_test_df['log']

13220       INFO Scheduled snapshot period at 10 second(s).
80010     at org.apache.hadoop.ipc.Client.call(Client.ja...
167393    at org.apache.hadoop.hdfs.protocolPB.ClientNam...
246492    INFO (RESET) equator 29285595 kv 7321392(29285...
379917    INFO Recalculating schedule, headroom=<memory:...
                                ...                        
300509    at sun.reflect.GeneratedMethodAccessor5.invoke...
53724     at org.apache.hadoop.hdfs.DFSClient.renewLease...
196814    at com.sun.proxy.$Proxy15.renewLease(Unknown S...
85936     INFO Received completed container container_14...
331536    INFO Opening proxy : MININT-FNANLI5.fareast.co...
Name: log, Length: 118293, dtype: object

In [13]:
print('Shape of training data:', X_train.shape)
print('Shape of testing data:', X_test.shape)

Shape of training data: (276017, 1)
Shape of testing data: (118293,)


In [6]:
# Vectorize logs with 2 different vectorizers
tfidf_vectorizer = TfidfVectorizer(max_features=300)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

count_vectorizer = CountVectorizer(max_features=300)
X_train_count = count_vectorizer.fit_transform(X_train)

# Train a logistic regression model
model_tfidf = LogisticRegression()
model_tfidf.fit(X_train_tfidf, y_train)

model_count = LogisticRegression()
model_count.fit(X_train_count, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [13]:
import numpy as np
# LogRobust model
model_logrobust = logrobust_model(300, 128)

model_logrobust.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model_logrobust.fit(np.array(X_train_tfidf), y_train, epochs=10, batch_size=32, validation_split=0.2)

IndexError: tuple index out of range

In [7]:
# Setup an Evaluation Pipeline function for the models
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    print('Accuracy: ', accuracy_score(y_test, y_pred))
    print('Confusion Matrix: \n', confusion_matrix(y_test, y_pred))
    print('Classification Report: \n', classification_report(y_test, y_pred))

# Evaluate the models
X_test_tfidf = tfidf_vectorizer.transform(X_test)
X_test_count = count_vectorizer.transform(X_test)

print('TF-IDF Vectorizer')
evaluate_model(model_tfidf, X_test_tfidf, y_test)

print('Count Vectorizer')
evaluate_model(model_count, X_test_count, y_test)

TF-IDF Vectorizer
Accuracy:  0.8078330924061441
Confusion Matrix: 
 [[  608  7573   156     4]
 [  318 29732   784    19]
 [  192  6166 65160     1]
 [  140  7267   112    61]]
Classification Report: 
                        precision    recall  f1-score   support

            Disk full       0.48      0.07      0.13      8341
         Machine down       0.59      0.96      0.73     30853
Network disconnection       0.98      0.91      0.95     71519
               Normal       0.72      0.01      0.02      7580

             accuracy                           0.81    118293
            macro avg       0.69      0.49      0.45    118293
         weighted avg       0.83      0.81      0.77    118293

Count Vectorizer
Accuracy:  0.807909174676439
Confusion Matrix: 
 [[  601  7570   170     0]
 [  337 29743   757    16]
 [  186  6165 65167     1]
 [  145  7274   102    59]]
Classification Report: 
                        precision    recall  f1-score   support

            Disk full      

In [17]:
import torch
from transformers import AutoTokenizer, AutoModel

device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu')
print('Device:', device)

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = AutoModel.from_pretrained("distilbert-base-uncased").to(device)

Device: mps


In [34]:
tokenized_train = tokenizer(X_train_df["log"].values.tolist(), padding = True, truncation = True, return_tensors="pt")
tokenized_val = tokenizer(X_test_df["log"].values.tolist() , padding = True, truncation = True,  return_tensors="pt")

#move on device (GPU)
tokenized_train = {k:torch.tensor(v).to(device) for k,v in tokenized_train.items()}
tokenized_val = {k:torch.tensor(v).to(device) for k,v in tokenized_val.items()}

In [31]:
tokenized_train['input_ids'].shape

torch.Size([276017, 187])

In [33]:

batch_size = 32

with torch.no_grad():
  hidden_train = []
  hidden_val = []
  
  for i in range(0, len(X_train), batch_size):
    batch_train = {
      'input_ids': tokenized_train['input_ids'][i:i+batch_size],
      'attention_mask': tokenized_train['attention_mask'][i:i+batch_size]
    }

    batch_val = {
      'input_ids': tokenized_val['input_ids'][i:i+batch_size],
      'attention_mask': tokenized_val['attention_mask'][i:i+batch_size]
    }
    
    hidden_train_batch = model(**batch_train)
    hidden_val_batch = model(**batch_val)
    
    hidden_train.append(hidden_train_batch.last_hidden_state[:, 0, :])
    hidden_val.append(hidden_val_batch.last_hidden_state[:, 0, :])
  
  hidden_train = torch.cat(hidden_train, dim=0)
  hidden_val = torch.cat(hidden_val, dim=0)

#get only the [CLS] hidden states
cls_train = hidden_train.last_hidden_state[:,0,:]
cls_val = hidden_val.last_hidden_state[:,0,:]

RuntimeError: Placeholder storage has not been allocated on MPS device!