In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
# Load your log data
df_t = pd.read_csv('final_dataset_for_model.csv')

In [6]:
# Define parameters
window_size = 5  # Number of events per sequence

In [8]:
# Step 1: Create combined sequences with sliding windows
sequences = []
labels = []

In [10]:
for i in range(len(df_t) - window_size):
    # Combine messages in the window into a single string
    combined_sequence = " ".join(df_t['anonymized_message'].iloc[i:i + window_size].tolist())
    # Label the sequence based on the final message's label
    label = df_t['labels'].iloc[i + window_size - 1]
    
    sequences.append(combined_sequence)
    labels.append(label)

In [12]:
# Step 2: Vectorize combined sequences
tfidf_vectorizer = TfidfVectorizer(max_features=500)  # Adjust max_features as needed
X = tfidf_vectorizer.fit_transform(sequences)

In [14]:
# Convert to DataFrame
X_df = pd.DataFrame(X.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
X_df['labels'] = labels

In [16]:
# Split data into features and labels
X = X_df.drop(columns=['labels'])
y = X_df['labels']

In [18]:
print("Sequences prepared for modeling. Shape of X:", X.shape)
print("Shape of y:", y.shape)

Sequences prepared for modeling. Shape of X: (180891, 500)
Shape of y: (180891,)


In [20]:
from sklearn.model_selection import train_test_split

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Training and testing sets created.")
print("Training set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)


Training and testing sets created.
Training set shape: (144712, 500)
Testing set shape: (36179, 500)


In [22]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Initialize the Random Forest classifier
rf_model = RandomForestClassifier(random_state=42, class_weight='balanced')  # Using class weights to handle imbalance

# Train the model
rf_model.fit(X_train, y_train)

# Predict on test data
y_pred = rf_model.predict(X_test)

# Evaluate model performance
print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Classification Report:
                           precision    recall  f1-score   support

            ['Disk full']       0.22      0.44      0.29      5357
         ['Machine down']       0.77      0.25      0.37     19531
['Network disconnection']       0.75      0.43      0.55      6372
               ['Normal']       0.21      0.66      0.32      4919

                 accuracy                           0.36     36179
                macro avg       0.49      0.44      0.38     36179
             weighted avg       0.61      0.36      0.38     36179

Confusion Matrix:
[[2337  578  143 2299]
 [5693 4817  695 8326]
 [1353  416 2751 1852]
 [1116  444   90 3269]]


In [24]:
X

Unnamed: 0,_m_,_r_,about,acknowledgement,ad,added,adding,address,addspeculativeattempt,after,...,we,web,webapp,webproxy,will,windowsbasedprocesstree,with,writer,yarn,yarn_am_rm_token
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.110262,0.0,0.0,0.244648
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.103138,0.0,0.0,0.228841
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.175501
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180886,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000
180887,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000
180888,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000
180889,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000


In [26]:
y

0               ['Normal']
1               ['Normal']
2               ['Normal']
3               ['Normal']
4               ['Normal']
                ...       
180886    ['Machine down']
180887    ['Machine down']
180888    ['Machine down']
180889    ['Machine down']
180890    ['Machine down']
Name: labels, Length: 180891, dtype: object