# remove column

In [2]:
import pandas as pd

# Load your dataset
df = pd.read_csv("D:\\Final Year\\Major Project\\dataset\\merged_versions\\anonymized_merged_logs_2.csv")

# List of columns to drop
columns_to_drop = ['thread', 'class', 'message', 'labels', 'Disk full', 'Machine down', 'Network disconnection', 'Normal']

# Drop the specified columns
df = df.drop(columns=columns_to_drop)

In [24]:
df

Unnamed: 0,level,anonymized_message
0,INFO,Created MRAppMaster for application appattempt...
1,INFO,Executing with tokens:
2,INFO,"Kind: YARN_AM_RM_TOKEN, Service: , Ident: (app..."
3,INFO,Using mapred newApiCommitter.
4,INFO,OutputCommitter set in config null
...,...,...
180891,INFO,bufstart = -; bufend = -; bufvoid = -
180892,INFO,kvstart = -(-); kvend = -(-); length = -file_path
180893,INFO,Finished spill -
180894,INFO,Merging - sorted segments


In [28]:
df.to_csv('removed_dataset.csv', index=False)

In [10]:
df['level'].value_counts()

level
INFO     168920
WARN      11438
ERROR       538
Name: count, dtype: int64

In [12]:
info_rows = df[df['level'] == 'INFO'].iloc[:13920]

# Filter other levels
other_rows = df[df['level'] != 'INFO']

# Combine back the rows with limited 'INFO' occurrences and other rows
result_df = pd.concat([info_rows, other_rows])

In [16]:
result_df['level'].value_counts()

level
INFO     13920
WARN     11438
ERROR      538
Name: count, dtype: int64

In [18]:
result_df.to_csv('limited_info_dataset.csv', index=False)

In [22]:
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
import pandas as pd
import joblib

# Load the filtered dataset
data = pd.read_csv('limited_info_dataset.csv')

# Preprocess the data
X = data['anonymized_message']
y = data['level']  # Use 'level' as the target column

# TF-IDF Vectorization
tfidf = TfidfVectorizer(stop_words='english', max_features=10000, ngram_range=(1, 2))
X_tfidf = tfidf.fit_transform(X)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

# Define parameter grid for GridSearchCV
param_grid = {
    'C': [0.1, 1, 10, 100],  # Regularization parameter
    'kernel': ['linear', 'rbf'],  # Different kernels for SVM
    'gamma': ['scale', 'auto']  # Kernel coefficient
}

# Initialize and perform grid search on SVM
svm = SVC(random_state=42, class_weight='balanced')
grid_search = GridSearchCV(svm, param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Predict on test set
y_pred = grid_search.best_estimator_.predict(X_test)

# Evaluate the model
print("Best Parameters:", grid_search.best_params_)
print("Classification Report:\n", classification_report(y_test, y_pred))

# Save the best model and vectorizer as .pkl files
joblib.dump(grid_search.best_estimator_, 'svm_classifier_model.pkl')  # Save the model
joblib.dump(tfidf, 'tfidf_vectorizer.pkl')  # Save the vectorizer

print("Model and vectorizer saved as 'svm_classifier_model.pkl' and 'tfidf_vectorizer.pkl'")


Fitting 3 folds for each of 16 candidates, totalling 48 fits
Best Parameters: {'C': 10, 'gamma': 'scale', 'kernel': 'linear'}
Classification Report:
               precision    recall  f1-score   support

       ERROR       1.00      0.97      0.98       100
        INFO       1.00      1.00      1.00      2776
        WARN       1.00      1.00      1.00      2304

    accuracy                           1.00      5180
   macro avg       1.00      0.99      0.99      5180
weighted avg       1.00      1.00      1.00      5180

Model and vectorizer saved as 'svm_classifier_model.pkl' and 'tfidf_vectorizer.pkl'


In [42]:
import pandas as pd

# Assuming df_t is the dataframe containing your log data
# Filter the rows with level 'INFO' or 'WARN'
df_filtered = df[df['level'].isin(['INFO', 'WARN'])]

# Separate the INFO and WARN rows
info_rows = df_filtered[df_filtered['level'] == 'INFO']
warn_rows = df_filtered[df_filtered['level'] == 'WARN']

# Keep first 5000 INFO and WARN rows
info_first_5000 = info_rows.head(5000)
warn_first_5000 = warn_rows.head(5000)

# Get the remaining rows that are not 'INFO' or 'WARN'
df_remaining = df[~df['level'].isin(['INFO', 'WARN'])]

# Concatenate first 5000 rows of INFO, first 5000 rows of WARN, and remaining rows
df_final = pd.concat([info_first_5000, warn_first_5000, df_remaining])

# Shuffle the rows if needed, to maintain randomness in the final dataset
df_final = df_final.sample(frac=1).reset_index(drop=True)

# df_final will now contain the first 5000 'INFO', first 5000 'WARN', and all other rows


In [44]:
df_final['level'].value_counts()

level
WARN     5000
INFO     5000
ERROR     538
Name: count, dtype: int64

In [46]:
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
import pandas as pd
import joblib

# Load the filtered dataset
data = df_final

# Preprocess the data
X = data['anonymized_message']
y = data['level']  # Use 'level' as the target column

# TF-IDF Vectorization
tfidf = TfidfVectorizer(stop_words='english', max_features=10000, ngram_range=(1, 2))
X_tfidf = tfidf.fit_transform(X)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

# Define parameter grid for GridSearchCV
param_grid = {
    'C': [0.1, 1, 10, 100],  # Regularization parameter
    'kernel': ['linear', 'rbf'],  # Different kernels for SVM
    'gamma': ['scale', 'auto']  # Kernel coefficient
}

# Initialize and perform grid search on SVM
svm = SVC(random_state=42, class_weight='balanced')
grid_search = GridSearchCV(svm, param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Predict on test set
y_pred = grid_search.best_estimator_.predict(X_test)

# Evaluate the model
print("Best Parameters:", grid_search.best_params_)
print("Classification Report:\n", classification_report(y_test, y_pred))

# Save the best model and vectorizer as .pkl files
joblib.dump(grid_search.best_estimator_, 'svm_classifier_model.pkl')  # Save the model
joblib.dump(tfidf, 'tfidf_vectorizer.pkl')  # Save the vectorizer

print("Model and vectorizer saved as 'svm_classifier_model.pkl' and 'tfidf_vectorizer.pkl'")


Fitting 3 folds for each of 16 candidates, totalling 48 fits
Best Parameters: {'C': 1, 'gamma': 'scale', 'kernel': 'linear'}
Classification Report:
               precision    recall  f1-score   support

       ERROR       1.00      0.98      0.99       104
        INFO       0.99      1.00      1.00       961
        WARN       1.00      1.00      1.00      1043

    accuracy                           1.00      2108
   macro avg       1.00      0.99      0.99      2108
weighted avg       1.00      1.00      1.00      2108

Model and vectorizer saved as 'svm_classifier_model.pkl' and 'tfidf_vectorizer.pkl'


In [48]:
df_final.to_csv('500500_dataset.csv', index=False)

In [50]:
df_final

Unnamed: 0,level,anonymized_message
0,WARN,Address change detected. Old: msra-sa--file_pa...
1,WARN,Address change detected. Old: msra-sa--file_pa...
2,WARN,Failed to renew lease for [DFSClient_NONMAPRED...
3,INFO,Adding job token for job_id to jobTokenSecretM...
4,WARN,Failed to renew lease for [DFSClient_NONMAPRED...
...,...,...
10533,WARN,Failed to renew lease for [DFSClient_NONMAPRED...
10534,WARN,Address change detected. Old: msra-sa--file_pa...
10535,WARN,Address change detected. Old: msra-sa--file_pa...
10536,WARN,Failed to renew lease for [DFSClient_NONMAPRED...


In [56]:
info_logs = df_final[df_final['level'] == 'INFO'][['level', 'anonymized_message']]
warn_logs = df_final[df_final['level'] == 'WARN'][['level', 'anonymized_message']]
error_logs = df_final[df_final['level'] == 'ERROR'][['level', 'anonymized_message']]

In [60]:
info_logs_json = info_logs.to_json(orient='records', lines=True)

In [62]:
info_logs_json

'{"level":"INFO","anonymized_message":"Adding job token for job_id to jobTokenSecretManager"}\n{"level":"INFO","anonymized_message":"Progress of TaskAttempt task_attempt is : -.-"}\n{"level":"INFO","anonymized_message":"getResources() for application_id: ask=- release= - newContainers=- finishedContainers=- resourcelimit=<memory:-, vCores:--> knownNMs=-"}\n{"level":"INFO","anonymized_message":"Progress of TaskAttempt task_attempt is : -.-"}\n{"level":"INFO","anonymized_message":"task_attempt TaskAttempt Transitioned from NEW to UNASSIGNED"}\n{"level":"INFO","anonymized_message":"Progress of TaskAttempt attempt_-_-_r_-_- is : -.-"}\n{"level":"INFO","anonymized_message":"ProcfsBasedProcessTree currently is supported only on Linux."}\n{"level":"INFO","anonymized_message":"After Scheduling: PendingReds:- ScheduledMaps:- ScheduledReds:- AssignedMaps:- AssignedReds:- CompletedMaps:- CompletedReds:- ContAlloc:- ContRel:- HostLocal:- RackLocal:-"}\n{"level":"INFO","anonymized_message":"Progres