In [None]:
!pip install -q scikit-learn pandas xgboost

# Import libraries
import pandas as pd
from datetime import datetime
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# -------------------------------------
# Step 1: Upload and Load the Linux logs CSV file
# The upload prompt is only shown when "linux_logs.csv" is not already in the
# working directory, so re-running this cell does not ask for the file again.
# NOTE(review): Colab appears to store a re-uploaded file under a renamed copy
# rather than overwriting it -- confirm; delete the existing file first if a
# fresh upload is intended.
import os

from google.colab import files

if os.path.exists("linux_logs.csv"):
    uploaded = {}  # nothing uploaded in this run; the existing file is reused
else:
    uploaded = files.upload()  # When prompted, upload your "linux_logs.csv" file

Saving linux_logs.csv to linux_logs.csv


In [None]:
# Load the log file from the working directory into a DataFrame.
df = pd.read_csv("linux_logs.csv")

# Inspect the data
print("First few rows:")
print(df.head())
print("\nData Info:")
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
df.info()

First few rows:
         Timestamp Hostname        Service  \
0  Jun  9 06:06:20    combo  syslogd 1.4.1   
1  Jun  9 06:06:20    combo         syslog   
2  Jun  9 06:06:20    combo         syslog   
3  Jun  9 06:06:20    combo         kernel   
4  Jun  9 06:06:20    combo         kernel   

                                             Message  
0                                           restart.  
1                          syslogd startup succeeded  
2                            klogd startup succeeded  
3      klogd 1.4.1, log source = /proc/kmsg started.  
4  Linux version 2.6.5-1.358 (bhcompile@bugs.buil...  

Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25549 entries, 0 to 25548
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Timestamp  25549 non-null  object
 1   Hostname   25549 non-null  object
 2   Service    25549 non-null  object
 3   Message    25456 non-null  object
dtypes: object(4)
memor

In [None]:
# Step 2: Data Cleaning & Preprocessing
# Fill missing values with a placeholder in every column except Timestamp, so
# a string is never written into the (possibly already datetime) time column.
non_time_columns = [col for col in df.columns if col != 'Timestamp']
df[non_time_columns] = df[non_time_columns].fillna("Unknown")

# Convert Timestamp column to datetime (if not already in datetime format).
# The raw values look like "Jun  9 06:06:20": no year and a space-padded day.
# The format is therefore given explicitly instead of being inferred, runs of
# whitespace are collapsed first, and a placeholder leap year is prepended so
# that "Feb 29" stays parseable. The year in the result is NOT real; only the
# hour and minute are used below. Unparseable rows become NaT.
if not pd.api.types.is_datetime64_any_dtype(df['Timestamp']):
    normalized_stamp = df['Timestamp'].astype(str).str.split().str.join(" ")
    df['Timestamp'] = pd.to_datetime(
        "2000 " + normalized_stamp,
        format="%Y %b %d %H:%M:%S",
        errors='coerce',
    )

# Extract time features: hour and minute (NaN where the timestamp is NaT)
df['hour'] = df['Timestamp'].dt.hour
df['minute'] = df['Timestamp'].dt.minute

# Create a combined text field from 'Service' and 'Message'
df['combined_text'] = df['Service'].astype(str) + " " + df['Message'].astype(str)


  df['Timestamp'] = pd.to_datetime(df['Timestamp'], errors='coerce')


In [None]:
# Step 3: Create a Target Label
# Heuristic labelling: a log line is an "issue" (1) when its lower-cased
# combined text contains any keyword below as a plain substring; every other
# line is normal (0).
# NOTE(review): the label is a deterministic function of 'combined_text',
# which is also selected as a model feature in Step 4, so downstream scores
# reflect how well the model recovers this keyword rule.
keywords = ['error', 'failed', 'critical', 'denied']
lowered_text = df['combined_text'].str.lower()
keyword_hit = pd.Series(False, index=df.index)
for keyword in keywords:
    # regex=False -> literal substring test, same as Python's `in` operator
    keyword_hit = keyword_hit | lowered_text.str.contains(keyword, regex=False)
df['issue'] = keyword_hit.astype(int)

print("\nLabel distribution:")
print(df['issue'].value_counts())


Label distribution:
issue
0    25278
1      271
Name: count, dtype: int64


In [None]:
# Step 4: Feature Selection
# Model inputs: the combined Service+Message text plus the two time-of-day
# columns derived from the timestamp. The target is the heuristic 'issue' flag.
X = df.loc[:, ['combined_text', 'hour', 'minute']]
y = df.loc[:, 'issue']


In [None]:
# Step 5: Split Data into Training and Testing Sets
# 70/30 split with a fixed seed for reproducibility. The positive class is
# rare (see the label distribution printed in Step 3), so the split is
# stratified on y to keep the same class ratio in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

In [None]:
# Step 6: Build a Pipeline with Feature Processing and XGBoost Classification
# For text data, use TfidfVectorizer on 'combined_text' (passed as a bare
# column name so the vectorizer receives a 1-D sequence of strings).
# For numeric features ('hour' and 'minute'), use StandardScaler.
preprocessor = ColumnTransformer(transformers=[
    ('text', TfidfVectorizer(), 'combined_text'),
    ('num', StandardScaler(), ['hour', 'minute'])
])

# Create a pipeline combining the preprocessor and an XGBoost classifier.
# `use_label_encoder` is intentionally not passed: the installed XGBoost
# reports it as an unused parameter and emits a warning at fit time.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(
        n_estimators=100,
        eval_metric='logloss',
        random_state=42))
])


In [None]:
# Step 7: Train the Model
# Fits the preprocessing steps and the classifier on the training partition
# only; the test partition is not seen here.
pipeline.fit(X_train, y_train)


  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
Parameters: { "use_label_encoder" } are not used.



In [None]:
# Step 8: Predict on the held-out test set
y_pred = pipeline.predict(X_test)

# -------------------------------------
# Step 9: Evaluate the Model
# Compute every metric first, then print them in a fixed order.
report_text = classification_report(y_test, y_pred)
overall_accuracy = accuracy_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)

print("\nClassification Report:")
print(report_text)
print("Accuracy:", overall_accuracy)
print("Confusion Matrix:")
print(confusion)


Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7584
           1       1.00      0.99      0.99        81

    accuracy                           1.00      7665
   macro avg       1.00      0.99      1.00      7665
weighted avg       1.00      1.00      1.00      7665

Accuracy: 0.9998695368558382
Confusion Matrix:
[[7584    0]
 [   1   80]]
