In [None]:
%pip install seaborn pandas

In [None]:
import os
import pandas as pd

In [9]:
def preprocess_logs(input_dir='extracted', output_dir='preprocessed'):
  """
  Preprocesses multiple processed log datasets in the input directory.

  Args:
  - input_dir: Directory containing processed log CSV files.
  - output_dir: Directory to save preprocessed CSV files.

  Returns:
  - None
  """
  
  if not os.path.exists(output_dir):
    os.makedirs(output_dir)

  # List all CSV files in the input directory
  csv_files = [f for f in os.listdir(input_dir) if f.endswith('.csv')]

  for file in csv_files:
    file_path = os.path.join(input_dir, file)
    print(f"Processing file: {file_path}")
    
    # Load dataset
    df = pd.read_csv(file_path)

    # Handle missing values
    df['response_time'] = df['response_time'].fillna(0)  # Replace missing response times with 0

    # Feature engineering
    # Ensure timestamp is in datetime format
    df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce')

    # Sort by timestamp and calculate time delta
    df = df.sort_values(by='timestamp')
    df['time_delta'] = df['timestamp'].diff().dt.total_seconds()
    df['time_delta'] = df['time_delta'].interpolate()
    
    # Check if internal_ip is present
    if 'internal_ip' not in df.columns:
      raise ValueError("internal_ip column is missing from the dataset!")
    
    # Fill missing IPs with placeholder
    df['internal_ip'] = df['internal_ip'].fillna('INTERNAL_PROCESS')

    # Create an error flag
    df['is_error'] = df['log_level'].isin(['ERROR', 'CRITICAL']).astype(int)

    # Normalize response time
    if 'response_time' in df.columns:
      df['response_time_normalized'] = (
        df['response_time'] - df['response_time'].mean()
        ) / df['response_time'].std()

    # Save preprocessed data
    output_file = os.path.join(output_dir, f"preprocessed_{os.path.splitext(file.rsplit('_', 1)[0])[0]}.csv")
    df.to_csv(output_file, index=False)
    print(f"Preprocessed file saved to: {output_file}")

# Main script execution
if __name__ == "__main__":
    preprocess_logs()


Processing file: extracted/openstack_train_extracted.csv
Preprocessed file saved to: preprocessed/preprocessed_openstack_train.csv
Processing file: extracted/openstack_test_extracted.csv
Preprocessed file saved to: preprocessed/preprocessed_openstack_test.csv
Processing file: extracted/openstack_predict_extracted.csv
Preprocessed file saved to: preprocessed/preprocessed_openstack_predict.csv


In [10]:
# missing values check
input_dir='preprocessed'

processed_files = [
    'preprocessed/preprocessed_openstack_predict.csv',
    'preprocessed/preprocessed_openstack_test.csv',
    'preprocessed/preprocessed_openstack_train.csv',
]

for file in processed_files:
    
    print(f"Checking missing values in {file}")
    df = pd.read_csv(file)
    
    missing_summary = df.isnull().sum()
    print(missing_summary[missing_summary > 0])  # Only display columns with missing values
    print("\n")


Checking missing values in preprocessed/preprocessed_openstack_predict.csv
request_id         1439
user_id            1439
project_id         1439
request            9018
status_code        9018
response_length    9018
time_delta            1
dtype: int64


Checking missing values in preprocessed/preprocessed_openstack_test.csv
request_id          4215
user_id             4215
project_id          4215
request            25678
status_code        25688
response_length    25688
time_delta             1
dtype: int64


Checking missing values in preprocessed/preprocessed_openstack_train.csv
request_id          9992
user_id             9992
project_id          9992
request            74317
status_code        74381
response_length    74381
time_delta             1
dtype: int64




In [11]:
# check for valid data types

for file in processed_files:
  
  print(f"Validating data types in {file}")
  df = pd.read_csv(file)
  
  # Convert timestamp to datetime and check
  df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce')
  print("Timestamp conversion successful:", df['timestamp'].notnull().all())
  
  # Check numeric columns
  numeric_columns = ['response_time', 'response_length']
  
  for col in numeric_columns:
    if col in df.columns:
      print(f"{col}: Numeric check -", pd.api.types.is_numeric_dtype(df[col]))
  
  print("\n")


Validating data types in preprocessed/preprocessed_openstack_predict.csv
Timestamp conversion successful: True
response_time: Numeric check - True
response_length: Numeric check - True


Validating data types in preprocessed/preprocessed_openstack_test.csv
Timestamp conversion successful: True
response_time: Numeric check - True
response_length: Numeric check - True


Validating data types in preprocessed/preprocessed_openstack_train.csv
Timestamp conversion successful: True
response_time: Numeric check - True
response_length: Numeric check - True




In [12]:
# validating expected ranges

# predefine some known error codes
defaultErrorCodes = [200, 201, 202, 204, 400, 401, 403, 404, 500, 503]

for file in processed_files:
  print(f"Validating data ranges in {file}")
  df = pd.read_csv(file)
  
  # Check response_time
  if 'response_time' in df.columns:
      print("Negative response_time entries:", (df['response_time'] < 0).sum())
  
  # Check status_code
  if 'status_code' in df.columns:
      print("Invalid status codes:", df[~df['status_code'].isin(defaultErrorCodes)]['status_code'].unique())
  
  # Check response_length
  if 'response_length' in df.columns:
      print("Negative response_length entries:", (df['response_length'] < 0).sum())
  
  print("\n")


Validating data ranges in preprocessed/preprocessed_openstack_predict.csv
Negative response_time entries: 0
Invalid status codes: [nan]
Negative response_length entries: 0


Validating data ranges in preprocessed/preprocessed_openstack_test.csv
Negative response_time entries: 0
Invalid status codes: [nan]
Negative response_length entries: 0


Validating data ranges in preprocessed/preprocessed_openstack_train.csv
Negative response_time entries: 0
Invalid status codes: [nan]
Negative response_length entries: 0




In [14]:
# Compare columns across datasets

df_train = pd.read_csv("preprocessed/preprocessed_openstack_predict.csv")
df_test = pd.read_csv("preprocessed/preprocessed_openstack_test.csv")
df_predict = pd.read_csv("preprocessed/preprocessed_openstack_train.csv")

print("Train columns:", df_train.columns)
print("Test columns:", df_test.columns)
print("Predict columns:", df_predict.columns)

# Check for differences in column sets
assert set(df_train.columns) == set(df_test.columns), "Train and Test columns mismatch!"
assert set(df_train.columns) == set(df_predict.columns), "Train and Predict columns mismatch!"
print("All datasets have consistent columns!")


Train columns: Index(['timestamp', 'log_level', 'source', 'request_id', 'user_id',
       'project_id', 'internal_ip', 'request', 'status_code',
       'response_length', 'response_time', 'time_delta', 'is_error',
       'response_time_normalized'],
      dtype='object')
Test columns: Index(['timestamp', 'log_level', 'source', 'request_id', 'user_id',
       'project_id', 'internal_ip', 'request', 'status_code',
       'response_length', 'response_time', 'time_delta', 'is_error',
       'response_time_normalized'],
      dtype='object')
Predict columns: Index(['timestamp', 'log_level', 'source', 'request_id', 'user_id',
       'project_id', 'internal_ip', 'request', 'status_code',
       'response_length', 'response_time', 'time_delta', 'is_error',
       'response_time_normalized'],
      dtype='object')
All datasets have consistent columns!


In [16]:
# Numeric features to compare
numeric_features = ['response_time', 'response_length']

# Load datasets
datasets = {
    "Train": df_train,
    "Test": df_test,
    "Predict": df_predict
}

# Calculate summary statistics
for name, df in datasets.items():
    print(f"Statistics for {name} dataset:")
    print(df[numeric_features].describe())
    print("\n")


Statistics for Train dataset:
       response_time  response_length
count   18434.000000      9416.000000
mean        0.576138      1449.752761
std         2.998784      1378.649163
min         0.000000       116.000000
25%         0.000000       380.000000
50%         0.088699      1892.000000
75%         0.263785      1893.000000
max        51.590000     24904.000000


Statistics for Test dataset:
       response_time  response_length
count   52312.000000     26624.000000
mean        0.563563      1436.455341
std         2.898609      1398.752157
min         0.000000       116.000000
25%         0.000000       380.000000
50%         0.085623      1892.000000
75%         0.263155      1893.000000
max        22.910000     24904.000000


Statistics for Predict dataset:
       response_time  response_length
count  137074.000000     62693.000000
mean        0.508737      1439.021039
std         2.758057      1399.633460
min         0.000000       116.000000
25%         0.000000       380.