In [28]:
import csv
import re
from io import StringIO

import pandas as pd
# Do not truncate cell contents when frames are displayed, so long log values stay fully visible.
pd.set_option('display.max_colwidth', None)

In [29]:
# parameters
# Paths of the two run logs that are compared below: `base_input_log` is used as the
# "old" side of the diff and `delta_input_log` as the "new" side.
# NOTE(review): these are absolute, machine-specific paths. Presumably this cell is meant to be
# overridden per run (e.g. as a papermill-style parameters cell) - confirm before sharing.
base_input_log = "/home/jovyan/run_logs/b.log"
delta_input_log = "/home/jovyan/run_logs/delta.log"

In [30]:
# --- Define a Function for Parsing the log_line column into key-value pairs ---

# Matches key=value and key='quoted value' pairs (optional spaces around '=').
# Compiled once here instead of on every call.
_KV_PATTERN = re.compile(r"(\w+)\s*=\s*(?:'([^']*)'|(\S+))")


def parse_log_line(line):
    """
    Parse one log line into a flat dict of its fields.

    Every ``key=value`` / ``key='quoted value'`` pair found in the line becomes
    a dict entry; values are kept as strings. The text in front of the first
    ``<space>key=`` occurrence is split on whitespace and, as far as enough
    tokens exist, stored as:

    - ``timestamp``: the first two tokens joined by a single space
    - ``level``: the third token
    - ``logger``: the fourth token with every ``:`` removed

    These three entries overwrite key-value pairs that happen to use the same
    names.

    Parameters
    ----------
    line : str
        Raw log line. Any non-string input (for example the NaN pandas uses
        for a missing cell) yields an empty dict instead of raising.

    Returns
    -------
    dict
        Field name -> string value; empty when nothing could be extracted.
    """
    # Non-string cells cannot be parsed; treat them as "no fields" rather than
    # letting the regex raise TypeError in the middle of a DataFrame.apply.
    if not isinstance(line, str):
        return {}

    # findall yields (key, quoted_value, bare_value); exactly one of the two
    # value groups is non-empty (both are '' for an empty quoted value).
    fields = {
        key: quoted if quoted else bare
        for key, quoted, bare in _KV_PATTERN.findall(line)
    }

    # Everything before the first " key=" is the static prefix of the line.
    # maxsplit is passed by keyword: the positional form is deprecated.
    prefix = re.split(r'\s\w+=', line, maxsplit=1)[0]
    tokens = prefix.split()

    # Lines in another format may have a shorter prefix; fill in only what exists.
    if len(tokens) >= 2:
        fields['timestamp'] = f"{tokens[0]} {tokens[1]}"
    if len(tokens) >= 3:
        fields['level'] = tokens[2]
    if len(tokens) >= 4:
        fields['logger'] = tokens[3].replace(":", "")

    return fields

In [31]:
def _expand_log_lines(log_lines, numeric_cols, sort_cols):
    """Parse raw log lines into a typed, sorted DataFrame with one column per parsed field."""
    expanded = pd.DataFrame(log_lines.apply(parse_log_line).tolist())

    # Parsed values are strings; convert the known numeric fields (unparseable -> NaN).
    for col in numeric_cols:
        if col in expanded.columns:
            expanded[col] = pd.to_numeric(expanded[col], errors='coerce')

    # Sort only by the keys that were actually parsed: a log without matching
    # lines gives a frame with no columns, and sort_values would raise KeyError.
    present = [col for col in sort_cols if col in expanded.columns]
    if present:
        expanded = expanded.sort_values(by=present)
    return expanded.reset_index(drop=True)


def to_Dataframes(jdbc_input_log):
    """
    Load a run log and split it into parsed response records and parsed data rows.

    Lines containing the literal ``rownumber`` (case-sensitive) are treated as
    jdbc data rows; all other lines are treated as jdbc responses. Each group is
    parsed with ``parse_log_line``, its known numeric fields are converted with
    ``pd.to_numeric(errors='coerce')``, and the result is sorted and re-indexed.

    Parameters
    ----------
    jdbc_input_log : str, path-like or file-like
        Anything ``pd.read_csv`` accepts; read as one log line per row.

    Returns
    -------
    tuple of (pd.DataFrame, pd.DataFrame)
        ``(responses, rows)``. Responses are sorted by gatlingRunId, status,
        gatlingSessionId, model and inboundTextAsMd5Hash; data rows use the
        same keys plus ``rownumber`` as a final tie-breaker. Sort keys that do
        not occur in the log are ignored, so a log without data rows yields an
        empty frame instead of a KeyError.
    """
    # Read the lines verbatim: QUOTE_NONE stops '"' from being treated as a CSV
    # quote character, and na_filter=False keeps lines such as "NA" or "null"
    # as text instead of NaN.
    # NOTE: a line containing a literal tab is still split on it by the parser.
    base = pd.read_csv(
        jdbc_input_log,
        sep="\t",
        header=None,
        names=["log_line"],
        dtype=str,
        quoting=csv.QUOTE_NONE,
        na_filter=False,
    )

    # True for jdbc data rows, False for jdbc responses.
    is_data_row = base["log_line"].str.contains("rownumber", na=False, regex=False)

    key_cols = ['gatlingRunId', 'status', 'gatlingSessionId', 'model', 'inboundTextAsMd5Hash']

    responses = _expand_log_lines(
        base.loc[~is_data_row, "log_line"],
        numeric_cols=['gatlingSessionId', 'start', 'end', 'duration', 'rows'],
        sort_cols=key_cols,
    )
    rows = _expand_log_lines(
        base.loc[is_data_row, "log_line"],
        numeric_cols=['gatlingSessionId', 'rownumber'],
        # rownumber last, so rows belonging to one response get a deterministic order
        sort_cols=key_cols + ['rownumber'],
    )
    return responses, rows


In [32]:
# Parse each log into a (responses, data rows) pair of DataFrames; both frames come back
# sorted and with a fresh 0..n-1 index.
base_responses, base_rows = to_Dataframes(base_input_log)
delta_responses, delta_rows = to_Dataframes(delta_input_log)


In [33]:
def row_level_diff(df_old: pd.DataFrame, df_new: pd.DataFrame) -> dict:
    """
    Compare two DataFrames by whole-row equality.

    Both frames are first reindexed to the sorted union of their columns (a
    column missing on one side is filled with NaN there), then outer-merged on
    all of those columns.

    Parameters
    ----------
    df_old, df_new : pd.DataFrame
        The "before" and "after" frames. Their indexes are ignored.

    Returns
    -------
    dict
        ``'added'``   - rows found only in ``df_new``
        ``'removed'`` - rows found only in ``df_old``
        ``'common'``  - rows found in both
        Each value is a DataFrame carrying the merged frame's index.
    """
    all_cols = sorted(set(df_old.columns).union(df_new.columns))
    if not all_cols:
        # Nothing to merge on; pandas would raise MergeError for column-less frames.
        return {'added': pd.DataFrame(), 'removed': pd.DataFrame(), 'common': pd.DataFrame()}

    a = df_old.reindex(columns=all_cols)
    b = df_new.reindex(columns=all_cols)

    # A column that exists on one side only is all-NaN float64 after the reindex.
    # Merging it against a string column raises ValueError, so columns whose
    # dtypes disagree (beyond numeric-vs-numeric, which merge handles) are
    # compared as plain objects. pandas skips this dtype check when either side
    # has no rows, so nothing is cast in that case.
    if len(a) and len(b):
        is_numeric = pd.api.types.is_numeric_dtype
        mismatched = [
            col for col in all_cols
            if a[col].dtype != b[col].dtype
            and not (is_numeric(a[col].dtype) and is_numeric(b[col].dtype))
        ]
        if mismatched:
            as_object = dict.fromkeys(mismatched, object)
            a = a.astype(as_object)
            b = b.astype(as_object)

    merged = a.merge(b, how='outer', on=all_cols, indicator=True)

    def _side(label):
        # Rows with the given merge indicator, without the indicator column.
        return merged[merged['_merge'] == label].drop(columns=['_merge'])

    return {
        'added': _side('right_only'),
        'removed': _side('left_only'),
        'common': _side('both'),
    }

In [34]:
# Whole-row diff of the parsed responses: base log as the "old" side, delta log as the "new" side.
results = row_level_diff(base_responses, delta_responses)

In [35]:
# Response rows that occur only in the delta log.
results['added']

Unnamed: 0,duration,end,gatlingRunId,gatlingSessionId,inboundTextAsMd5Hash,level,logger,model,queryName,rows,start,status,timestamp
70,1039,1761263877205,Gimme Shelter,1,be54cac66a736a56b5c57653badbb063,INFO,SqlLogger,internet_sales,Query 1,159,1761263876174,SUCCEEDED,2025-10-23 16:57:57
71,1031,1761263877205,Gimme Shelter,1,be54cac66a736a56b5c57653badbb069,INFO,SqlLogger,internet_sales,Query 100,0,1761263876174,SUCCEEDED,2025-10-23 16:57:57


In [36]:
# Response rows that occur only in the base log.
results['removed']

Unnamed: 0,duration,end,gatlingRunId,gatlingSessionId,inboundTextAsMd5Hash,level,logger,model,queryName,rows,start,status,timestamp
10,1031,1761263877205,Gimme Shelter,1,be54cac66a736a56b5c57653badbb063,INFO,SqlLogger,internet_sales,Query 1,158,1761263876174,SUCCEEDED,2025-10-23 16:57:57


In [37]:
from typing import Optional, List

def cellular_diff(df_old: pd.DataFrame, df_new: pd.DataFrame, key: Optional[List[str]] = None):
    """
    Diff two DataFrames at row level and, where possible, at cell level.

    - If ``key`` is provided: delegates to ``key_based_diff(df_old, df_new, key)``
      and returns its result with a ``'cell_diffs'`` entry added.
      NOTE(review): ``key_based_diff`` is not defined in the visible part of this
      notebook. The code below assumes its result contains a ``'modified'``
      DataFrame with an ``'index'`` column - confirm against its definition.
    - If ``key`` is not provided: rows are compared by whole-row equality via
      ``row_level_diff``. The returned dict then has only ``'added'``,
      ``'removed'`` and ``'cell_diffs'`` (there is no ``'modified'`` entry).

    ``'cell_diffs'`` is the output of ``DataFrame.compare``, or an empty
    DataFrame when the comparison is skipped or raises.
    """
    if key:
        result = key_based_diff(df_old, df_new, key)
        # try to build cell-level diffs for modified keys if index alignment possible
        try:
            # Select, from each frame indexed by `key`, the entries listed in result['modified']['index'];
            # an empty 'modified' frame short-circuits to an empty DataFrame.
            old_idxed = df_old.set_index(key).loc[result['modified']['index'].unique()] if not result['modified'].empty else pd.DataFrame()
            new_idxed = df_new.set_index(key).loc[result['modified']['index'].unique()] if not result['modified'].empty else pd.DataFrame()
            if not old_idxed.empty and not new_idxed.empty:
                # align and compare
                old_idxed = old_idxed.sort_index()
                new_idxed = new_idxed.sort_index()
                cell_diff = old_idxed.compare(new_idxed)
            else:
                cell_diff = pd.DataFrame()
        except Exception:
            # Best effort: any failure while selecting or comparing yields an empty cell diff.
            cell_diff = pd.DataFrame()
        result['cell_diffs'] = cell_diff
        return result
    else:
        rows = row_level_diff(df_old, df_new)
        # try cell-level compare if both have same index/columns and size
        try:
            # Rows are matched purely by position after the index reset.
            a = df_old.sort_index(axis=1).reset_index(drop=True)
            # df_new is forced onto df_old's columns: columns only in df_new are dropped,
            # columns only in df_old become NaN.
            b = df_new.reindex(columns=a.columns).sort_index(axis=1).reset_index(drop=True)
            # Columns already agree here, so this check only passes when the row counts match.
            if a.shape == b.shape:
                cell_diff = a.compare(b)
            else:
                cell_diff = pd.DataFrame()
        except Exception:
            # Best effort: any failure while aligning or comparing yields an empty cell diff.
            cell_diff = pd.DataFrame()
        return {'added': rows['added'], 'removed': rows['removed'], 'cell_diffs': cell_diff}


In [38]:
# No key is passed, so cellular_diff takes its whole-row branch (row_level_diff plus a positional compare).
cell_results = cellular_diff(base_responses, delta_responses)

In [39]:
# Rows only in the delta log; taken directly from row_level_diff's 'added' in the key-less branch.
cell_results['added']

Unnamed: 0,duration,end,gatlingRunId,gatlingSessionId,inboundTextAsMd5Hash,level,logger,model,queryName,rows,start,status,timestamp
70,1039,1761263877205,Gimme Shelter,1,be54cac66a736a56b5c57653badbb063,INFO,SqlLogger,internet_sales,Query 1,159,1761263876174,SUCCEEDED,2025-10-23 16:57:57
71,1031,1761263877205,Gimme Shelter,1,be54cac66a736a56b5c57653badbb069,INFO,SqlLogger,internet_sales,Query 100,0,1761263876174,SUCCEEDED,2025-10-23 16:57:57


In [40]:
# Rows only in the base log; taken directly from row_level_diff's 'removed' in the key-less branch.
cell_results['removed']

Unnamed: 0,duration,end,gatlingRunId,gatlingSessionId,inboundTextAsMd5Hash,level,logger,model,queryName,rows,start,status,timestamp
10,1031,1761263877205,Gimme Shelter,1,be54cac66a736a56b5c57653badbb063,INFO,SqlLogger,internet_sales,Query 1,158,1761263876174,SUCCEEDED,2025-10-23 16:57:57


In [41]:
# Positional cell-by-cell differences. Empty when the two frames differ in row count,
# when the compare raised, or when no cell differs.
cell_results['cell_diffs']