# Ripgrep Contextual Search Example
This notebook demonstrates how to use ripgrep to search for patterns in files, including lines before and after each match, and store the results in a pandas DataFrame for contextual analysis.

In [1]:
# Import required libraries
import subprocess
import pandas as pd

## Define ripgrep search function with context
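This function runs ripgrep with its `-B`/`-A` options to include lines before and after each match, parses ripgrep's JSON output, and returns a DataFrame with one row per match occurrence; the `context` column holds the matched line together with its surrounding lines.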

In [None]:
import base64
import datetime
import json
import os
import warnings
from pathlib import Path
def ripgrep_search_with_context(pattern, path='.', before=2, after=2, extra_args=None):
    """Search with ripgrep and return every match, with context, as a DataFrame.

    ripgrep is run in ``--json`` mode with ``-B``/``-A`` context and its JSON
    Lines output is parsed. One row is produced per submatch, so a line that
    contains the pattern twice yields two rows.

    Parameters
    ----------
    pattern : str
        Regular expression handed to ripgrep. It is passed through ``-e`` so
        that a pattern starting with ``-`` is not mistaken for a flag.
    path : str or path-like, default '.'
        File or directory to search.
    before, after : int, default 2
        Number of context lines to keep before / after each match.
    extra_args : list of str, optional
        Additional ripgrep flags, e.g. ``['--type', 'py']``.

    Returns
    -------
    pandas.DataFrame
        Columns: ``line`` (line number of the match), ``col`` (start offset of
        the submatch as reported by ripgrep), ``text`` (the matched line),
        ``context`` (the lines before, the matched line and the lines after,
        concatenated exactly as they appear in the file), ``type`` (always
        ``'match'``), ``file``, ``search_path``, ``folder``, ``file_name``,
        ``file_ext``, ``created`` and ``modified``. The columns are present
        even when nothing matched.

    Raises
    ------
    FileNotFoundError
        If the ``rg`` executable cannot be found (raised by ``subprocess``).
    RuntimeError
        If ripgrep exits with an error status and produced no output at all
        (for example an invalid regular expression).
    """
    columns = [
        'line', 'col', 'text', 'context', 'type',
        'file', 'search_path', 'folder', 'file_name', 'file_ext',
        'created', 'modified',
    ]

    cmd = ['rg', '--json', f'-B{before}', f'-A{after}']
    if extra_args:
        cmd.extend(extra_args)
    # '-e' protects patterns that start with '-'; '--' does the same for the path.
    cmd.extend(['-e', pattern, '--', path])

    # ripgrep's JSON output is UTF-8; decode it explicitly instead of relying
    # on the platform's locale encoding (cp1252 on many Windows setups).
    result = subprocess.run(
        cmd, capture_output=True, text=True, encoding='utf-8', errors='replace'
    )
    stdout = result.stdout or ''

    # Exit status 0 = matches found, 1 = no matches; anything else is an error.
    if result.returncode not in (0, 1):
        detail = (result.stderr or '').strip()
        if not stdout.strip():
            raise RuntimeError(
                f'ripgrep failed (exit code {result.returncode}): {detail}'
            )
        # Partial output (e.g. one unreadable file): keep what was found.
        warnings.warn(
            f'ripgrep reported errors (exit code {result.returncode}); '
            f'results may be incomplete: {detail}',
            RuntimeWarning,
            stacklevel=2,
        )

    search_path = Path(path).resolve()
    grouped = {}
    current_group = []
    # Split on '\n' only: str.splitlines() would also break on characters such
    # as U+2028 that may appear unescaped inside a JSON string.
    for raw_line in stdout.split('\n'):
        if not raw_line.strip():
            continue
        try:
            message = json.loads(raw_line)
        except json.JSONDecodeError:
            continue
        kind = message.get('type')
        if kind == 'begin':
            current_group = []
        elif kind in ('match', 'context'):
            current_group.append(message)
        elif kind == 'end':
            file_rows = _rg_rows_for_file(current_group, search_path, before, after)
            if file_rows:
                grouped.setdefault(file_rows[0]['file'], []).extend(file_rows)
            current_group = []

    # Flatten all match rows, keeping rows of the same file together.
    data = [row for rows in grouped.values() for row in rows]
    return pd.DataFrame(data, columns=columns)


def _rg_text(field):
    """Return the string held by a ripgrep ``{"text": ...}`` / ``{"bytes": ...}`` object."""
    if 'text' in field:
        return field['text']
    # Data that is not valid UTF-8 is emitted base64-encoded under "bytes".
    return base64.b64decode(field['bytes']).decode('utf-8', errors='replace')


def _rg_split_lines(text):
    """Split text after each '\\n', keeping the terminators; never returns an empty list."""
    pieces = text.split('\n')
    lines = [piece + '\n' for piece in pieces[:-1]]
    if pieces[-1]:
        lines.append(pieces[-1])
    return lines or [text]


def _rg_file_info(file_path, search_path):
    """Build the per-file columns (paths and timestamps) shared by all rows of one file."""
    try:
        stat = file_path.stat()
        # st_birthtime only exists on some platforms; elsewhere fall back to
        # st_ctime (which is the metadata-change time on Unix).
        created = datetime.datetime.fromtimestamp(
            getattr(stat, 'st_birthtime', stat.st_ctime)
        )
        modified = datetime.datetime.fromtimestamp(stat.st_mtime)
    except (OSError, OverflowError, ValueError):
        created = None
        modified = None
    try:
        folder = str(file_path.parent.relative_to(search_path))
    except ValueError:
        # The file's folder is not below the search path (e.g. path is a file).
        folder = str(file_path.parent)
    return {
        'file': str(file_path),
        'search_path': str(search_path),
        'folder': folder,
        'file_name': file_path.name,
        'file_ext': file_path.suffix,
        'created': created,
        'modified': modified,
    }


def _rg_rows_for_file(group, search_path, before, after):
    """Turn the match/context records of one file into result rows (one per submatch)."""
    matches = [item for item in group if item.get('type') == 'match']
    if not matches:
        return []

    file_path = Path(_rg_text(matches[0]['data']['path'])).resolve()
    file_info = _rg_file_info(file_path, search_path)

    # Index every emitted line (context AND match) by its line number, so the
    # context of a match is chosen by position in the file rather than by
    # position in ripgrep's output; neighbouring matched lines are thus kept.
    line_text = {}
    for item in group:
        number = item['data'].get('line_number')
        if number is None:
            continue
        for offset, piece in enumerate(_rg_split_lines(_rg_text(item['data']['lines']))):
            line_text[number + offset] = piece

    rows = []
    for item in matches:
        data = item['data']
        match_text = _rg_text(data['lines'])
        first = data.get('line_number')
        if first is None:
            # Without line numbers the neighbours cannot be located reliably.
            context = match_text
        else:
            # A multi-line match spans several lines; after-context starts past its end.
            last = first + len(_rg_split_lines(match_text)) - 1
            leading = [line_text[n] for n in range(first - before, first) if n in line_text]
            trailing = [line_text[n] for n in range(last + 1, last + 1 + after) if n in line_text]
            # Lines already carry their own terminators, so concatenate as-is.
            context = ''.join(leading + [match_text] + trailing)
        for submatch in data['submatches']:
            row = {
                'line': first,
                'col': submatch['start'],
                'text': match_text,
                'context': context,
                'type': 'match',
            }
            row.update(file_info)
            rows.append(row)
    return rows

## Run a contextual search
Search for the pattern `def` in Python files (it matches anywhere in a line, not only as a whole word), including 2 lines before and after each match, and display the results.

In [28]:
# Look for the regex 'def' in Python sources only, keeping two lines of
# context on each side of every hit.
df = ripgrep_search_with_context(
    'def',
    path='C:/work/GitHub/dec-tree-py',
    before=2,
    after=2,
    extra_args=['--type', 'py'],
)
# df.info()  # alternative view: column dtypes and memory footprint
df

Unnamed: 0,line,col,text,type,file,search_path,folder,file_name,file_ext,created,modified
0,1,,from src.dec_tree.m_classifier import M_Classi...,context,C:\work\GitHub\dec-tree-py\Python\tests\m_clas...,C:\work\GitHub\dec-tree-py,Python\tests,m_classifier_test.py,.py,2024-08-16 13:24:38.139333,2024-08-16 13:37:56.382003
1,2,,\r\n,context,C:\work\GitHub\dec-tree-py\Python\tests\m_clas...,C:\work\GitHub\dec-tree-py,Python\tests,m_classifier_test.py,.py,2024-08-16 13:24:38.139333,2024-08-16 13:37:56.382003
2,3,0.0,def test_m_classifier():\r\n,match,C:\work\GitHub\dec-tree-py\Python\tests\m_clas...,C:\work\GitHub\dec-tree-py,Python\tests,m_classifier_test.py,.py,2024-08-16 13:24:38.139333,2024-08-16 13:37:56.382003
3,4,,m_classifier = M_Classifier()\r\n,context,C:\work\GitHub\dec-tree-py\Python\tests\m_clas...,C:\work\GitHub\dec-tree-py,Python\tests,m_classifier_test.py,.py,2024-08-16 13:24:38.139333,2024-08-16 13:37:56.382003
4,5,,assert m_classifier() < 1.0,context,C:\work\GitHub\dec-tree-py\Python\tests\m_clas...,C:\work\GitHub\dec-tree-py,Python\tests,m_classifier_test.py,.py,2024-08-16 13:24:38.139333,2024-08-16 13:37:56.382003
...,...,...,...,...,...,...,...,...,...,...,...
249,93,,plt.savefig(self.iset.getTryFolder() +...,context,C:\work\GitHub\dec-tree-py\Python\src\dec_tree...,C:\work\GitHub\dec-tree-py,Python\src\dec_tree,m_regressor.py,.py,2024-09-04 10:15:11.721226,2024-09-23 01:16:27.159902
250,94,,\r\n,context,C:\work\GitHub\dec-tree-py\Python\src\dec_tree...,C:\work\GitHub\dec-tree-py,Python\src\dec_tree,m_regressor.py,.py,2024-09-04 10:15:11.721226,2024-09-23 01:16:27.159902
251,95,4.0,def calc_trends(self):\r\n,match,C:\work\GitHub\dec-tree-py\Python\src\dec_tree...,C:\work\GitHub\dec-tree-py,Python\src\dec_tree,m_regressor.py,.py,2024-09-04 10:15:11.721226,2024-09-23 01:16:27.159902
252,96,,"""""""\r\n",context,C:\work\GitHub\dec-tree-py\Python\src\dec_tree...,C:\work\GitHub\dec-tree-py,Python\src\dec_tree,m_regressor.py,.py,2024-09-04 10:15:11.721226,2024-09-23 01:16:27.159902


In [20]:
# Debug: print ripgrep's raw JSON Lines output to help diagnose parsing issues.
# '--vimgrep' was dropped from the command: together with '--json' the output
# is still JSON (see the captured output of this cell), so the flag only misled.
cmd = ['rg', '--json', '-B2', '-A2', '--type', 'py', 'def', 'C:/work/GitHub/dec-tree-py']
# ripgrep emits UTF-8; decode it explicitly rather than with the locale encoding.
result = subprocess.run(cmd, capture_output=True, text=True, encoding='utf-8', errors='replace')
print(result.stdout)
# Exit status 0 = matches, 1 = no matches; anything else means ripgrep hit an error.
if result.returncode not in (0, 1):
    print(result.stderr)

{"type":"begin","data":{"path":{"text":"C:/work/GitHub/dec-tree-py\\Python\\tests\\m_classifier_test.py"}}}
{"type":"context","data":{"path":{"text":"C:/work/GitHub/dec-tree-py\\Python\\tests\\m_classifier_test.py"},"lines":{"text":"from src.dec_tree.m_classifier import M_Classifier\r\n"},"line_number":1,"absolute_offset":0,"submatches":[]}}
{"type":"context","data":{"path":{"text":"C:/work/GitHub/dec-tree-py\\Python\\tests\\m_classifier_test.py"},"lines":{"text":"\r\n"},"line_number":2,"absolute_offset":52,"submatches":[]}}
{"type":"match","data":{"path":{"text":"C:/work/GitHub/dec-tree-py\\Python\\tests\\m_classifier_test.py"},"lines":{"text":"def test_m_classifier():\r\n"},"line_number":3,"absolute_offset":54,"submatches":[{"match":{"text":"def"},"start":0,"end":3}]}}
{"type":"context","data":{"path":{"text":"C:/work/GitHub/dec-tree-py\\Python\\tests\\m_classifier_test.py"},"lines":{"text":"    m_classifier = M_Classifier()\r\n"},"line_number":4,"absolute_offset":80,"submatches":[]}