### Rego Policy Extractor

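This notebook downloads the policy index of the Red Hat Community of Practice `rego-policies` repository, extracts the Rego source code from each fenced code block, and writes every policy to its own `.rego` file under `data/`.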

**Target URL:** https://github.com/redhat-cop/rego-policies/blob/main/POLICIES.md

In [3]:
# Step 1: Import required libraries
import json
import os
import re
import time
import warnings
from pathlib import Path

import requests
from bs4 import BeautifulSoup


In [5]:
# Step 2: Fetch the GitHub page content
def fetch_github_page(url, allow_insecure=False):
    """
    Fetch the raw text of a file given its GitHub "blob" page URL.

    The blob URL is rewritten to its raw.githubusercontent.com equivalent so
    the file's contents are returned rather than the HTML page around it.

    Args:
        url: GitHub URL of the form
            https://github.com/<owner>/<repo>/blob/<ref>/<path>.
        allow_insecure: If True, retry once with TLS certificate verification
            disabled when the verified request fails with an SSL error.
            Defaults to False, because content fetched without verification
            may have been tampered with in transit.

    Returns:
        The response body as a string, or None if the request failed. A
        warning describing the failure is emitted before None is returned.
    """
    # Replace only the first occurrence of each marker: a later 'github.com'
    # or '/blob/' is part of the file path and must be left intact.
    raw_url = (
        url.replace('github.com', 'raw.githubusercontent.com', 1)
           .replace('/blob/', '/', 1)
    )

    # Add headers to mimic a browser request
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        try:
            response = requests.get(raw_url, headers=headers, timeout=30, verify=True)
        except requests.exceptions.SSLError as ssl_error:
            # Catch SSLError only. RequestException derives from OSError, so a
            # broader clause would also retry HTTP errors and timeouts with
            # certificate verification turned off.
            if not allow_insecure:
                raise
            warnings.warn(
                f"TLS verification failed for {raw_url} ({ssl_error}); "
                "retrying WITHOUT certificate verification"
            )
            response = requests.get(raw_url, headers=headers, timeout=30, verify=False)
        response.raise_for_status()
        return response.text

    except requests.exceptions.RequestException as error:
        # Best effort: callers check for None, but the reason is reported
        # instead of being silently discarded.
        warnings.warn(
            f"Failed to fetch {raw_url}: {error!r}. If this is a certificate "
            "problem on a trusted network, pass allow_insecure=True."
        )
        return None

In [7]:
# Markdown index that lists every policy in the redhat-cop/rego-policies repo
target_url = "https://github.com/redhat-cop/rego-policies/blob/main/POLICIES.md"

# Download the document; a falsy result means nothing usable came back
page_content = fetch_github_page(target_url)

# Show the first 500 characters as a quick sanity check of what was fetched
print(page_content[:500] if page_content else "Failed to fetch page content")



# Policies

## Violations

* [RHCOP\-COMBINE\-00001: Namespace has a NetworkPolicy](#rhcop-combine-00001-namespace-has-a-networkpolicy)
* [RHCOP\-COMBINE\-00002: Namespace has a ResourceQuota](#rhcop-combine-00002-namespace-has-a-resourcequota)
* [RHCOP\-OCP\_BESTPRACT\-00001: Common k8s labels are set](#rhcop-ocp_bestpract-00001-common-k8s-labels-are-set)
* [RHCOP\-OCP\_BESTPRACT\-00002: Container env has CONTAINER\_MAX\_MEMORY set](#rhcop-ocp_bestpract-00002-container-env-has-container_max_mem




In [8]:
# Step 3: Parse and extract Rego policy codes
def extract_rego_policies(content):
    """
    Extract Rego policy code from the fenced code blocks of a markdown document.

    Blocks whose info string is ``rego`` (case-insensitive) or empty are
    returned; blocks tagged with any other language are skipped.

    Args:
        content: Markdown text. ``None`` or an empty string yields ``[]``.

    Returns:
        A list of dicts, one per extracted block in document order, with keys
        ``name`` ('policy0', 'policy1', ...), ``content`` (the block's text
        with surrounding whitespace stripped) and ``index`` (0-based position
        in the returned list).
    """
    if not content:
        return []

    # Every fenced block is matched as an opening/closing pair, whatever its
    # language, and filtered afterwards. Matching only rego/untagged openers
    # would let the closing fence of e.g. a ```yaml block be read as an
    # opening fence, capturing the prose that follows it.
    fence_pattern = re.compile(
        r'^[ \t]*```[ \t]*([^\s`]*)[^`\n]*\n'  # opening fence + info string
        r'(.*?)'                                # block body (may be empty)
        r'^[ \t]*```[ \t]*\r?$',                # closing fence on its own line
        re.DOTALL | re.MULTILINE,
    )

    policies = []
    for match in fence_pattern.finditer(content):
        language, body = match.group(1), match.group(2)
        if language.lower() not in ('', 'rego'):
            continue

        # Number only the blocks that are kept, so names stay contiguous.
        index = len(policies)
        policies.append({
            'name': 'policy' + str(index),
            'content': body.strip(),
            'index': index
        })

    return policies


In [17]:
# # Extract policies from the fetched content
# if page_content:
#     rego_policies = extract_rego_policies(page_content)
    
    
#     if rego_policies:
#         for i, policy in enumerate(rego_policies, 1):
#             print(f"{i}. {policy['name']}")
#     else:
#         print("No Rego policies found in the content")
# else:
#     print("Cannot extract policies - no content available")
#     rego_policies = []

In [None]:
# Extract every Rego code block from the fetched markdown
rego_policies = extract_rego_policies(page_content)

# Preview the first policy. The list is empty when the fetch failed or the
# page contained no matching code blocks, so guard the index to avoid an
# IndexError.
rego_policies[0]['content'] if rego_policies else "No Rego policies found in the content"

'package combine.namespace_has_networkpolicy\n\nimport future.keywords.in\n\nimport data.lib.konstraint.core as konstraint_core\n\nviolation[msg] {\n  some manifests in input\n  some namespace in manifests\n\n  lower(namespace.apiVersion) == "v1"\n  lower(namespace.kind) == "namespace"\n\n  not _has_networkpolicy(manifests)\n\n  msg := konstraint_core.format_with_id(sprintf("%s/%s does not have a networking.k8s.io/v1:NetworkPolicy. See: https://docs.openshift.com/container-platform/4.6/networking/network_policy/about-network-policy.html", [namespace.kind, namespace.metadata.name]), "RHCOP-COMBINE-00001")\n}\n\n_has_networkpolicy(manifests) {\n  some current in manifests\n\n  lower(current.apiVersion) == "networking.k8s.io/v1"\n  lower(current.kind) == "networkpolicy"\n}'

In [16]:
# Create data directory if it doesn't exist
data_dir = Path("data")
data_dir.mkdir(exist_ok=True)

# Write each policy to a separate .rego file, named by its position in
# rego_policies (policy_0.rego, policy_1.rego, ...).
for i, policy in enumerate(rego_policies):
    filename = f"policy_{i}.rego"
    filepath = data_dir / filename

    # Write UTF-8 explicitly: the default is the platform's locale encoding,
    # which can fail on or mis-encode non-ASCII characters in a policy.
    with open(filepath, 'w', encoding='utf-8') as f:
        f.write(policy['content'])