In [1]:
import re
import os
import json
import pandas as pd

In [2]:
def extract_opcodes(file_path):
    """
    Extracts instruction names and their bit-range constraints from an opcode file.

    Each non-empty, non-comment line is split on whitespace. Regular instruction lines
    contribute their first token as the instruction name; ``$pseudo_op`` lines contribute
    the token that follows the ``extension::base_instruction`` token, prefixed with
    ``pseudo_op-``. Every token of the form ``x..z=y`` or ``x=y`` (``y`` decimal, ``0x``
    hex or ``0b`` binary) found after the name is collected as a bit range, regardless of
    whether operand names are interleaved between them.

    Lines that start with any other ``$`` directive, lines whose name token contains
    characters other than word characters and dots, and lines without any bit range
    are ignored.

    :param file_path: Path to the input file containing opcodes and instructions.
    :return: A list of tuples with (instruction_name, bit_ranges_values, file_name),
             where bit_ranges_values is a list of strings and file_name is the last
             path component of ``file_path``.
    """
    file_name = os.path.basename(file_path)

    # Names may contain dots (e.g. "amoadd.w"), so \w alone is not enough.
    name_pattern = re.compile(r'[\w.]+')
    # "<extension>::<base_instruction>" token that follows $pseudo_op.
    pseudo_target_pattern = re.compile(r'[\w.]+::[\w.]+')
    # One whole bit-range token: x..z=y or x=y.
    bit_range_pattern = re.compile(r'(\d+\.\.\d+|\d+)=(0x[0-9A-Fa-f]+|0b[01]+|\d+)')
    # Optional whitespace around '=' is collapsed so each pair becomes a single token.
    assignment_spacing = re.compile(r'\s*=\s*')

    opcode_data = []
    with open(file_path, 'r') as file:
        for line in file:
            line = line.strip()

            # Skip blank lines and comment lines
            if not line or line.startswith('#'):
                continue

            tokens = assignment_spacing.sub('=', line).split()

            if tokens[0] == '$pseudo_op':
                # Expected shape: $pseudo_op ext::base name [operands...] bit ranges...
                if len(tokens) < 3 or not pseudo_target_pattern.fullmatch(tokens[1]):
                    continue
                prefix, name_token, remainder = 'pseudo_op-', tokens[2], tokens[3:]
            elif tokens[0].startswith('$'):
                # Other directives (e.g. $import) carry no bit ranges of their own
                continue
            else:
                prefix, name_token, remainder = '', tokens[0], tokens[1:]

            if not name_pattern.fullmatch(name_token):
                continue

            # Keep only the bit-range tokens; operand names in between are dropped
            bit_ranges = [token for token in remainder if bit_range_pattern.fullmatch(token)]
            if bit_ranges:
                opcode_data.append((prefix + name_token, bit_ranges, file_name))

    return opcode_data


def process_bit_ranges(bit_ranges_value):
    """
    Breaks a whitespace-separated run of bit-range assignments into individual entries.

    Entries such as '31..25=0', '6=1', '14..12=0b010' or '6..2=0x13' are returned
    exactly as written; no parsing or normalisation of the values takes place.

    :param bit_ranges_value: A string containing multiple bit range-value pairs.
    :return: A list with one string per whitespace-delimited pair, in input order.
    """
    # split() without a separator already ignores leading/trailing whitespace
    pairs = bit_ranges_value.split()
    return pairs


In [3]:
def display_opcodes(opcode_data):
    """
    Prints the extracted opcodes as a fixed-width text table.

    :param opcode_data: Iterable of tuples shaped
        (instruction_name, bit_range, opcode, extension). ``bit_range`` may be a single
        string or a list/tuple of strings, in which case the entries are joined with
        spaces. A 3-tuple without the extension is also accepted and is printed with an
        empty last column.
    """
    print(f"{'Instruction':<12} {'Bit Range':<10} {'Opcode':<10} {'Extension':<10}")
    print('-' * 46)
    for instruction, bit_range, opcode, *rest in opcode_data:
        extension = rest[0] if rest else ''

        # Lists do not support alignment format specs, so flatten them to text first
        if isinstance(bit_range, (list, tuple)):
            bit_range = ' '.join(str(part) for part in bit_range)

        print(f"{str(instruction):<12} {str(bit_range):<10} {str(opcode):<10} {extension}")

In [4]:
folder_path = './opcodes/'

# Combined records from every opcode file, each extended with a filename-derived suffix
all_opcode_data = []

# os.listdir() returns entries in arbitrary order; sorting keeps the combined
# list (and everything derived from it) reproducible between runs.
for filename in sorted(os.listdir(folder_path)):
    file_path = os.path.join(folder_path, filename)

    # Only regular files are parsed; sub-directories are ignored
    if not os.path.isfile(file_path):
        continue

    # Part of the filename after the first 'rv' (the whole name if 'rv' is absent)
    suffix = filename.split('rv', 1)[-1]

    # Extract (instruction_name, bit_ranges, file_name) tuples from the file
    opcode_data = extract_opcodes(file_path)

    # Add the suffix as a fourth column on every record
    opcode_data_with_suffix = [(opcode[0], opcode[1], opcode[2], suffix) for opcode in opcode_data]

    all_opcode_data.extend(opcode_data_with_suffix)

# Display the combined opcodes
print(all_opcode_data)


[('pseudo_op-slli', ['31..25=0', '14..12=1', '6..2=0x04', '1..0=3'], 'rv32_i', '32_i'), ('pseudo_op-srli', ['31..25=0', '14..12=5', '6..2=0x04', '1..0=3'], 'rv32_i', '32_i'), ('pseudo_op-srai', ['31..25=32', '14..12=5', '6..2=0x04', '1..0=3'], 'rv32_i', '32_i'), ('pseudo_op-slli_rv32', ['31..25=0', '14..12=1', '6..2=0x04', '1..0=3'], 'rv32_i', '32_i'), ('pseudo_op-srli_rv32', ['31..25=0', '14..12=5', '6..2=0x04', '1..0=3'], 'rv32_i', '32_i'), ('pseudo_op-srai_rv32', ['31..25=32', '14..12=5', '6..2=0x04', '1..0=3'], 'rv32_i', '32_i'), ('pseudo_op-zip', ['31..25=4', '24..20=15', '14..12=1', '6..2=4', '1..0=3'], 'rv32_zbkb', '32_zbkb'), ('pseudo_op-unzip', ['31..25=4', '24..20=15', '14..12=5', '6..2=4', '1..0=3'], 'rv32_zbkb', '32_zbkb'), ('pseudo_op-rdcycleh', ['19..15=0', '31..20=0xC80', '14..12=2', '6..2=0x1C', '1..0=3'], 'rv32_zicntr', '32_zicntr'), ('pseudo_op-rdtimeh', ['19..15=0', '31..20=0xC81', '14..12=2', '6..2=0x1C', '1..0=3'], 'rv32_zicntr', '32_zicntr'), ('pseudo_op-rdinstret

In [5]:
from collections import defaultdict

def save_to_json(opcode_data, json_file_path):
    """
    Groups opcode records by instruction name and writes the result to a JSON file.

    Records sharing the same first field are merged into one entry, in order of first
    appearance. The written file is a JSON list of objects with the keys
    "instruction", "bit_ranges" (every second field collected for that instruction)
    and "extension" (every third field collected for that instruction).

    NOTE: the fourth field of each record is read and tracked per instruction, but it
    is not part of the written output; the "extension" key carries the collected
    third-field values instead.

    :param opcode_data: List of tuples containing opcode information (at least four
                        fields per tuple).
    :param json_file_path: Path to the JSON file where data will be saved.
    """
    grouped = {}

    for record in opcode_data:
        name, ranges, source, ext = record[0], record[1], record[2], record[3]

        # First sighting of an instruction creates its accumulator
        entry = grouped.setdefault(
            name, {"bit_ranges": [], "opcodes": [], "extension": "N/A"}
        )
        entry["bit_ranges"].append(ranges)
        entry["opcodes"].append(source)
        entry["extension"] = ext  # last record wins; not written to the file

    # Flatten the grouping into a list of plain dictionaries for serialization
    formatted_data = []
    for name, entry in grouped.items():
        formatted_data.append(
            {
                "instruction": name,
                "bit_ranges": entry["bit_ranges"],
                "extension": entry["opcodes"],
            }
        )

    # Save to JSON
    with open(json_file_path, 'w') as json_file:
        json.dump(formatted_data, json_file, indent=4)

In [6]:
# Destination of the JSON export (change this to write somewhere else)
json_file_path = 'output_opcodes.json'

# Write the combined opcode records to that file
save_to_json(opcode_data=all_opcode_data, json_file_path=json_file_path)

# Confirm where the data was written
print(f"Opcode data successfully saved to {json_file_path}")

Opcode data successfully saved to output_opcodes.json


In [7]:
# Keep only the first three fields of every combined record
cut_opcodes = [record[0:3] for record in all_opcode_data]

In [8]:
# Show the trimmed records (first three fields of each entry in all_opcode_data)
cut_opcodes

[('pseudo_op-slli', ['31..25=0', '14..12=1', '6..2=0x04', '1..0=3'], 'rv32_i'),
 ('pseudo_op-srli', ['31..25=0', '14..12=5', '6..2=0x04', '1..0=3'], 'rv32_i'),
 ('pseudo_op-srai',
  ['31..25=32', '14..12=5', '6..2=0x04', '1..0=3'],
  'rv32_i'),
 ('pseudo_op-slli_rv32',
  ['31..25=0', '14..12=1', '6..2=0x04', '1..0=3'],
  'rv32_i'),
 ('pseudo_op-srli_rv32',
  ['31..25=0', '14..12=5', '6..2=0x04', '1..0=3'],
  'rv32_i'),
 ('pseudo_op-srai_rv32',
  ['31..25=32', '14..12=5', '6..2=0x04', '1..0=3'],
  'rv32_i'),
 ('pseudo_op-zip',
  ['31..25=4', '24..20=15', '14..12=1', '6..2=4', '1..0=3'],
  'rv32_zbkb'),
 ('pseudo_op-unzip',
  ['31..25=4', '24..20=15', '14..12=5', '6..2=4', '1..0=3'],
  'rv32_zbkb'),
 ('pseudo_op-rdcycleh',
  ['19..15=0', '31..20=0xC80', '14..12=2', '6..2=0x1C', '1..0=3'],
  'rv32_zicntr'),
 ('pseudo_op-rdtimeh',
  ['19..15=0', '31..20=0xC81', '14..12=2', '6..2=0x1C', '1..0=3'],
  'rv32_zicntr'),
 ('pseudo_op-rdinstreth',
  ['19..15=0', '31..20=0xC82', '14..12=2', '6..2=0

In [9]:
# Tabulate the trimmed records; column labels are applied positionally to the three fields
df = pd.DataFrame(
    cut_opcodes,
    columns=['instruction', 'opcodes', 'extension'],
)