## String splitter

In [None]:
import re
import numpy as np
import json

def split(delimiters, string, maxsplit=0):
    """Split ``string`` on any of the literal ``delimiters``.

    Args:
        delimiters: Iterable of delimiter strings. Each one is matched
            literally (regex metacharacters are escaped). Empty delimiters
            are ignored because an empty alternative would match between
            every character.
        string: The text to split.
        maxsplit: Maximum number of splits to perform; ``0`` means no limit.

    Returns:
        A list of the pieces of ``string``. If no usable delimiter is given,
        the list contains ``string`` unchanged.
    """
    usable = [delimiter for delimiter in delimiters if delimiter]
    if not usable:
        return [string]
    regex_pattern = '|'.join(map(re.escape, usable))
    # maxsplit is passed by keyword: positional use is deprecated in re.split.
    return re.split(regex_pattern, string, maxsplit=maxsplit)

## File opener

## If, case, and always statement extractor

In [None]:
import numpy as np
import re

def ifCaseFinder(currentFile):
    """Collect the if/case blocks found in Verilog/SystemVerilog source lines.

    The scan alternates between two phases: looking for a line that starts
    an if/case block, then accumulating lines until the begin/end (or
    case/endcase) nesting returns to zero. A directly following ``else``
    line is folded into the same block.

    Args:
        currentFile: List of source lines (as returned by ``readlines()``).

    Returns:
        ``[variables, fileStrings]`` where

        * ``variables`` is ``np.array(([], []))``; nothing in this function
          fills it.
        * ``fileStrings`` is a NumPy array built from four parallel lists,
          one column per extracted block: row 0 is the "inside an always
          block" flag, row 1 the always-block argument text, row 2 the
          if/case argument text, row 3 the full block text. Because ints and
          strings are mixed, NumPy stores every entry as a string once at
          least one block exists.

    Known limitations (unchanged): the header line always counts as exactly
    one nesting level, so a single-statement ``if`` without ``begin`` keeps
    accumulating until some unmatched ``end``; ``/* */`` comments are only
    skipped while inside a block.
    """
    # Whole-word keyword patterns for the nesting counter. Substring tests
    # would treat "endcase" as both an opener ("case") and a closer ("end"),
    # so a case block would never close, and identifiers such as "pending"
    # would close blocks early.
    openKeyword = re.compile(r'\b(?:begin|case[xz]?)\b')
    closeKeyword = re.compile(r'\b(?:end|endcase)\b')

    ifCaseFound = 0         # counter for if/case nesting
    alwaysCheck = 0         # checking if synchronous
    currentLine = 0         # counter for current line
    elseFound = 0           # checking if next line is an else statement
    multiLine = False       # True while inside a /* ... */ comment
    fileString = ""         # stores the current if/case block text
    syncArgText = ""        # arguments of always statement
    ifCaseArgText = ""      # arguments of if/case statement
    variables = ([], [])    # output/input, variable (unchanged in this version)
    fileStrings = ([], [], [], [])  # asynchronous/synchronous, sync args, if/case args, full block text

    while currentLine < len(currentFile):
        # Look for the start of an if/case block
        while ifCaseFound == 0 and currentLine < len(currentFile):
            # Drop any trailing // comment before inspecting the line
            currentLineText = currentFile[currentLine].split("//", 1)[0]

            # Check for an always block indicator (@) and extract its arguments;
            # lines containing "*" (e.g. @(*) / @*) are not treated as synchronous
            if "@" in currentLineText and "*" not in currentLineText:
                m = re.search(r'\((.*?)\)', currentLineText)
                if m:
                    syncArgText = m.group(1)
                alwaysCheck = 1

            # Check for leaving an always block
            if "\tend" in currentLineText or "\nend" in currentLineText or " end" in currentLineText:
                syncArgText = ""
                alwaysCheck = 0

            # Check for an if or case statement (various spacing variations)
            if ((" if " in currentLineText or " if(" in currentLineText or "\tif " in currentLineText or "\nif " in currentLineText) or
                (" case" in currentLineText or "\tcase" in currentLineText or "\ncase" in currentLineText)):
                m = re.search(r'\((.*?)\)', currentLineText)
                # Reset when there is no match so the previous block's
                # arguments are not reported for this one.
                ifCaseArgText = m.group(1) if m else ""
                fileString += currentLineText  # start accumulating the block text
                ifCaseFound += 1
            currentLine += 1

        # End of file reached without finding another header: nothing to store.
        # (A header found on the very last line is still stored below.)
        if ifCaseFound == 0:
            break

        # Process the if/case block until its end
        while (ifCaseFound != 0 or elseFound == 1) and currentLine < len(currentFile):
            elseFound = 0
            currentLineText = currentFile[currentLine].split("//", 1)[0]

            # Handle multi-line comments: skip lines until the closing */
            if multiLine:
                if "*/" in currentLineText:
                    multiLine = False
                currentLine += 1
                continue

            if "/*" in currentLineText:
                multiLine = True
                currentLineText = currentLineText.split("/*", 1)[0]

            # Ensure line ends with a newline character
            if "\n" not in currentLineText:
                currentLineText += "\n"
            fileString += currentLineText

            # Increase/decrease nesting based on whole-word keywords
            # (at most one level up and one level down per line)
            if openKeyword.search(currentLineText):
                ifCaseFound += 1
            if closeKeyword.search(currentLineText):
                ifCaseFound -= 1

            currentLine += 1

            # If the block ended and the next line is an "else", include it in the same block
            if currentLine < len(currentFile) and ifCaseFound == 0:
                nextLine = currentFile[currentLine].split("//", 1)[0]
                if "else " in nextLine:
                    elseFound = 1

        # Store the collected data
        fileStrings[0].append(alwaysCheck)
        fileStrings[1].append(syncArgText)
        fileStrings[2].append(ifCaseArgText)
        fileStrings[3].append(fileString)
        fileString = ""  # reset for next block

    variables = np.array(variables)
    fileStrings = np.array(fileStrings)
    return [variables, fileStrings]


In [None]:
def readFile(file_name):
    """Read ``file_name`` and extract its if/case blocks.

    Args:
        file_name: Path of the source file to read.

    Returns:
        The ``[variables, fileStrings]`` list produced by ``ifCaseFinder``
        for the file's lines.
    """
    # The context manager closes the file even if reading raises.
    with open(file_name, 'r') as file_object:
        currentFile = file_object.readlines()   # one string per line
    return ifCaseFinder(currentFile)

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load the tokenizer, the fp16 base model and the fine-tuned adapter onto one GPU.
gpu_id = "cuda:0"
base_model = "deepseek-ai/deepseek-coder-6.7b-instruct"
adapter_path = "./Models/DeepSeek-6-7b-instruct-Verilog-sync"


device = torch.device(gpu_id)
torch.cuda.set_device(device)
tokenizer = AutoTokenizer.from_pretrained(adapter_path, trust_remote_code=True)

# The base model is loaded once, in half precision, directly onto the GPU.
# (An earlier default-precision load of the same checkpoint was immediately
# overwritten by this one and has been removed.)
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    torch_dtype=torch.float16,
    device_map=gpu_id
)

# NOTE(review): presumably load_adapter() already activates the adapter and
# `active_adapters` is a method on the model in current transformers releases;
# confirm that this assignment is needed (set_adapter() may be the intended API).
adapter_name = model.load_adapter(adapter_path)
model.active_adapters = adapter_name

In [None]:
from transformers import StoppingCriteria, StoppingCriteriaList

class StoppingCriteriaSub(StoppingCriteria):
    """Stop generation when the newest token decodes to a stop word's last token.

    Args:
        stops: Token-id tensors, one per stop word. Only the last id of each
            tensor is compared. Defaults to no stop words.
        encounters: Accepted for backward compatibility; not used.
    """

    def __init__(self, stops=None, encounters=1):
        super().__init__()
        # None instead of a mutable [] default; tensors are moved to the GPU.
        if stops is None:
            stops = []
        self.stops = [stop.to("cuda:0") for stop in stops]

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs):
        """Return True when the last token of the first sequence matches a stop token.

        Only ``input_ids[0]`` is inspected. The comparison is done on decoded
        text, using the module-level ``tokenizer``. Extra keyword arguments
        are accepted and ignored.
        """
        last_token = input_ids[0][-1]  # the generated token
        for stop in self.stops:
            if tokenizer.decode(stop[-1]) == tokenizer.decode(last_token):
                return True  # stop the generation
        return False


# Words whose final token ends generation.
stop_words = ["EOT"]

# Tokenize each stop word into a 1-D tensor of token ids.
stop_words_ids = [
    tokenizer(word, return_tensors='pt')['input_ids'].squeeze()
    for word in stop_words
]
print(stop_words_ids)

# Wrap the custom criterion in the list type that generate() accepts.
stopping_criteria = StoppingCriteriaList(
    [StoppingCriteriaSub(stops=stop_words_ids)]
)

## Files to be scanned

In [None]:
import glob
import os
import shutil

namlist = [""]

# Instruction preamble shared by the synchronous and asynchronous prompts.
SYSTEM_PROMPT = "You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer. \n### Instruction:"

for name in namlist:
    filedirectory = "./RTL_deepseek/" + name         # directory of the files
    #filedirectory = "./TEST"         #directory of the files

    # Collect every .sv / .v file in the directory
    filesToOpen = []
    for file_name in os.listdir(filedirectory):
        if file_name.endswith((".sv", ".v")):
            filesToOpen.append(filedirectory + "/" + file_name)

    print("Files being asserted", filesToOpen)
    for file_name in filesToOpen:
        print(file_name)  # print the file name
        # [variables, fileStrings] for this file; fileStrings rows are
        # always-flag, sync args, if/case args, block text
        ifElseCaseArray = readFile(file_name)
        with open(file_name + ".txt", "w") as f:
            for clk_condition, code in enumerate(ifElseCaseArray[1][3]):
                # Blocks of 500 characters or more are skipped
                if len(code) >= 500:
                    continue

                syncArgs = ifElseCaseArray[1][1][clk_condition]
                PromptString = SYSTEM_PROMPT
                if len(syncArgs) != 0:  # Synchronous
                    PromptString += "Generate a list of synchronous systemverilog assertion executing at "
                    # Only the first two whitespace-separated words of the always arguments are used
                    PromptString += str(' '.join(syncArgs.split()[:2]))
                else:  # Asynchronous
                    PromptString += "Generate a list of asynchronous systemverilog assertion"
                PromptString += " from the following code \n"
                PromptString += str(code)
                PromptString += "\n ### Response: \n\n"
                print(PromptString)

                input_text = PromptString
                inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
                outputs = model.generate(**inputs, max_length=5000, stopping_criteria=stopping_criteria,
                                         eos_token_id=tokenizer.eos_token_id, pad_token_id=tokenizer.pad_token_id)
                output = tokenizer.decode(outputs[0], skip_special_tokens=True)
                # The final character is dropped; per the original note this
                # removes the token the LLM stops at.
                print(output[:-1])
                f.write(output[:-1])

    # make a directory to store the files
    foldername = filedirectory + "/" + name + "_assertions"
    os.makedirs(foldername, exist_ok=True)

    # Move all generated .txt files into the directory. Done without a shell
    # so paths containing spaces or shell metacharacters are handled; the full
    # destination path is given so an existing file is replaced, as `mv` does.
    for txt_path in glob.glob(os.path.join(filedirectory, "*.txt")):
        shutil.move(txt_path, os.path.join(foldername, os.path.basename(txt_path)))

## Printing the extracted statements

In [None]:
# NOTE(review): bare `exit` with no call — in plain Python this expression does
# nothing; presumably it relies on the notebook/IPython shell to stop the
# session. Confirm the intent.
exit