# Dataset generation notebook for COBOL -BR JSON automation

## Import the required libraries

In [1]:
import subprocess

In [2]:
try:
    import pathtlib
except ImportError:
    print("Library not found. Installing...")
    try:
        subprocess.check_call(["pip", "install", "pathlib"])
        print("Library installed successfully.")
        import pathlib  # Now try importing again
    except Exception as e:
        print(f"Failed to install library: {e}")


Library not found. Installing...
Library installed successfully.


In [3]:
try:
    import textwrap
except ImportError:
    print("Library not found. Installing...")
    try:
        subprocess.check_call(["pip", "install", "textwrap"])
        print("Library installed successfully.")
        import textwrap  # Now try importing again
    except Exception as e:
        print(f"Failed to install library: {e}")

In [4]:
try:
    import google.generativeai as genai
except ImportError:
    print("Library not found. Installing...")
    try:
        subprocess.check_call(["pip", "install", "google.generativeai"])
        print("Library installed successfully.")
        import google.generativeai as genai  # Now try importing again
    except Exception as e:
        print(f"Failed to install library: {e}")

In [5]:
from IPython.display import display
from IPython.display import Markdown
import time
import json
import pandas as pd

In [6]:
!pip install transformers





In [7]:
from transformers import LlamaTokenizer

In [8]:
directory_path_CBL = "./Test_COBOLs/InputFiles"
output_directory = "./Test_COBOLs/BR_JSON"

geminiApiKey = ''

with open('codeSystemMessage.txt','r') as file:
    code_systemMessage = file.read()

code_systemMessage

'# you take a COBOL program and create Business Rules for it.\n# you are GREAT at extracting the business rules from all business variables in the code.\n# A business rules are defined as everyday business logics that get coded into the program.\n# There is a clear distinction between a business rule and a trivial validation. A business rule MAY CHANGE over time, but a trivial validation does not.\n# DO NOT INCLUDE ANYTHING OTHER THAN THE JSON in your response.\n# Generation of multiple Business Rules from a COBOL file is preferable.\n# mention the business rule id in your response.\n# your answer MUST be a valid json format, with each property of a business rule clearly listed.\n# any \'\\n\' in the text fields MUST be \'\\\\n\' so that when reading it later on, we won\'t run into any issues\n\n# example input and output with a simple COBOL file\n# Input:\n       IDENTIFICATION DIVISION.\n       PROGRAM-ID. EXAMPLE.\n       DATA DIVISION.\n       WORKING-STORAGE SECTION.\n       77  N

## Get the paths of all CBL files

In [9]:
import glob

In [10]:
def get_cbl_files(directory):
    cbl_files = glob.glob(directory+ '/*.cbl', recursive=True)
    return cbl_files

cbl_files = get_cbl_files(directory_path_CBL)

len(cbl_files)

12

In [11]:
with open(cbl_files[4], 'r') as file:
    file_content = file.read()
print(file_content)

       IDENTIFICATION DIVISION.
       PROGRAM-ID.  CWBWDATE.
       ENVIRONMENT DIVISION.
       DATA DIVISION.
       WORKING-STORAGE SECTION.
       01  DATE-TABLE.
           05  FILLER                 PIC 9(4)    VALUE 0131.
           05  FILLER                 PIC 9(4)    VALUE 0228.
           05  FILLER                 PIC 9(4)    VALUE 0331.
           05  FILLER                 PIC 9(4)    VALUE 0430.
           05  FILLER                 PIC 9(4)    VALUE 0531.
           05  FILLER                 PIC 9(4)    VALUE 0630.
           05  FILLER                 PIC 9(4)    VALUE 0731.
           05  FILLER                 PIC 9(4)    VALUE 0831.
           05  FILLER                 PIC 9(4)    VALUE 0930.
           05  FILLER                 PIC 9(4)    VALUE 1031.
           05  FILLER                 PIC 9(4)    VALUE 1130.
           05  FILLER                 PIC 9(4)    VALUE 1231.
       01  DATE-TABLE-REDEFINED REDEFINES DATE-TABLE.
           05  DATE-FIELDS OCCURS 

## Count all tokens in a program

In [25]:
!pip install sentencepiece




[notice] A new release of pip is available: 23.3.2 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [26]:

file_content =""
for doc in cbl_files:
    with open(doc, 'r',encoding='latin-1') as file:
        file_content = file_content + file.read()

In [27]:
len(file_content)

6274080

In [12]:
genai.configure(api_key=geminiApiKey)
model = genai.GenerativeModel('gemini-pro')

def get_raw_text_gemini(file_content,systemMessage):
    response = model.generate_content(code_systemMessage+"\n\n"+file_content
                                    ,generation_config=genai.types.GenerationConfig(max_output_tokens=5000))
    # candidates=response.candidates[0].content.parts[0].text
#     print(response.prompt_feedback)
    return(response.text)

In [15]:
with open(cbl_files[7], 'r') as file:
    file_content = file.read()
    
# generate question and answer pairs
raw_response=get_raw_text_gemini(file_content,code_systemMessage)
raw_response

'{\n  "id": "BR-001",\n  "description": "The strategy for bidding on a card in the game is determined by the player\'s strategy code.",\n  "condition": "PLAYER-BID = 0",\n  "output": {}\n}'

In [16]:
print(raw_response)

{
  "id": "BR-001",
  "description": "The strategy for bidding on a card in the game is determined by the player's strategy code.",
  "condition": "PLAYER-BID = 0",
  "output": {}
}


In [17]:
import os

def read_cobol_file(file_path):
    # Check file size
    file_size = os.path.getsize(file_path)
    with open(file_path, 'r', encoding='latin-1') as file:
        return file.read()

i = 1
# Iterate through files in the directory
for filename in cbl_files:
    cobol_code = read_cobol_file(filename)
    # if cobol_code is not None:
    # Generate Markdown Output
    mark_output = get_raw_text_gemini(cobol_code, code_systemMessage)
    
    # get the output path name
    base, ext = os.path.splitext(filename)
    filename_without_extension = os.path.basename(base)
    output_filename = output_directory + "/Out_" + filename_without_extension + ".json"
    with open(output_filename,'w') as f:
    # write to the output file
        f.write(mark_output)
    print(f"Done with {i} files out of {len(cbl_files)}")
    i += 1

Done with 1 files out of 12
Done with 2 files out of 12
Done with 3 files out of 12
Done with 4 files out of 12
Done with 5 files out of 12
Done with 6 files out of 12
Done with 7 files out of 12
Done with 8 files out of 12
Done with 9 files out of 12
Done with 10 files out of 12
Done with 11 files out of 12
Done with 12 files out of 12
