# Dataset generation notebook for COBOL -BR JSON automation

## Import the required libraries

In [1]:
import subprocess

In [2]:
try:
    import pathtlib
except ImportError:
    print("Library not found. Installing...")
    try:
        subprocess.check_call(["pip", "install", "pathlib"])
        print("Library installed successfully.")
        import pathlib  # Now try importing again
    except Exception as e:
        print(f"Failed to install library: {e}")


Library not found. Installing...
Library installed successfully.


In [3]:
try:
    import textwrap
except ImportError:
    print("Library not found. Installing...")
    try:
        subprocess.check_call(["pip", "install", "textwrap"])
        print("Library installed successfully.")
        import textwrap  # Now try importing again
    except Exception as e:
        print(f"Failed to install library: {e}")

In [4]:
try:
    import google.generativeai as genai
except ImportError:
    print("Library not found. Installing...")
    try:
        subprocess.check_call(["pip", "install", "google.generativeai"])
        print("Library installed successfully.")
        import google.generativeai as genai  # Now try importing again
    except Exception as e:
        print(f"Failed to install library: {e}")

In [5]:
from IPython.display import display
from IPython.display import Markdown
import time
import json
import pandas as pd

In [6]:
!pip install transformers




[notice] A new release of pip is available: 23.3.2 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [7]:
from transformers import LlamaTokenizer

In [20]:
directory_path_CBL = "C:/Users/chira/OneDrive/Documents/Research Work/COBOL/COBREX_CLI/COBREX_CLI/COBOL_Files"
directory_path_COB = "C:/Users/chira/OneDrive/Documents/Research Work/COBOL/COBREX_CLI/COBREX_CLI/ALL_COB"

geminiApiKey = 'AIzaSyABQ7adBwnlHTLO8JgIh3w33kqihU5LjQY'

with open('codeSystemMessage.txt','r') as file:
    code_systemMessage = file.read()

code_systemMessage

'# you take a COBOL program and create Business Rules for it.\n# you are GREAT at extracting the business rules from all business variables in the code.\n# A business rules are defined as everyday business logics that get coded into the program.\n# There is a clear distinction between a business rule and a trivial validation. A business rule MAY CHANGE over time, but a trivial validation does not.\n# DO NOT INCLUDE ANYTHING OTHER THAN THE JSON in your response.\n# You  must generate multiple business rules from the given program.\n# mention the business rule id in your response.\n# your answer MUST be a valid json format, with each property of a business rule clearly listed.\n# any \'\\n\' in the text fields MUST be \'\\\\n\' so that when reading it later on, we won\'t run into any issues\n\n# example input and output with a simple COBOL file\n# Input:\n       IDENTIFICATION DIVISION.\n       PROGRAM-ID. EXAMPLE.\n       DATA DIVISION.\n       WORKING-STORAGE SECTION.\n       77  NUM  

## Get the paths of all CBL files

In [21]:
import glob

In [23]:
def get_cbl_files(directory):
    cbl_files = glob.glob(directory+ '/*/*.cbl', recursive=True)
    return cbl_files

cbl_files = get_cbl_files(directory_path_CBL)

len(cbl_files)

562

In [24]:
with open(cbl_files[4], 'r') as file:
    file_content = file.read()
print(file_content)

      $set sourceformat"free"
       program-id. CS00001S.
      *>    
      *>                        Gerenciador de Chamadas Assincronas
      *>
       environment division.
       configuration section.
            special-names. decimal-point is comma.      


       data division.      
      
       working-storage section.
       
       78   c-versao                               value "a".
       78   c-este-programa                        value "CS00001S".

       linkage section.
                 
       copy CSL00900.cpy.
                                                                          
       procedure division using lnk-par.
       

       0000-controle section.
            perform 1000-inicializacao
            perform 2000-processamento
            perform 3000-finalizacao.
       0000-saida.    
            exit program
            stop run
       exit.
       
       1000-inicializacao section.
       
       
       exit.
        
       2000-processament

## Count all tokens in a program

In [25]:
!pip install sentencepiece




[notice] A new release of pip is available: 23.3.2 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [26]:

file_content =""
for doc in cbl_files:
    with open(doc, 'r',encoding='latin-1') as file:
        file_content = file_content + file.read()

In [27]:
len(file_content)

6274080

In [49]:
genai.configure(api_key=geminiApiKey)
model = genai.GenerativeModel('gemini-pro')

def get_raw_text_gemini(file_content,systemMessage):
    response = model.generate_content(code_systemMessage+"\n\n"+file_content
                                    ,generation_config=genai.types.GenerationConfig(max_output_tokens=5000))
    # candidates=response.candidates[0].content.parts[0].text
#     print(response.prompt_feedback)
    return(response.text)

In [1]:
with open(cbl_files[7], 'r') as file:
    file_content = file.read()
    
# generate question and answer pairs
raw_response=get_raw_text_gemini(file_content,code_systemMessage)
raw_response

NameError: name 'cbl_files' is not defined

In [1]:
print(file_content)

NameError: name 'file_content' is not defined

In [36]:
print(raw_response)

{
  "id": "BR-001",
  "description": "If the user is not a master user, the program checks if the user has access to the selected module.",
  "condition": "not lnk-login-master",
  "output": {
    "access_granted": "set ws-usuario-acesso             to true",
    "access_denied": "set ws-usuario-sem-acesso              to true"
  }
}


In [47]:
cbl_files[133]

'C:/Users/chira/OneDrive/Documents/Research Work/COBOL/COBREX_CLI/COBREX_CLI/COBOL_Files\\cicsdev_cics-async-api-redbooks\\WEBHOME.cbl'

In [50]:
import os

def read_cobol_file(file_path):
    # Check file size
    file_size = os.path.getsize(file_path)
    if file_size < 10240:  # 10KB in bytes
        with open(file_path, 'r', encoding='latin-1') as file:
            return file.read()
    else:
        return None  # Skip files larger than 10KB

data_list = []
i = 1
# Iterate through files in the directory
for filename in cbl_files:
    cobol_code = read_cobol_file(filename)
    if cobol_code is not None:
        # Generate JSON output
        br_json = get_raw_text_gemini(cobol_code, code_systemMessage)  
        # Append data to the list
        data_list.append({'Cobol_code': cobol_code, 'BR_json': br_json})
        print(f"Done with {i} files out of {len(cbl_files)}")
        i += 1
    else:
        print(f"Skipped {filename} as it exceeds 10KB")

# Create DataFrame
df = pd.DataFrame(data_list)

# Save DataFrame to CSV
df.to_csv('dataset_cbl.csv', index=False)


Skipped C:/Users/chira/OneDrive/Documents/Research Work/COBOL/COBREX_CLI/COBREX_CLI/COBOL_Files\abrignoli_COBSOFT\CS00000F.cbl as it exceeds 10KB
Done with 1 files out of 562
Done with 2 files out of 562
Done with 3 files out of 562
Done with 4 files out of 562
Done with 5 files out of 562
Done with 6 files out of 562
Done with 7 files out of 562
Done with 8 files out of 562
Done with 9 files out of 562
Skipped C:/Users/chira/OneDrive/Documents/Research Work/COBOL/COBREX_CLI/COBREX_CLI/COBOL_Files\abrignoli_COBSOFT\CS00005S.cbl as it exceeds 10KB
Done with 10 files out of 562
Done with 11 files out of 562
Done with 12 files out of 562
Skipped C:/Users/chira/OneDrive/Documents/Research Work/COBOL/COBREX_CLI/COBREX_CLI/COBOL_Files\abrignoli_COBSOFT\CS00103S.cbl as it exceeds 10KB
Done with 13 files out of 562
Done with 14 files out of 562
Skipped C:/Users/chira/OneDrive/Documents/Research Work/COBOL/COBREX_CLI/COBREX_CLI/COBOL_Files\abrignoli_COBSOFT\CS00106S.cbl as it exceeds 10KB
Skipp

Done with 52 files out of 562
Done with 53 files out of 562
Done with 54 files out of 562
Done with 55 files out of 562
Done with 56 files out of 562
Done with 57 files out of 562
Done with 58 files out of 562
Done with 59 files out of 562
Done with 60 files out of 562
Done with 61 files out of 562
Done with 62 files out of 562
Done with 63 files out of 562
Done with 64 files out of 562
Done with 65 files out of 562
Done with 66 files out of 562
Done with 67 files out of 562
Done with 68 files out of 562
Done with 69 files out of 562
Done with 70 files out of 562
Done with 71 files out of 562
Done with 72 files out of 562
Done with 73 files out of 562
Done with 74 files out of 562
Done with 75 files out of 562
Skipped C:/Users/chira/OneDrive/Documents/Research Work/COBOL/COBREX_CLI/COBREX_CLI/COBOL_Files\cicsdev_cics-async-api-credit-card-application-example\ASYNCPNT.cbl as it exceeds 10KB
Done with 76 files out of 562
Done with 77 files out of 562
Done with 78 files out of 562
Done wi

Done with 183 files out of 562
Done with 184 files out of 562
Done with 185 files out of 562
Skipped C:/Users/chira/OneDrive/Documents/Research Work/COBOL/COBREX_CLI/COBREX_CLI/COBOL_Files\IBM_example-health-apis\HCMRESTW.cbl as it exceeds 10KB
Done with 186 files out of 562
Done with 187 files out of 562
Done with 188 files out of 562
Done with 189 files out of 562
Done with 190 files out of 562
Done with 191 files out of 562
Done with 192 files out of 562
Done with 193 files out of 562
Done with 194 files out of 562
Done with 195 files out of 562
Done with 196 files out of 562
Done with 197 files out of 562
Skipped C:/Users/chira/OneDrive/Documents/Research Work/COBOL/COBREX_CLI/COBREX_CLI/COBOL_Files\IBM_idz-utilities\GAM0VDB.cbl as it exceeds 10KB
Skipped C:/Users/chira/OneDrive/Documents/Research Work/COBOL/COBREX_CLI/COBREX_CLI/COBOL_Files\IBM_idz-utilities\GAM0VII.cbl as it exceeds 10KB
Skipped C:/Users/chira/OneDrive/Documents/Research Work/COBOL/COBREX_CLI/COBREX_CLI/COBOL_Fil

Done with 341 files out of 562
Done with 342 files out of 562
Skipped C:/Users/chira/OneDrive/Documents/Research Work/COBOL/COBREX_CLI/COBREX_CLI/COBOL_Files\shamrice_COBOL-RSS-Reader\rss_reader_help.cbl as it exceeds 10KB
Skipped C:/Users/chira/OneDrive/Documents/Research Work/COBOL/COBREX_CLI/COBREX_CLI/COBOL_Files\shamrice_COBOL-RSS-Reader\rss_reader_menu.cbl as it exceeds 10KB
Done with 343 files out of 562
Done with 344 files out of 562
Skipped C:/Users/chira/OneDrive/Documents/Research Work/COBOL/COBREX_CLI/COBREX_CLI/COBOL_Files\shamrice_COBOL-RSS-Reader\rss_report_writer.cbl as it exceeds 10KB
Done with 345 files out of 562
Skipped C:/Users/chira/OneDrive/Documents/Research Work/COBOL/COBREX_CLI/COBREX_CLI/COBOL_Files\ShaunLawrie_TicTacTOBOL\TicTacTOBOL.cbl as it exceeds 10KB
Done with 346 files out of 562
Done with 347 files out of 562
Skipped C:/Users/chira/OneDrive/Documents/Research Work/COBOL/COBREX_CLI/COBREX_CLI/COBOL_Files\thinkinglabs_cobol-fizzbuzz-kata\ZUTZCPC.CBL as

Done with 418 files out of 562
Skipped C:/Users/chira/OneDrive/Documents/Research Work/COBOL/COBREX_CLI/COBREX_CLI/COBOL_Files\z390development_z390\U1.CBL as it exceeds 10KB
Skipped C:/Users/chira/OneDrive/Documents/Research Work/COBOL/COBREX_CLI/COBREX_CLI/COBOL_Files\z390development_z390\U2.CBL as it exceeds 10KB
Skipped C:/Users/chira/OneDrive/Documents/Research Work/COBOL/COBREX_CLI/COBREX_CLI/COBOL_Files\z390development_z390\U4.CBL as it exceeds 10KB
Skipped C:/Users/chira/OneDrive/Documents/Research Work/COBOL/COBREX_CLI/COBREX_CLI/COBOL_Files\z390development_z390\U6.CBL as it exceeds 10KB
Skipped C:/Users/chira/OneDrive/Documents/Research Work/COBOL/COBREX_CLI/COBREX_CLI/COBOL_Files\z390development_z390\U6B.CBL as it exceeds 10KB
Skipped C:/Users/chira/OneDrive/Documents/Research Work/COBOL/COBREX_CLI/COBREX_CLI/COBOL_Files\z390development_z390\UPROGRAM.CBL as it exceeds 10KB
Done with 419 files out of 562
Skipped C:/Users/chira/OneDrive/Documents/Research Work/COBOL/COBREX_CLI/