In [None]:
from __future__ import annotations
import json
from pathlib import Path
from typing import Iterable, Dict, Any, Optional
from itertools import islice

# Directory that holds the dataset files read by this notebook.
# NOTE(review): absolute, machine-specific path — adjust before running on another machine.
BASE_DIR = Path("/Users/chunghyunhan/Desktop/Columbia Folder/25-Fall/Advanced Topics in IEOR_Agentic AI and Data Economy/FSM-Fine-Tuning-Dataset")
# Dataset basenames, given without an extension; resolve_jsonl() falls back to "<name>.jsonl".
FILE_BASENAMES = ["comment_code", "requirement_fsm_code"]

def resolve_jsonl(path: Path) -> Path:
    """Return the existing file that ``path`` refers to, with or without ``.jsonl``.

    Candidates are tried in this order:

    1. ``path`` exactly as given;
    2. ``path`` with its last suffix replaced by ``.jsonl``;
    3. ``path`` with ``.jsonl`` appended to the full name, so that a dotted
       basename such as ``data.v1`` resolves to ``data.v1.jsonl``.

    Args:
        path: File path, optionally missing the ``.jsonl`` extension.

    Returns:
        The first candidate that is an existing regular file.

    Raises:
        FileNotFoundError: If none of the candidates exists.
    """
    if path.is_file():
        return path

    replaced = path.with_suffix(".jsonl")
    # with_suffix() swaps out an existing suffix ("data.v1" -> "data.jsonl"),
    # so also try appending the extension to the untouched name.
    appended = path.with_name(path.name + ".jsonl")
    fallbacks = [replaced]
    if appended != replaced:
        fallbacks.append(appended)

    for candidate in fallbacks:
        if candidate.is_file():
            return candidate

    tried = " / ".join(str(p) for p in [path, *fallbacks])
    raise FileNotFoundError(f"Could not find the file: {tried}")

def read_jsonl(path: Path, *, strict: bool=False) -> Iterable[Dict[str, Any]]:
    """Lazily yield one parsed JSON value per non-blank line of a JSONL file.

    The file is located with ``resolve_jsonl`` when iteration starts, so a
    missing file surfaces on the first ``next()`` rather than at call time.

    Args:
        path: Path to the file, with or without the ``.jsonl`` extension.
        strict: When true, re-raise the first ``json.JSONDecodeError``.
            When false, print a warning for the bad line and skip it.

    Yields:
        The value decoded by ``json.loads`` for each non-blank line.

    Raises:
        FileNotFoundError: If ``resolve_jsonl`` cannot find the file.
        json.JSONDecodeError: On a malformed line when ``strict`` is true.
    """
    p = resolve_jsonl(path)
    # "utf-8-sig" decodes plain UTF-8 identically but also drops a leading
    # byte-order mark, which json.loads would otherwise reject on line 1.
    with p.open("r", encoding="utf-8-sig") as f:
        for i, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError as e:
                if strict:
                    raise
                print(f"[warn] {p.name}:{i} JSONDecodeError: {e}")
                continue
            # Yield outside the try so errors raised by the consumer are never
            # mistaken for decode failures of this line.
            yield record

def head(iterable, n: int):
    """Return the first ``n`` items of ``iterable`` as a list (fewer if it runs out)."""
    leading = islice(iterable, n)
    return [item for item in leading]

def preview_file(basename: str, n_preview: int = 3, max_chars: int = 200):
    """Print the first records of a dataset file, truncating long field values.

    Args:
        basename: File name relative to ``BASE_DIR``; the joined path is
            handed to ``read_jsonl``.
        n_preview: Maximum number of records to print.
        max_chars: Maximum number of characters shown per field value. Longer
            values are cut and suffixed with ``"..."``. Negative values are
            treated as 0.
    """
    path = BASE_DIR / basename
    rows = head(read_jsonl(path), n_preview)
    limit = max(max_chars, 0)
    print(f"\n=== {basename} (preview {len(rows)} rows) ===")
    for idx, row in enumerate(rows, 1):
        keys = list(row.keys())
        show = {}
        for k in keys:
            # Stringify once; every value is shown as text, truncated if needed.
            text = str(row[k])
            show[k] = (text[:limit] + "...") if len(text) > limit else text
        print(f"[{idx}] keys={keys}")
        print(show)

def count_rows(basename: str, max_errors: int = 10) -> int:
    """Count the records that ``read_jsonl`` yields for a dataset file.

    Args:
        basename: File name relative to ``BASE_DIR``.
        max_errors: Currently unused; kept in the signature so existing
            callers that pass it keep working.

    Returns:
        The number of records yielded by ``read_jsonl`` for the file.

    Raises:
        FileNotFoundError: Propagated from ``read_jsonl`` when the file
            cannot be located.
    """
    path = BASE_DIR / basename
    return sum(1 for _ in read_jsonl(path))

# --- Run: preview the top 3 records and count rows for both files ---
for name in FILE_BASENAMES:
    try:
        preview_file(name, n_preview=3)
    except FileNotFoundError as e:
        # Report the missing file and keep going, consistent with the
        # row-count loop below, so one absent file does not abort the cell.
        print(e)

print("\n=== Row count ===")
for name in FILE_BASENAMES:
    try:
        total = count_rows(name)
        print(f"{name}: {total:,} lines")
    except FileNotFoundError as e:
        print(e)



=== comment_code (preview 3 rows) ===
[1] keys=['comment', 'function_code', 'version']
{'comment': '/* Initializes contract with initial supply tokens to the creator of the contract */', 'function_code': 'function KEKEcon(){\r\n        balanceOf[msg.sender] = 100000000000000000; // Give the creator all initial tokens\r\n        totalSupply = 100000000000000000;                        // Update total supply...', 'version': '0.4.23'}
[2] keys=['comment', 'function_code', 'version']
{'comment': '/* Internal transfer, only can be called by this contract */', 'function_code': 'function _transfer(address _from, address _to, uint _value) internal {\r\n        require (_to != 0x0);                               // Prevent transfer to 0x0 address. Use burn() instead\r\n        requ...', 'version': '0.4.23'}
[3] keys=['comment', 'function_code', 'version']
{'comment': '/// @notice Remove `_value` tokens from the system irreversibly\n/// @param _value the amount of money to burn', 'function_code

In [11]:

import pandas as pd
def load_dataframe(basename: str) -> "pd.DataFrame":
    """Read every record of a dataset file into a pandas DataFrame.

    Args:
        basename: File name relative to ``BASE_DIR``; the joined path is
            handed to ``read_jsonl``.

    Returns:
        A DataFrame built from the full list of records.
    """
    records = [record for record in read_jsonl(BASE_DIR / basename)]
    return pd.DataFrame(records)
# Load each dataset fully into memory as its own DataFrame.
df_comment = load_dataframe("comment_code")
df_req_fsm_code = load_dataframe("requirement_fsm_code")


In [12]:
# Show the comment/code frame using the notebook's rich DataFrame rendering.
display(df_comment)

Unnamed: 0,comment,function_code,version
0,/* Initializes contract with initial supply to...,function KEKEcon(){\r\n balanceOf[msg.s...,0.4.23
1,"/* Internal transfer, only can be called by th...","function _transfer(address _from, address _to,...",0.4.23
2,/// @notice Remove `_value` tokens from the sy...,function burn(uint256 _value) returns (bool su...,0.4.23
3,/**\r\n * @dev Distributes the rewards to the ...,"function multisend(address[] dests, uint256[] ...",0.4.25
4,//human 0.1 standard. Just an arbitrary versio...,function Breakbits(\r\n\r\n ) {\r\n\r\n...,0.4.25
...,...,...,...
9629,// changed here each value to one for unique n...,"function _mintBatch(\r\n address to,\r\...",0.8.7
9630,/// Mint tokens when the release stage is whit...,function whitelistMint(uint256 qty) external p...,0.8.9
9631,/// Claim tokens for team/giveaways.\n/// @par...,function ownerClaim(address[] calldata address...,0.8.9
9632,/// Returns rewards for address for reward act...,function getRewardRate(address from) external ...,0.8.9


In [13]:
# Show the requirement/FSM/code frame using the notebook's rich DataFrame rendering.
display(df_req_fsm_code)

Unnamed: 0,user_requirement,FSM,version,code
0,This smart contract is a [Zap-in transaction c...,"{\n ""contractName"": ""ZapInContract"",\n ""...",0.5.17,// SPDX-License-Identifier: MIT\npragma solidi...
1,This smart contract is a [token contract] desi...,"{\n ""contractName"": ""KikiInuToken"",\n ""i...",0.8.9,// SPDX-License-Identifier: MIT\npragma solidi...
2,This smart contract is an [atomic swap contrac...,"{\n ""contractName"": ""NebliDex_AtomicSwap_ER...",0.4.26,pragma solidity ^0.4.26;\r\n\r\n//This is the ...
3,This smart contract is a [decentralized exchan...,"{\n ""contractName"": ""UniswapERC20"",\n ""i...",0.5.11,pragma solidity ^0.5.11;\r\n\r\n\r\n\r\nlibrar...
4,This smart contract is an [auction contest con...,"{\n ""contractName"": ""AuctionContest"",\n ...",0.5.0,pragma solidity ^0.5.0;\n\ncontract AuctionCon...
...,...,...,...,...
21971,This smart contract is a **decentralized treas...,"{\n ""contractName"": ""TreasuryVault"",\n ""...",0.6.6,// SPDX-License-Identifier: MIT\npragma solidi...
21972,This smart contract is a [reward distribution ...,"{\n ""contractName"": ""RewardDistributionPool...",0.6.12,// SPDX-License-Identifier: MIT\npragma solidi...
21973,This smart contract is a [token contract] call...,"{\n ""contractName"": ""CannabanC"",\n ""inhe...",0.4.25,pragma solidity ^0.4.18;\r\n\r\n// -----------...
21974,This smart contract is a [crowdsale contract] ...,"{\n ""contractName"": ""CrowdsaleContract"",\n ...",0.6.4,// SPDX-License-Identifier: MIT\npragma solidi...


In [15]:
# Export both frames as CSV. The relative file names resolve against the
# kernel's current working directory; index=False leaves the DataFrame index
# out of the written files.
df_comment.to_csv("comment_code.csv", index=False)
df_req_fsm_code.to_csv("requirement_fsm_code.csv", index=False)
print("Saved comment_code.csv and requirement_fsm_code.csv")

Saved comment_code.csv and requirement_fsm_code.csv
