In [6]:
from transformers import RobertaTokenizer
import pandas as pd

# Tokenizer published with the microsoft/codebert-base checkpoint (RoBERTa tokenizer class)
tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")

# Read the workbook and keep the first two non-null entries of the code column
df = pd.read_excel("/content/Regraded_Volunteering_Grader2.xlsx")
code_column = "Code_with_Error"  # Adjust if needed
non_null_codes = df[code_column].dropna()
sample_codes = non_null_codes.head(2).tolist()

# For each sample: echo the source, then show its tokens and their vocabulary ids
for i, code in enumerate(sample_codes):
    print(f"\n Sample {i+1} Original Code:\n{code}\n")

    # tokenize() returns the token strings; the ids are looked up in a second step
    tokens = tokenizer.tokenize(code)
    token_ids = tokenizer.convert_tokens_to_ids(tokens)

    print(f" Number of Tokens: {len(tokens)}")
    print(" Tokens:", tokens, sep="\n")
    print("\n Token IDs:", token_ids, sep="\n")



 Sample 1 Original Code:

def prime_factors(n):
    factors = []
    divisor = 2
    while n > 1:
        while n // divisor == 0:
            factors.append(divisor)
            n /= divisor
        divisor += 2
    return factors

number = 56
print(f"Prime factors of {number} are: {prime_factors(number)}")


 Number of Tokens: 136
 Tokens:
['Ċ', 'def', 'Ġprime', '_', 'fact', 'ors', '(', 'n', '):', 'Ċ', 'Ġ', 'Ġ', 'Ġ', 'Ġfactors', 'Ġ=', 'Ġ[]', 'Ċ', 'Ġ', 'Ġ', 'Ġ', 'Ġdiv', 'is', 'or', 'Ġ=', 'Ġ2', 'Ċ', 'Ġ', 'Ġ', 'Ġ', 'Ġwhile', 'Ġn', 'Ġ>', 'Ġ1', ':', 'Ċ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġwhile', 'Ġn', 'Ġ//', 'Ġdiv', 'is', 'or', 'Ġ==', 'Ġ0', ':', 'Ċ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġfactors', '.', 'append', '(', 'div', 'is', 'or', ')', 'Ċ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġn', 'Ġ/', '=', 'Ġdiv', 'is', 'or', 'Ċ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġdiv', 'is', 'or', 'Ġ+=', 'Ġ2', 'Ċ', 'Ġ', 'Ġ', 'Ġ', 'Ġreturn', 'Ġfactors', 'Ċ', 'Ċ', 'numbe

In [7]:
import pandas as pd

def chunk_code_by_lines(code, chunk_size):
    """Break `code` into consecutive pieces of at most `chunk_size` lines.

    The text is split on '\\n'; each returned piece is the corresponding
    run of lines re-joined with '\\n'. A missing value (as detected by
    `pd.isna`) yields an empty list.
    """
    # Missing entries have nothing to split
    if pd.isna(code):
        return []

    all_lines = code.split('\n')
    pieces = []
    for start in range(0, len(all_lines), chunk_size):
        window = all_lines[start:start + chunk_size]
        pieces.append('\n'.join(window))
    return pieces

# Run settings
file_path = '/content/Regraded_Volunteering_Grader2.xlsx'  # workbook to read
code_column = 'Code_with_Error'                            # column containing the code text
chunk_size = 4                                             # lines per chunk

# Load the workbook
df = pd.read_excel(file_path)

# Build one record per chunk, tagged with the source row's index label
# and the chunk's position within that row
chunked_data = []
for idx, row in df.iterrows():
    code_chunks = chunk_code_by_lines(row[code_column], chunk_size)
    chunked_data.extend(
        {'original_index': idx, 'chunk_number': i, 'code_chunk': chunk}
        for i, chunk in enumerate(code_chunks)
    )

# Materialise the records as a DataFrame and write them out without the index column
chunked_df = pd.DataFrame(chunked_data)
chunked_df.to_csv('/content/chunked_code_dataset.csv', index=False)

print(" Chunked file saved as 'chunked_code_dataset.csv'")
print(chunked_df.head())


 Chunked file saved as 'chunked_code_dataset.csv'
   original_index  chunk_number  \
0               0             0   
1               0             1   
2               0             2   
3               0             3   
4               1             0   

                                          code_chunk  
0  \ndef prime_factors(n):\n    factors = []\n   ...  
1      while n > 1:\n        while n // divisor =...  
2          divisor += 2\n    return factors\n\nnu...  
3  print(f"Prime factors of {number} are: {prime_...  
4  \ndef prime_factors(n):\n    factors = []\n   ...  


In [8]:
# Colab-only helper: hand the 4-line-chunk CSV from the runtime's disk to the browser as a download.
from google.colab import files
files.download('/content/chunked_code_dataset.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [12]:
from transformers import RobertaTokenizer
import pandas as pd

# Tokenizer published with the microsoft/codebert-base checkpoint (RoBERTa tokenizer class)
tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")

# Read the chunked CSV and keep the first two non-null chunks
df = pd.read_csv("/content/chunked_code_dataset.csv")
code_column = "code_chunk"  # Adjust if needed
non_null_chunks = df[code_column].dropna()
sample_codes = non_null_chunks.head(2).tolist()

# For each chunk: echo the text, then show its tokens and their vocabulary ids
for i, code in enumerate(sample_codes):
    print(f"\n Sample {i+1} Chunked Code:\n{code}\n")

    # tokenize() returns the token strings; the ids are looked up in a second step
    tokens = tokenizer.tokenize(code)
    token_ids = tokenizer.convert_tokens_to_ids(tokens)

    print(f" Number of Tokens: {len(tokens)}")
    print(" Tokens:", tokens, sep="\n")
    print("\n Token IDs:", token_ids, sep="\n")



 Sample 1 Chunked Code:

def prime_factors(n):
    factors = []
    divisor = 2

 Number of Tokens: 25
 Tokens:
['Ċ', 'def', 'Ġprime', '_', 'fact', 'ors', '(', 'n', '):', 'Ċ', 'Ġ', 'Ġ', 'Ġ', 'Ġfactors', 'Ġ=', 'Ġ[]', 'Ċ', 'Ġ', 'Ġ', 'Ġ', 'Ġdiv', 'is', 'or', 'Ġ=', 'Ġ2']

 Token IDs:
[50118, 9232, 2654, 1215, 24905, 994, 1640, 282, 3256, 50118, 1437, 1437, 1437, 2433, 5457, 48081, 50118, 1437, 1437, 1437, 14445, 354, 368, 5457, 132]

 Sample 2 Chunked Code:
    while n > 1:
        while n // divisor == 0:
            factors.append(divisor)
            n /= divisor

 Number of Tokens: 63
 Tokens:
['Ġ', 'Ġ', 'Ġ', 'Ġwhile', 'Ġn', 'Ġ>', 'Ġ1', ':', 'Ċ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġwhile', 'Ġn', 'Ġ//', 'Ġdiv', 'is', 'or', 'Ġ==', 'Ġ0', ':', 'Ċ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġfactors', '.', 'append', '(', 'div', 'is', 'or', ')', 'Ċ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġn', 'Ġ/', '=', 'Ġdiv', 'is', 'or']

 Token IDs:
[1437, 1437, 1437, 150

In [13]:
import pandas as pd

def chunk_code_by_lines(code, chunk_size):
    """Splits a block of code into chunks of max `chunk_size` lines.

    Args:
        code: The source text to split on '\\n'. A missing value
            (None / NaN, as detected by `pd.isna`) is accepted.
        chunk_size: Maximum number of lines per chunk.

    Returns:
        A list of strings, each holding up to `chunk_size` consecutive
        lines re-joined with '\\n'. Returns an empty list for a missing
        `code` value.
    """
    # Missing values are not strings and have no .split();
    # treat them as "no chunks" instead of raising AttributeError.
    if pd.isna(code):
        return []
    lines = code.split('\n')
    return ['\n'.join(lines[i:i+chunk_size]) for i in range(0, len(lines), chunk_size)]

# Parameters
file_path = '/content/Cleaned_Regraded_Volunteering_Grader2(1).xlsx'     # Update with your actual file path
code_column = 'Code_with_Error'               # Update with the name of your code column
chunk_size = 10                    # Max lines per chunk
# The cells that download and re-read this file address it as
# '/content/chunked_code_logic_dataset.csv', so write to that exact path
# rather than a path relative to the current working directory.
output_path = '/content/chunked_code_logic_dataset.csv'

# Read the dataset
df = pd.read_excel(file_path)

# Apply chunking: one record per chunk, tagged with the source row's index
# label and the chunk's position within that row
chunked_data = []

for idx, row in df.iterrows():
    code_chunks = chunk_code_by_lines(row[code_column], chunk_size)
    for i, chunk in enumerate(code_chunks):
        chunked_data.append({
            'original_index': idx,
            'chunk_number': i,
            'code_chunk': chunk
        })

# Create a new DataFrame with the chunks
chunked_df = pd.DataFrame(chunked_data)

# Save (without the DataFrame index column) and show a preview
chunked_df.to_csv(output_path, index=False)
print(chunked_df.head())


   original_index  chunk_number  \
0               0             0   
1               0             1   
2               1             0   
3               1             1   
4               2             0   

                                          code_chunk  
0  def prime_factors(n):\n    factors = []\n    d...  
1                                   number = 56\n}")  
2  def prime_factors(n):\n    factors = []\n    d...  
3                                   number = 56\n}")  
4  def prime_factors(n):\n    factors = []\n    d...  


In [14]:
# Colab-only helper: hand the 10-line-chunk CSV from the runtime's disk to the browser as a download.
from google.colab import files
files.download('/content/chunked_code_logic_dataset.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [18]:
from transformers import RobertaTokenizer
import pandas as pd

# Tokenizer published with the microsoft/codebert-base checkpoint (RoBERTa tokenizer class)
tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")

# Read the chunked CSV and keep the first two non-null chunks
df = pd.read_csv("/content/chunked_code_logic_dataset.csv")
code_column = "code_chunk"  # Adjust if needed
non_null_chunks = df[code_column].dropna()
sample_codes = non_null_chunks.head(2).tolist()

# For each chunk: echo the text, then show its tokens and their vocabulary ids
for i, code in enumerate(sample_codes):
    print(f"\n Sample {i+1} Chunked Code:\n{code}\n")

    # tokenize() returns the token strings; the ids are looked up in a second step
    tokens = tokenizer.tokenize(code)
    token_ids = tokenizer.convert_tokens_to_ids(tokens)

    print(f" Number of Tokens: {len(tokens)}")
    print(" Tokens:", tokens, sep="\n")
    print("\n Token IDs:", token_ids, sep="\n")



 Sample 1 Chunked Code:
def prime_factors(n):
    factors = []
    divisor = 2
    while n > 1:
        while n // divisor == 0:
            factors.append(divisor)
            n /= divisor
        divisor += 2
    return factors


 Number of Tokens: 108
 Tokens:
['def', 'Ġprime', '_', 'fact', 'ors', '(', 'n', '):', 'Ċ', 'Ġ', 'Ġ', 'Ġ', 'Ġfactors', 'Ġ=', 'Ġ[]', 'Ċ', 'Ġ', 'Ġ', 'Ġ', 'Ġdiv', 'is', 'or', 'Ġ=', 'Ġ2', 'Ċ', 'Ġ', 'Ġ', 'Ġ', 'Ġwhile', 'Ġn', 'Ġ>', 'Ġ1', ':', 'Ċ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġwhile', 'Ġn', 'Ġ//', 'Ġdiv', 'is', 'or', 'Ġ==', 'Ġ0', ':', 'Ċ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġfactors', '.', 'append', '(', 'div', 'is', 'or', ')', 'Ċ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġn', 'Ġ/', '=', 'Ġdiv', 'is', 'or', 'Ċ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġdiv', 'is', 'or', 'Ġ+=', 'Ġ2', 'Ċ', 'Ġ', 'Ġ', 'Ġ', 'Ġreturn', 'Ġfactors', 'Ċ']

 Token IDs:
[9232, 2654, 1215, 24905, 994, 1640, 282, 3256, 50118, 1437, 1437, 1437, 2433, 5457