In [1]:
import json
from pathlib import Path
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor

def get_files(directories):
    """Return every regular file found under the given directories.

    Each directory is searched recursively. Sub-directories themselves are
    skipped, so every returned path can be opened and read.

    Args:
        directories: iterable of directory paths (str or Path).

    Returns:
        list[Path]: files grouped in the order of ``directories`` and sorted
        within each directory, so the result is the same on every run.
    """
    files = []
    for directory in directories:
        # rglob('*') yields directories as well as files; keep files only.
        found = (path for path in Path(directory).rglob('*') if path.is_file())
        # Filesystem listing order is platform dependent; sort it because
        # callers derive a positional index from this list.
        files.extend(sorted(found))
    return files

def process_file(item, index):
    """Build the JSON record for one source file.

    Args:
        item: Path of the file; the name of its parent folder becomes the label.
        index: position of the file, stored as a string.

    Returns:
        dict with 'label', 'index' and 'code' keys, or None when anything
        goes wrong (the error is printed, not raised).
    """
    try:
        label = item.parts[-2]  # parent folder name
        position = str(index)
        source_text = item.read_text(encoding='latin-1')
    except Exception as e:
        print(f"Error reading {item}: {e}")
        return None
    return {'label': label, 'index': position, 'code': source_text}

def process_files_in_parallel(files):
    """Read all files on a thread pool and return the successful records.

    Every file is paired with its position in ``files`` and handed to
    ``process_file``; a tqdm bar tracks progress. Entries for which
    ``process_file`` returned None are left out of the result.
    """
    with ThreadPoolExecutor() as pool:
        count = len(files)
        # map() with two iterables calls process_file(file, position)
        results = pool.map(process_file, files, range(count))
        entries = list(tqdm(results, total=count))
    return list(filter(lambda entry: entry is not None, entries))

def write_jsonl(file_path, data, batch_size=1000):
    """Serialize ``data`` to ``file_path`` as JSON Lines.

    Args:
        file_path: destination file; overwritten if it exists.
        data: sliceable sequence of JSON-serializable entries.
        batch_size: number of entries taken from ``data`` per slice.
    """
    with open(file_path, 'w') as sink:
        for start in range(0, len(data), batch_size):
            stop = start + batch_size
            for record in data[start:stop]:
                sink.write(json.dumps(record) + '\n')

# Dataset directories: numbered folders under ProgramData/. The folder name is
# what process_file stores as the label. range() excludes its end value, so
# the splits are folders 1-64 (train), 65-80 (valid) and 81-104 (test) and no
# folder appears in more than one split.
train_dirs = [f"ProgramData/{i}" for i in range(1, 65)]
valid_dirs = [f"ProgramData/{i}" for i in range(65, 81)]
test_dirs = [f"ProgramData/{i}" for i in range(81, 105)]

# Process and write datasets: for each split, collect the paths, read them on
# a thread pool, and write one JSON object per line to <split>.jsonl in the
# current working directory.
train_files = get_files(train_dirs)
train_data = process_files_in_parallel(train_files)
write_jsonl("train.jsonl", train_data)

valid_files = get_files(valid_dirs)
valid_data = process_files_in_parallel(valid_files)
write_jsonl("valid.jsonl", valid_data)

test_files = get_files(test_dirs)
test_data = process_files_in_parallel(test_files)
write_jsonl("test.jsonl", test_data)

100%|███████████████████████████████████████████████████████████████████████████| 32000/32000 [01:12<00:00, 443.48it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 8000/8000 [00:19<00:00, 419.60it/s]
100%|███████████████████████████████████████████████████████████████████████████| 12000/12000 [00:28<00:00, 422.13it/s]


In [2]:
import numpy as np
import torch
from datasets import load_dataset
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity

# Load the dataset from JSONL file (the rows are read back under the 'train' key)
dataset = load_dataset('json', data_files='train.jsonl')['train']

# Set a limit to reduce dataset size if needed.
# The fixed shuffle seed draws the same sample on every run; min() keeps the
# selection valid when the file holds fewer than LIMIT rows.
LIMIT = 100  # Adjust based on available memory and time constraints
dataset = dataset.shuffle(seed=42).select(range(min(LIMIT, len(dataset))))

# Load BERT tokenizer and model (pretrained checkpoint, no fine-tuning in this cell)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")

# Use GPU if available for faster processing
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Tokenize the dataset
def tokenize_code(example):
    """Tokenize the 'code' entry of ``example`` to a fixed length of 512.

    ``example['code']`` is passed straight to the notebook-level ``tokenizer``
    (a single string, or a list of strings when mapped with batched=True);
    longer inputs are truncated and shorter ones padded to 512 tokens.
    """
    tokenizer_options = dict(padding='max_length', truncation=True, max_length=512)
    return tokenizer(example['code'], **tokenizer_options)

# Apply tokenization to the dataset
# NOTE(review): tokenized_dataset is not read again in this cell; the
# embeddings below are computed by get_batch_embeddings, which tokenizes the
# raw strings itself.
tokenized_dataset = dataset.map(tokenize_code, batched=True)

# Extract original code snippets for later use (same order as `dataset`)
candidate_snippets = [example['code'] for example in dataset]

# Function to compute embeddings for a batch of code snippets
def get_batch_embeddings(snippets):
    """Embed a batch of code snippets with the notebook-level BERT model.

    Args:
        snippets: list of source-code strings. Inputs are truncated to 512
            tokens and padded to the longest sequence in the batch.

    Returns:
        NumPy array with one row per snippet: the last-layer hidden state at
        sequence position 0 (the CLS token).
    """
    encoded = tokenizer(
        snippets,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=512,
    )
    # Move every tensor of the encoding to the model's device
    on_device = {}
    for name, tensor in encoded.items():
        on_device[name] = tensor.to(device)

    # Inference only: no gradients needed
    with torch.no_grad():
        result = model(**on_device)

    cls_vectors = result.last_hidden_state[:, 0, :]
    return cls_vectors.cpu().numpy()

# Generate embeddings for all candidate snippets in batches
batch_size = 16  # Adjust based on available memory
candidate_embeddings = []

# Slicing past the end is safe, so the last batch may simply be shorter.
for i in range(0, len(candidate_snippets), batch_size):
    batch = candidate_snippets[i:i + batch_size]
    batch_embeddings = get_batch_embeddings(batch)
    # extend() adds the rows one by one: one vector per snippet, in order
    candidate_embeddings.extend(batch_embeddings)

# Convert embeddings to NumPy array for efficient computation
# (row i corresponds to candidate_snippets[i])
candidate_embeddings = np.array(candidate_embeddings)

# Function to retrieve top K semantically similar snippets
def retrieve_top_k_similar(query_code, K):
    """Return the K candidate snippets most similar to ``query_code``.

    The query is embedded with ``get_batch_embeddings`` and compared with
    every row of ``candidate_embeddings`` by cosine similarity.

    Args:
        query_code: source code of the query, as a string.
        K: number of results wanted. A value larger than the number of
            candidates returns all of them; zero or a negative value
            returns nothing.

    Returns:
        ``(snippets, scores)``: two parallel lists ordered from most to
        least similar.
    """
    # Guard first: with K == 0 the slice [-K:] is [0:], which selects every
    # candidate instead of none, and a negative K would drop the best matches.
    if K <= 0 or len(candidate_snippets) == 0:
        return [], []

    query_embedding = get_batch_embeddings([query_code])[0]  # Get embedding for query
    similarities = cosine_similarity([query_embedding], candidate_embeddings)[0]  # Compute similarity

    # argsort is ascending: reverse it and keep the first K positions
    top_k_indices = np.argsort(similarities)[::-1][:K]
    top_k_snippets = [candidate_snippets[i] for i in top_k_indices]
    top_k_scores = [similarities[i] for i in top_k_indices]

    return top_k_snippets, top_k_scores

# Example usage
# The query is C source held as a plain Python string; it is only tokenized
# and embedded, never compiled.
query_code = """int main(int argc, char* argv[]) {
    int shu[number];
    int n, i, j;
    int k = 0;
    scanf("%d", &shu[0]);
    for (n = 0; shu[n] != 0; n++) {
        scanf("%d", &shu[n + 1]);
    }
    for (i = 0; i <= n; i++) {
        for (j = 0; j <= n; j++) {
            if (shu[i] == 2 * shu[j]) {
                k++;
            }
        }
    }
    if (k != 0) {
        k = k - 1;
        printf("%d", k);
    } else printf("%d", k);
    return 0;
}"""  # Your query code snippet

K = 5  # Number of top similar snippets to retrieve

# Retrieve the top K similar code snippets
top_k_snippets, top_k_scores = retrieve_top_k_similar(query_code, K)

# Print the results: each snippet in full, followed by its cosine similarity
# to the query (4 decimals), best match first
print("Top K Similar Snippets:")
for i, snippet in enumerate(top_k_snippets):
    print(f"\nSnippet {i + 1}:")
    print(snippet)
    print(f"Similarity Score: {top_k_scores[i]:.4f}")


Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Top K Similar Snippets:

Snippet 1:
int main()
{
    int i,n,a[400],b[400],j,k,m;
	scanf("%d",&n);
	for(i=0;i<=n-1;i++)
	{scanf("%d",&a[i]);}
    for(m=0,i=0;i<=n-1;i++)
	{
		for(j=0,k=1;j<=i-1;j++)
		{if (a[i]==a[j])
		{k=0;break;}
		}
		if(k!=0)
		{b[m]=a[i];m++;}
	}
	for(i=0;i<=m-2;i++)
	{ printf("%d,",b[i]);};
	printf("%d",b[m-1]);
}


Similarity Score: 0.9884

Snippet 2:

int main()
{
   int n,k,x[1000];
   int i,j,sign=0;
   
   scanf("%d%d",&n,&k);
   for(i=0;i<n;i++)
   {
      scanf("%d",&x[i]);                
   }    
   
   for(i=0;i<n;i++)
   {
       
       for(j=i+1;j<n;j++)
       {
           if((x[i]+x[j])==k)
           {
               sign=1;
               break;                  
           }                  
       }
                     
   }
   
   if(sign==1)
       printf("yes");
   else
       printf("no");
       
    scanf("%d%d",&n,&k);   
   return 0;
}

Similarity Score: 0.9879

Snippet 3:
int main(){
    int n,k,i,j,x[1000];
    scanf("%d %d",&n,&k)

In [3]:
import numpy as np
import torch
from datasets import load_dataset
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity

# Load the dataset from JSONL file (the rows are read back under the 'train' key)
dataset = load_dataset('json', data_files='train.jsonl')['train']

# Set a limit to reduce dataset size if needed.
# The fixed shuffle seed draws the same sample on every run; min() keeps the
# selection valid when the file holds fewer than LIMIT rows.
LIMIT = 200  # Adjust based on available memory and time constraints
dataset = dataset.shuffle(seed=42).select(range(min(LIMIT, len(dataset))))

# Load BERT tokenizer and model (pretrained checkpoint, no fine-tuning in this cell)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")

# Use GPU if available for faster processing
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Tokenize the dataset
def tokenize_code(example):
    """Tokenize the 'code' entry of ``example`` to a fixed length of 512.

    ``example['code']`` is passed straight to the notebook-level ``tokenizer``
    (a single string, or a list of strings when mapped with batched=True);
    longer inputs are truncated and shorter ones padded to 512 tokens.
    """
    tokenizer_options = dict(padding='max_length', truncation=True, max_length=512)
    return tokenizer(example['code'], **tokenizer_options)

# Apply tokenization to the dataset
# NOTE(review): tokenized_dataset is not read again in this cell; the
# embeddings below are computed by get_batch_embeddings, which tokenizes the
# raw strings itself.
tokenized_dataset = dataset.map(tokenize_code, batched=True)

# Extract original code snippets for later use (same order as `dataset`)
candidate_snippets = [example['code'] for example in dataset]

# Function to compute embeddings for a batch of code snippets
def get_batch_embeddings(snippets):
    """Embed a batch of code snippets with the notebook-level BERT model.

    Args:
        snippets: list of source-code strings. Inputs are truncated to 512
            tokens and padded to the longest sequence in the batch.

    Returns:
        NumPy array with one row per snippet: the last-layer hidden state at
        sequence position 0 (the CLS token).
    """
    encoded = tokenizer(
        snippets,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=512,
    )
    # Move every tensor of the encoding to the model's device
    on_device = {}
    for name, tensor in encoded.items():
        on_device[name] = tensor.to(device)

    # Inference only: no gradients needed
    with torch.no_grad():
        result = model(**on_device)

    cls_vectors = result.last_hidden_state[:, 0, :]
    return cls_vectors.cpu().numpy()

# Generate embeddings for all candidate snippets in batches
batch_size = 16  # Adjust based on available memory
candidate_embeddings = []

# Slicing past the end is safe, so the last batch may simply be shorter.
for i in range(0, len(candidate_snippets), batch_size):
    batch = candidate_snippets[i:i + batch_size]
    batch_embeddings = get_batch_embeddings(batch)
    # extend() adds the rows one by one: one vector per snippet, in order
    candidate_embeddings.extend(batch_embeddings)

# Convert embeddings to NumPy array for efficient computation
# (row i corresponds to candidate_snippets[i])
candidate_embeddings = np.array(candidate_embeddings)

# Function to retrieve top K semantically similar snippets
def retrieve_top_k_similar(query_code, K):
    """Return the K candidate snippets most similar to ``query_code``.

    The query is embedded with ``get_batch_embeddings`` and compared with
    every row of ``candidate_embeddings`` by cosine similarity.

    Args:
        query_code: source code of the query, as a string.
        K: number of results wanted. A value larger than the number of
            candidates returns all of them; zero or a negative value
            returns nothing.

    Returns:
        ``(snippets, scores)``: two parallel lists ordered from most to
        least similar.
    """
    # Guard first: with K == 0 the slice [-K:] is [0:], which selects every
    # candidate instead of none, and a negative K would drop the best matches.
    if K <= 0 or len(candidate_snippets) == 0:
        return [], []

    query_embedding = get_batch_embeddings([query_code])[0]  # Get embedding for query
    similarities = cosine_similarity([query_embedding], candidate_embeddings)[0]  # Compute similarity

    # argsort is ascending: reverse it and keep the first K positions
    top_k_indices = np.argsort(similarities)[::-1][:K]
    top_k_snippets = [candidate_snippets[i] for i in top_k_indices]
    top_k_scores = [similarities[i] for i in top_k_indices]

    return top_k_snippets, top_k_scores

# Example usage
# The query is C source held as a plain Python string; it is only tokenized
# and embedded, never compiled.
query_code = """int main(int argc, char* argv[]) {
    int shu[number];
    int n, i, j;
    int k = 0;
    scanf("%d", &shu[0]);
    for (n = 0; shu[n] != 0; n++) {
        scanf("%d", &shu[n + 1]);
    }
    for (i = 0; i <= n; i++) {
        for (j = 0; j <= n; j++) {
            if (shu[i] == 2 * shu[j]) {
                k++;
            }
        }
    }
    if (k != 0) {
        k = k - 1;
        printf("%d", k);
    } else printf("%d", k);
    return 0;
}"""  # Your query code snippet

K = 5  # Number of top similar snippets to retrieve

# Retrieve the top K similar code snippets
top_k_snippets, top_k_scores = retrieve_top_k_similar(query_code, K)

# Print the results: each snippet in full, followed by its cosine similarity
# to the query (4 decimals), best match first
print("Top K Similar Snippets:")
for i, snippet in enumerate(top_k_snippets):
    print(f"\nSnippet {i + 1}:")
    print(snippet)
    print(f"Similarity Score: {top_k_scores[i]:.4f}")


Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Top K Similar Snippets:

Snippet 1:
int main(){
	int m=0,k,n,i=0,j=0,a[1000];
	scanf("%d %d",&n,&k);
	for(i=0;i<n;i++){
		scanf("%d",&a[i]);
	}
	i=0;
	for(j=0;j<n;j++){
		i=0;
		for(i=0;i<n;i++){
			if(i!=j){
				if((a[i]+a[j])==k){
					m=1;
					break;
				}
			}
		}
	}
	if(m==1){printf("yes");}
	else{printf("no");}
return 0;
}
Similarity Score: 0.9891

Snippet 2:
int main()
{
    int i,n,a[400],b[400],j,k,m;
	scanf("%d",&n);
	for(i=0;i<=n-1;i++)
	{scanf("%d",&a[i]);}
    for(m=0,i=0;i<=n-1;i++)
	{
		for(j=0,k=1;j<=i-1;j++)
		{if (a[i]==a[j])
		{k=0;break;}
		}
		if(k!=0)
		{b[m]=a[i];m++;}
	}
	for(i=0;i<=m-2;i++)
	{ printf("%d,",b[i]);};
	printf("%d",b[m-1]);
}


Similarity Score: 0.9884

Snippet 3:

int main()
{
   int n,k,x[1000];
   int i,j,sign=0;
   
   scanf("%d%d",&n,&k);
   for(i=0;i<n;i++)
   {
      scanf("%d",&x[i]);                
   }    
   
   for(i=0;i<n;i++)
   {
       
       for(j=i+1;j<n;j++)
       {
           if((x[i]+x[j])==k)
           {
               si

In [None]:
from datasets import load_dataset
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments

# Load dataset from jsonl
# split='train' is requested for both files, so each call returns the rows of
# that one file (valid.jsonl is not a 'validation' split here).
train_dataset = load_dataset('json', data_files='train.jsonl', split='train')
valid_dataset = load_dataset('json', data_files='valid.jsonl', split='train')

# Load tokenizer (CodeBERT or GraphCodeBERT) — the checkpoint used here is CodeBERT
tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")

# Tokenize dataset (adjust max_length to 512)
def tokenize_code(example):
    """Tokenize the 'code' entry of ``example`` to a fixed length of 512.

    ``example['code']`` is passed straight to the notebook-level ``tokenizer``
    (a single string, or a list of strings when mapped with batched=True);
    longer inputs are truncated and shorter ones padded to 512 tokens.
    """
    tokenizer_options = dict(padding='max_length', truncation=True, max_length=512)
    return tokenizer(example['code'], **tokenizer_options)

# batched=True hands tokenize_code a batch of rows per call, so
# example['code'] is a list of strings there.
train_dataset = train_dataset.map(tokenize_code, batched=True)
valid_dataset = valid_dataset.map(tokenize_code, batched=True)

# Convert labels to integers
def process_labels(example):
    """Replace the record's 'label' with its integer value and return the record.

    The same ``example`` object is modified in place and handed back.
    """
    numeric_label = int(example['label'])
    example['label'] = numeric_label
    return example

# Apply the function to both train and validation datasets
train_dataset = train_dataset.map(process_labels)
valid_dataset = valid_dataset.map(process_labels)

# Load the model
# num_labels=81 sizes the classification head for class indices 0-80.
# NOTE(review): presumably chosen because the largest label in valid.jsonl is
# 80 (an earlier attempt used 65) — confirm against the data.
codebert_model = RobertaForSequenceClassification.from_pretrained("microsoft/codebert-base", num_labels=81)

# Training setup: evaluate once per epoch, 3 epochs, batches of 8 per device.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    logging_dir='./logs',
    logging_steps=10,
)

# Trainer setup (no compute_metrics is passed)
trainer = Trainer(
    model=codebert_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset
)

# Fine-tuning
trainer.train()

# Save the model and tokenizer side by side in one directory
model_save_path = './fine_tuned_robertamodel'
codebert_model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss


In [None]:
# This is for BERT model

import numpy as np
import torch
from datasets import load_dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics.pairwise import cosine_similarity

# Step 1: Load datasets for fine-tuning
# Each file is loaded on its own, so its rows are read back under the 'train' key.
train_dataset = load_dataset('json', data_files='train.jsonl')['train']
valid_dataset = load_dataset('json', data_files='valid.jsonl')['train']
test_dataset = load_dataset('json', data_files='test.jsonl')['train']


def process_labels(example):
    """Replace the record's 'label' with its integer value and return the record.

    The same ``example`` object is modified in place and handed back.
    """
    numeric_label = int(example['label'])
    example['label'] = numeric_label
    return example


# Convert the string labels of every split to integers.
# NOTE(review): test_dataset is converted here but is not tokenized or
# evaluated anywhere later in this cell.
train_dataset = train_dataset.map(process_labels)
valid_dataset = valid_dataset.map(process_labels)
test_dataset = test_dataset.map(process_labels)


# Load BERT tokenizer and pre-trained BERT model
# num_labels=81 sizes the classification head for class indices 0-80.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=81)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Tokenize datasets
def tokenize_code(example):
    """Tokenize the 'code' entry of ``example`` to a fixed length of 512.

    ``example['code']`` is passed straight to the notebook-level ``tokenizer``
    (a single string, or a list of strings when mapped with batched=True);
    longer inputs are truncated and shorter ones padded to 512 tokens.
    """
    tokenizer_options = dict(padding='max_length', truncation=True, max_length=512)
    return tokenizer(example['code'], **tokenizer_options)

# Tokenize the two splits used for training and evaluation
train_dataset = train_dataset.map(tokenize_code, batched=True)
valid_dataset = valid_dataset.map(tokenize_code, batched=True)

# Define TrainingArguments and Trainer: evaluate once per epoch, 3 epochs,
# batches of 8 per device, checkpoint every 10,000 steps.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    logging_dir='./logs',
    logging_steps=10,
    save_steps=10_000,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset
)

# Fine-tune the model
trainer.train()

# Step 2: Embedding generation for similarity retrieval
# Load candidate snippets from candidates.json
# NOTE(review): candidates.json is not written by any cell shown here; each
# row is expected to carry a 'code' field.
candidate_dataset = load_dataset('json', data_files='candidates.json')['train']
candidate_snippets = [example['code'] for example in candidate_dataset]

# Function to compute embeddings for a batch of code snippets
def get_batch_embeddings(snippets):
    """Embed code snippets with the BERT encoder of the fine-tuned classifier.

    Args:
        snippets: list of source-code strings. Inputs are truncated to 512
            tokens and padded to the longest sequence in the batch.

    Returns:
        NumPy array with one row per snippet: the encoder's last-layer hidden
        state at sequence position 0 (the CLS token). The classification
        head is not applied.
    """
    # This runs right after trainer.train(). Put the model in evaluation mode
    # explicitly so dropout is off, instead of relying on the state the
    # Trainer happened to leave behind.
    model.eval()

    inputs = tokenizer(
        snippets, return_tensors='pt',
        padding=True, truncation=True, max_length=512
    )
    inputs = {key: value.to(device) for key, value in inputs.items()}

    with torch.no_grad():
        outputs = model.bert(**inputs)  # Directly access BERT part of model
    return outputs.last_hidden_state[:, 0, :].cpu().numpy()  # CLS token embeddings

# Generate embeddings for candidate snippets in batches
batch_size = 16
candidate_embeddings = []

# Slicing past the end is safe, so the last batch may simply be shorter.
for i in range(0, len(candidate_snippets), batch_size):
    batch = candidate_snippets[i:i + batch_size]
    batch_embeddings = get_batch_embeddings(batch)
    # extend() adds the rows one by one: one vector per snippet, in order
    candidate_embeddings.extend(batch_embeddings)

# Convert embeddings to a NumPy array for cosine similarity calculation
# (row i corresponds to candidate_snippets[i])
candidate_embeddings = np.array(candidate_embeddings)

# Function to retrieve top K semantically similar snippets
def retrieve_top_k_similar(query_code, K):
    """Return the K candidate snippets most similar to ``query_code``.

    The query is embedded with ``get_batch_embeddings`` and compared with
    every row of ``candidate_embeddings`` by cosine similarity.

    Args:
        query_code: source code of the query, as a string.
        K: number of results wanted. A value larger than the number of
            candidates returns all of them; zero or a negative value
            returns nothing.

    Returns:
        ``(snippets, scores)``: two parallel lists ordered from most to
        least similar.
    """
    # Guard first: with K == 0 the slice [-K:] is [0:], which selects every
    # candidate instead of none, and a negative K would drop the best matches.
    if K <= 0 or len(candidate_snippets) == 0:
        return [], []

    query_embedding = get_batch_embeddings([query_code])[0]  # Get embedding for query
    similarities = cosine_similarity([query_embedding], candidate_embeddings)[0]  # Compute similarity

    # argsort is ascending: reverse it and keep the first K positions
    top_k_indices = np.argsort(similarities)[::-1][:K]
    top_k_snippets = [candidate_snippets[i] for i in top_k_indices]
    top_k_scores = [similarities[i] for i in top_k_indices]

    return top_k_snippets, top_k_scores

# Example usage with a query code snippet
# The query is C source held as a plain Python string; it is only tokenized
# and embedded, never compiled.
query_code = """// Sample query code to find top-K similar snippets
int sumArray(int arr[], int size) {
    int sum = 0;
    for (int i = 0; i < size; i++) {
        sum += arr[i];
    }
    return sum;
}"""
K = 5

top_k_snippets, top_k_scores = retrieve_top_k_similar(query_code, K)

# Print results: each snippet in full, followed by its cosine similarity to
# the query (4 decimals), best match first
print("Top K Similar Snippets:")
for i, snippet in enumerate(top_k_snippets):
    print(f"\nSnippet {i + 1}:")
    print(snippet)
    print(f"Similarity Score: {top_k_scores[i]:.4f}")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss


In [1]:
# This is for Roberta Model

import numpy as np
import torch
from datasets import load_dataset
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics.pairwise import cosine_similarity

# Step 1: Load datasets for fine-tuning
# NOTE(review): this cell reads the *_copy.jsonl files, which no cell shown
# here writes; the recorded output (9 and 8 mapped rows) suggests they are
# small hand-made samples — confirm before comparing results across cells.
train_dataset = load_dataset('json', data_files='train_copy.jsonl')['train']
valid_dataset = load_dataset('json', data_files='valid_copy.jsonl')['train']
test_dataset = load_dataset('json', data_files='test_copy.jsonl')['train']

def process_labels(example):
    """Replace the record's 'label' with its integer value and return the record.

    The same ``example`` object is modified in place and handed back.
    """
    numeric_label = int(example['label'])
    example['label'] = numeric_label
    return example

# Convert the string labels of every split to integers.
# NOTE(review): test_dataset is converted here but is not tokenized or
# evaluated anywhere later in this cell.
train_dataset = train_dataset.map(process_labels)
valid_dataset = valid_dataset.map(process_labels)
test_dataset = test_dataset.map(process_labels)

# Load RoBERTa tokenizer and pre-trained RoBERTa model
# num_labels=81 sizes the classification head for class indices 0-80.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=81)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Tokenize datasets
def tokenize_code(example):
    """Tokenize the 'code' entry of ``example`` to a fixed length of 512.

    ``example['code']`` is passed straight to the notebook-level ``tokenizer``
    (a single string, or a list of strings when mapped with batched=True);
    longer inputs are truncated and shorter ones padded to 512 tokens.
    """
    tokenizer_options = dict(padding='max_length', truncation=True, max_length=512)
    return tokenizer(example['code'], **tokenizer_options)

# Tokenize the two splits used for training and evaluation
train_dataset = train_dataset.map(tokenize_code, batched=True)
valid_dataset = valid_dataset.map(tokenize_code, batched=True)

# Define TrainingArguments and Trainer: evaluate once per epoch, 3 epochs,
# batches of 8 per device, checkpoint every 10,000 steps.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    logging_dir='./logs',
    logging_steps=10,
    save_steps=10_000,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset
)

# Fine-tune the model
trainer.train()

# Step 2: Embedding generation for similarity retrieval
# Load candidate snippets from candidates.json
# NOTE(review): candidates.json is not written by any cell shown here; each
# row is expected to carry a 'code' field.
candidate_dataset = load_dataset('json', data_files='candidates.json')['train']
candidate_snippets = [example['code'] for example in candidate_dataset]

# Function to compute embeddings for a batch of code snippets
def get_batch_embeddings(snippets):
    """Embed code snippets with the RoBERTa encoder of the fine-tuned classifier.

    Args:
        snippets: list of source-code strings. Inputs are truncated to 512
            tokens and padded to the longest sequence in the batch.

    Returns:
        NumPy array with one row per snippet: the encoder's last-layer hidden
        state at sequence position 0. The classification head is not applied.
    """
    # This runs right after trainer.train(). Put the model in evaluation mode
    # explicitly so dropout is off, instead of relying on the state the
    # Trainer happened to leave behind.
    model.eval()

    inputs = tokenizer(
        snippets, return_tensors='pt',
        padding=True, truncation=True, max_length=512
    )
    inputs = {key: value.to(device) for key, value in inputs.items()}

    with torch.no_grad():
        outputs = model.roberta(**inputs)  # Directly access RoBERTa part of model
    return outputs.last_hidden_state[:, 0, :].cpu().numpy()  # CLS token embeddings

# Generate embeddings for candidate snippets in batches
batch_size = 16
candidate_embeddings = []

# Slicing past the end is safe, so the last batch may simply be shorter.
for i in range(0, len(candidate_snippets), batch_size):
    batch = candidate_snippets[i:i + batch_size]
    batch_embeddings = get_batch_embeddings(batch)
    # extend() adds the rows one by one: one vector per snippet, in order
    candidate_embeddings.extend(batch_embeddings)

# Convert embeddings to a NumPy array for cosine similarity calculation
# (row i corresponds to candidate_snippets[i])
candidate_embeddings = np.array(candidate_embeddings)

# Function to retrieve top K semantically similar snippets
def retrieve_top_k_similar(query_code, K):
    """Return the K candidate snippets most similar to ``query_code``.

    The query is embedded with ``get_batch_embeddings`` and compared with
    every row of ``candidate_embeddings`` by cosine similarity.

    Args:
        query_code: source code of the query, as a string.
        K: number of results wanted. A value larger than the number of
            candidates returns all of them; zero or a negative value
            returns nothing.

    Returns:
        ``(snippets, scores)``: two parallel lists ordered from most to
        least similar.
    """
    # Guard first: with K == 0 the slice [-K:] is [0:], which selects every
    # candidate instead of none, and a negative K would drop the best matches.
    if K <= 0 or len(candidate_snippets) == 0:
        return [], []

    query_embedding = get_batch_embeddings([query_code])[0]  # Get embedding for query
    similarities = cosine_similarity([query_embedding], candidate_embeddings)[0]  # Compute similarity

    # argsort is ascending: reverse it and keep the first K positions
    top_k_indices = np.argsort(similarities)[::-1][:K]
    top_k_snippets = [candidate_snippets[i] for i in top_k_indices]
    top_k_scores = [similarities[i] for i in top_k_indices]

    return top_k_snippets, top_k_scores

# Example usage with a query code snippet
# The query is C source held as a plain Python string; it is only tokenized
# and embedded, never compiled.
query_code = """// Sample query code to find top-K similar snippets
int sumArray(int arr[], int size) {
    int sum = 0;
    for (int i = 0; i < size; i++) {
        sum += arr[i];
    }
    return sum;
}"""
K = 5

top_k_snippets, top_k_scores = retrieve_top_k_similar(query_code, K)

# Print results: each snippet in full, followed by its cosine similarity to
# the query (4 decimals), best match first
print("Top K Similar Snippets:")
for i, snippet in enumerate(top_k_snippets):
    print(f"\nSnippet {i + 1}:")
    print(snippet)
    print(f"Similarity Score: {top_k_scores[i]:.4f}")


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/9 [00:00<?, ? examples/s]

Map:   0%|          | 0/8 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss
1,No log,3.549133
2,No log,2.469843
3,No log,1.888752


Top K Similar Snippets:

Snippet 1:
int binarySearch(int arr[], int left, int right, int x) { while(left <= right) { int mid = left + (right - left) / 2; if(arr[mid] == x) return mid; if(arr[mid] < x) left = mid + 1; else right = mid - 1; } return -1; }
Similarity Score: 0.9726

Snippet 2:
void sortArray(int arr[], int size) { for(int i = 0; i < size - 1; i++) { for(int j = 0; j < size - i - 1; j++) { if(arr[j] > arr[j + 1]) { int temp = arr[j]; arr[j] = arr[j + 1]; arr[j + 1] = temp; } } } }
Similarity Score: 0.9697

Snippet 3:
void reverseArray(int arr[], int size) { int start = 0, end = size - 1; while(start < end) { int temp = arr[start]; arr[start] = arr[end]; arr[end] = temp; start++; end--; } }
Similarity Score: 0.9667

Snippet 4:
int findMax(int arr[], int size) { int max = arr[0]; for(int i = 1; i < size; i++) { if(arr[i] > max) { max = arr[i]; } } return max; }
Similarity Score: 0.9572

Snippet 5:
void printArray(int arr[], int size) { for(int i = 0; i < size; i++) { printf("