In [2]:
import os
import json
from sentence_transformers import SentenceTransformer, util

# Initialize a pre-trained model
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# List of category descriptions
category_descriptions = {
    "sorting": "sorting elements, bubble sort, quick sort, merge sort, heap sort",
    "searching": "searching elements, binary search, linear search, search algorithms",
    "mathematical operations": "mathematical operations, addition, subtraction, trigonometry, calculus",
    "data structures": "data structures, linked list, stack, queue, tree, graph",
    "string manipulation": "manipulating strings, string operations, string matching, regular expressions",
    "file handling": "file handling, reading files, writing files, file operations",
    "networking": "networking, socket programming, network protocols, client-server communication",
    "web scraping": "web scraping, parsing HTML, extracting data from websites, web crawling",
    "machine learning": "machine learning, deep learning, neural networks, training models, prediction",
    "other": "miscellaneous functions"
}

# Encode category descriptions
category_embeddings = {category: model.encode(description, convert_to_tensor=True) for category, description in category_descriptions.items()}

cache_file = 'func_name_to_category_cache.json'

# Load cached mappings if available
if os.path.exists(cache_file):
    with open(cache_file, 'r') as f:
        func_name_to_category = json.load(f)
else:
    func_name_to_category = {}

def map_func_name_to_category(func_name):
    if func_name in func_name_to_category:
        return func_name_to_category[func_name]
    
    func_name_embedding = model.encode(func_name, convert_to_tensor=True)
    similarities = {category: util.pytorch_cos_sim(func_name_embedding, embedding).item() for category, embedding in category_embeddings.items()}
    category = max(similarities, key=similarities.get)
    func_name_to_category[func_name] = category

    # Save the updated mappings to the cache file
    with open(cache_file, 'w') as f:
        json.dump(func_name_to_category, f)
    
    return category

# Test the mapping function with some example function names
func_names = ["sort_array", "binary_search", "calculate_sum", "linked_list_operations", "show_prediction_labels_on_image"]

for func_name in func_names:
    category = map_func_name_to_category(func_name)
    print(f"Function Name: {func_name} -> Category: {category}")




Function Name: sort_array -> Category: sorting
Function Name: binary_search -> Category: searching
Function Name: calculate_sum -> Category: mathematical operations
Function Name: linked_list_operations -> Category: data structures
Function Name: show_prediction_labels_on_image -> Category: machine learning


In [3]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset

# Define the mapping between function names and categories
label_mapping = {
    "sorting": 0,
    "searching": 1,
    "mathematical operations": 2,
    "data structures": 3,
    "string manipulation": 4,
    "file handling": 5,
    "networking": 6,
    "web scraping": 7,
    "machine learning": 8,
    "other": 9
}

# Load the CodeSearchNet dataset
dataset = load_dataset("code_search_net", "python")

# Pre-process the dataset
def preprocess_function(examples):
    examples["label"] = [label_mapping.get(map_func_name_to_category(func), label_mapping["other"]) for func in examples["func_name"]]
    return tokenizer(examples["whole_func_string"], truncation=True, padding="max_length", max_length=128)

tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")

# Select a shard of the dataset for quicker processing
shard_size = 16000  # Define the shard size
train_dataset = dataset['train'].shard(index=0, num_shards=len(dataset["train"]) / shard_size)
validation_dataset = dataset['test'].shard(index=0, num_shards=len(dataset["train"]) / shard_size)

# Tokenize the datasets
tokenized_train_datasets = train_dataset.map(preprocess_function, batched=True, batch_size=1000)
tokenized_validation_datasets = validation_dataset.map(preprocess_function, batched=True, batch_size=1000)

# Load pre-trained model
model = AutoModelForSequenceClassification.from_pretrained("microsoft/codebert-base", num_labels=len(label_mapping))

# Define the training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    logging_steps=100,
    weight_decay=0.01,
)

# Define the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_datasets,
    eval_dataset=tokenized_validation_datasets,
)

# Fine-tune the model
trainer.train()

# Evaluate the model
stats = trainer.evaluate()
print(f"Stats of the trained model: {stats}")

# Save the model
model.save_pretrained("p1Model")
tokenizer.save_pretrained("p1Tokenizer")

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
Map: 100%|██████████| 16000/16000 [00:04<00:00, 3930.86 examples/s]
Map: 100%|██████████| 861/861 [00:00<00:00, 5280.92 examples/s]
  return self.fget.__get__(instance, owner)()
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  5%|▌         | 100/2000 [00:25<08:01,  3.95it/s]

{'loss': 2.0916, 'grad_norm': 5.750487327575684, 'learning_rate': 1.9e-05, 'epoch': 0.1}


 10%|█         | 200/2000 [00:53<08:27,  3.55it/s]

{'loss': 2.0054, 'grad_norm': 5.591728210449219, 'learning_rate': 1.8e-05, 'epoch': 0.2}


 15%|█▌        | 300/2000 [01:23<07:44,  3.66it/s]

{'loss': 1.8736, 'grad_norm': 8.125833511352539, 'learning_rate': 1.7e-05, 'epoch': 0.3}


 20%|██        | 400/2000 [02:08<12:11,  2.19it/s]

{'loss': 1.7739, 'grad_norm': 8.508023262023926, 'learning_rate': 1.6000000000000003e-05, 'epoch': 0.4}


 25%|██▌       | 500/2000 [02:54<11:34,  2.16it/s]

{'loss': 1.674, 'grad_norm': 9.839591979980469, 'learning_rate': 1.5000000000000002e-05, 'epoch': 0.5}


 30%|███       | 600/2000 [03:47<10:40,  2.19it/s]  

{'loss': 1.664, 'grad_norm': 11.845390319824219, 'learning_rate': 1.4e-05, 'epoch': 0.6}


 35%|███▌      | 700/2000 [04:32<08:58,  2.41it/s]

{'loss': 1.5995, 'grad_norm': 10.147231101989746, 'learning_rate': 1.3000000000000001e-05, 'epoch': 0.7}


 40%|████      | 800/2000 [07:14<09:01,  2.22it/s]   

{'loss': 1.6077, 'grad_norm': 9.830946922302246, 'learning_rate': 1.2e-05, 'epoch': 0.8}


 45%|████▌     | 900/2000 [07:59<08:17,  2.21it/s]

{'loss': 1.6122, 'grad_norm': 10.988487243652344, 'learning_rate': 1.1000000000000001e-05, 'epoch': 0.9}


 50%|█████     | 1000/2000 [08:45<07:38,  2.18it/s]

{'loss': 1.5474, 'grad_norm': 10.532018661499023, 'learning_rate': 1e-05, 'epoch': 1.0}


                                                   
 50%|█████     | 1000/2000 [09:00<07:38,  2.18it/s]

{'eval_loss': 1.6047215461730957, 'eval_runtime': 8.5249, 'eval_samples_per_second': 100.998, 'eval_steps_per_second': 6.334, 'epoch': 1.0}


 55%|█████▌    | 1100/2000 [09:46<06:53,  2.17it/s]  

{'loss': 1.4512, 'grad_norm': 12.35280704498291, 'learning_rate': 9e-06, 'epoch': 1.1}


 60%|██████    | 1200/2000 [10:32<06:10,  2.16it/s]

{'loss': 1.4314, 'grad_norm': 16.215505599975586, 'learning_rate': 8.000000000000001e-06, 'epoch': 1.2}


 65%|██████▌   | 1300/2000 [11:19<05:26,  2.14it/s]

{'loss': 1.4322, 'grad_norm': 10.061102867126465, 'learning_rate': 7e-06, 'epoch': 1.3}


 70%|███████   | 1400/2000 [12:05<04:38,  2.16it/s]

{'loss': 1.3665, 'grad_norm': 14.795998573303223, 'learning_rate': 6e-06, 'epoch': 1.4}


 75%|███████▌  | 1500/2000 [12:50<03:22,  2.47it/s]

{'loss': 1.3061, 'grad_norm': 11.171121597290039, 'learning_rate': 5e-06, 'epoch': 1.5}


 80%|████████  | 1600/2000 [21:15<02:57,  2.26it/s]    

{'loss': 1.3835, 'grad_norm': 14.642250061035156, 'learning_rate': 4.000000000000001e-06, 'epoch': 1.6}


 85%|████████▌ | 1700/2000 [22:00<02:17,  2.18it/s]

{'loss': 1.368, 'grad_norm': 15.009904861450195, 'learning_rate': 3e-06, 'epoch': 1.7}


 90%|█████████ | 1800/2000 [22:46<01:32,  2.17it/s]

{'loss': 1.3711, 'grad_norm': 14.178555488586426, 'learning_rate': 2.0000000000000003e-06, 'epoch': 1.8}


 95%|█████████▌| 1900/2000 [23:33<00:45,  2.18it/s]

{'loss': 1.3772, 'grad_norm': 15.135271072387695, 'learning_rate': 1.0000000000000002e-06, 'epoch': 1.9}


100%|██████████| 2000/2000 [24:20<00:00,  2.17it/s]

{'loss': 1.403, 'grad_norm': 14.262889862060547, 'learning_rate': 0.0, 'epoch': 2.0}


                                                   
100%|██████████| 2000/2000 [24:35<00:00,  1.36it/s]


{'eval_loss': 1.555184006690979, 'eval_runtime': 8.3436, 'eval_samples_per_second': 103.193, 'eval_steps_per_second': 6.472, 'epoch': 2.0}
{'train_runtime': 1475.6221, 'train_samples_per_second': 21.686, 'train_steps_per_second': 1.355, 'train_loss': 1.5669732208251954, 'epoch': 2.0}


100%|██████████| 54/54 [00:08<00:00,  6.70it/s]


Evaluation results: {'eval_loss': 1.555184006690979, 'eval_runtime': 8.0904, 'eval_samples_per_second': 106.423, 'eval_steps_per_second': 6.675, 'epoch': 2.0}


('p1_saved_tokenizer\\tokenizer_config.json',
 'p1_saved_tokenizer\\special_tokens_map.json',
 'p1_saved_tokenizer\\vocab.json',
 'p1_saved_tokenizer\\merges.txt',
 'p1_saved_tokenizer\\added_tokens.json',
 'p1_saved_tokenizer\\tokenizer.json')

In [6]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

# Define the label mapping
labels = {
    0: "sorting",
    1: "searching",
    2: "mathematical operations",
    3: "data structures",
    4: "string manipulation",
    5: "file handling",
    6: "networking",
    7: "web scraping",
    8: "machine learning",
    9: "other"
}

# Load the fine-tuned model and tokenizer
model = AutoModelForSequenceClassification.from_pretrained("p1Model")
tokenizer = AutoTokenizer.from_pretrained("p1Tokenizer")

# Create a pipeline for text classification
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)

# Define the code snippets
code_snippets = [
    "sorted(l, key=lambda x: (-int(x[1]), x[0]))",
    "def multiply(a, b): return a * b",
    "def divide(a, b): return a / b",
    "words = text.split(' ') \\ last = words[0] \\ for word in words: \\ if word > last: \\ last = word \\ return last",
    "def loss_fn(y_true, y_pred): return tf.reduce_mean(tf.square(y_true - y_pred))",
    "def func(x, y): return x * y + x"
]

# Classify the code snippets
for code_snippet in code_snippets:
    # Classify the code snippet
    classification_result = classifier(code_snippet)

    # Extract the label and map it to the class name
    label = int(classification_result[0]['label'].split('_')[-1])  # Extract numerical label from 'LABEL_0', 'LABEL_1', etc.
    class_name = labels.get(label, "Unknown")

    print(f"Code Snippet: {code_snippet}")
    print(f"Classification: {class_name}, Score: {classification_result[0]['score']}\n")


Code Snippet: sorted(l, key=lambda x: (-int(x[1]), x[0]))
Classification: sorting, Score: 0.7280794382095337

Code Snippet: def multiply(a, b): return a * b
Classification: mathematical operations, Score: 0.3808503746986389

Code Snippet: def divide(a, b): return a / b
Classification: mathematical operations, Score: 0.3947147727012634

Code Snippet: words = text.split(' ') \ last = words[0] \ for word in words: \ if word > last: \ last = word \ return last
Classification: string manipulation, Score: 0.5963256359100342

Code Snippet: def loss_fn(y_true, y_pred): return tf.reduce_mean(tf.square(y_true - y_pred))
Classification: other, Score: 0.2912917733192444

Code Snippet: def func(x, y): return x * y + x
Classification: other, Score: 0.5813041925430298

