Installing all dependencies

In [1]:
%%capture
!pip install transformers datasets torch tqdm
!pip install --upgrade accelerate

Imports

In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import datasets
import time 
from tqdm import tqdm
import pandas as pd

Loading the model (this notebook was run on 2 A100 GPUs)

In [3]:
def load_model(model_name):
    """Load a causal-LM checkpoint and its tokenizer and move the model to GPU.

    Args:
        model_name: Hugging Face Hub identifier (or local path) of the checkpoint.

    Returns:
        A ``(tokenizer, model)`` tuple. When CUDA is not available a message is
        printed and ``(None, None)`` is returned, so callers can always unpack
        two values. When more than one GPU is visible the returned model is
        wrapped in ``torch.nn.DataParallel``; the underlying model is then
        reachable as ``model.module``.
    """
    # Ensure CUDA is available
    if not torch.cuda.is_available():
        print("CUDA is not available. Check your installation and try again.")
        # Same arity as the success path: `tokenizer, model = load_model(...)`
        # must not fail with "too many values to unpack".
        return None, None

    gpu_count = torch.cuda.device_count()
    print(f"Number of GPUs available: {gpu_count}")

    # Load tokenizer and model in float16 to roughly halve memory versus float32.
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    # NOTE(review): force_download=True re-fetches every weight shard on each
    # call, ignoring the local cache -- confirm this is still wanted.
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        trust_remote_code=True,
        torch_dtype=torch.float16,
        force_download=True,
        resume_download=False,
    )

    # Wrap in DataParallel when several GPUs are visible.
    # NOTE(review): DataParallel replicates the module per device rather than
    # sharding it -- confirm this gives the intended memory behaviour.
    if gpu_count > 1:
        print("Using multiple GPUs")
        model = torch.nn.DataParallel(model)

    # Move model to GPUs
    model.cuda()

    return tokenizer, model
# DeepSeek Coder 33B Instruct checkpoint (Hugging Face Hub identifier).
# `tokenizer` and `model` are notebook-level names that later cells pass to
# refactor_and_explain_code and process_dataset.
model_name = "deepseek-ai/deepseek-coder-33b-instruct"
tokenizer, model = load_model(model_name)

Number of GPUs available: 2


tokenizer_config.json:   0%|          | 0.00/1.87k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.37M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/631 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/631 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/48.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/7 [00:00<?, ?it/s]

model-00001-of-00007.safetensors:   0%|          | 0.00/9.73G [00:00<?, ?B/s]

model-00002-of-00007.safetensors:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

model-00003-of-00007.safetensors:   0%|          | 0.00/9.92G [00:00<?, ?B/s]

model-00004-of-00007.safetensors:   0%|          | 0.00/9.82G [00:00<?, ?B/s]

model-00005-of-00007.safetensors:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

model-00006-of-00007.safetensors:   0%|          | 0.00/9.92G [00:00<?, ?B/s]

model-00007-of-00007.safetensors:   0%|          | 0.00/7.38G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

Using multiple GPUs


Refactor-and-explain function: prompts the model to rewrite a code sample so that it is more maintainable and readable for novices, and to explain the changes. Its output is used to augment the dataset.

In [9]:
def refactor_and_explain_code(model, tokenizer, original_code):
    """Ask the model to refactor ``original_code`` and explain the refactoring.

    Args:
        model: A generation-capable model, optionally wrapped in
            ``torch.nn.DataParallel`` (the wrapped ``.module`` is used then).
        tokenizer: The tokenizer matching ``model``.
        original_code: Python source to refactor, without Markdown fences
            (the prompt adds its own fenced block around it).

    Returns:
        A ``(refactored_code, explanation)`` tuple of strings. If ``model`` or
        ``tokenizer`` is ``None`` the tuple is
        ``("No output", "Model or tokenizer not initialized properly")``. If the
        expected markers cannot be located in the generated text the tuple is
        ``("Refactoring output not found.", "Explanation not found.")``.
    """
    if model is None or tokenizer is None:
        return "No output", "Model or tokenizer not initialized properly"

    # Prompt engineering
    prompt = (
        "Assume I am a programming novice. Refactor the following Python code for better readability "
        "and maintainability:"
        "then write 'Refactored Code:' followed by the refactored code. "
        "After that, write 'Explanation:' followed by 100 words on why the refactoring makes the code more readable "
        "and maintainable:\n\n"
        f"```python\n{original_code}\n```"
    )

    # Tokenize the prompt and move the tensors to the GPU
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True)
    inputs = inputs.to('cuda')
    # DataParallel does not expose generate(); call it on the wrapped module.
    model_to_use = model.module if hasattr(model, "module") else model

    # Same hyperparameters as mentioned in the paper.
    # NOTE(review): max_length presumably counts prompt tokens too, which would
    # explain explanations being cut off mid-sentence -- confirm.
    # NOTE(review): temperature/top_p/top_k are sampling settings; do_sample is
    # not set, so they may have no effect under beam search -- confirm.
    outputs = model_to_use.generate(
        inputs['input_ids'],
        attention_mask=inputs.get('attention_mask'),
        max_length=1024,
        num_return_sequences=1,
        num_beams=5,
        temperature=0.5,
        top_p=0.9,
        top_k=25,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
        early_stopping=True
    )

    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return _extract_refactoring_sections(generated_text)


def _extract_refactoring_sections(generated_text):
    """Split decoded model text into (refactored_code, explanation)."""
    not_found = ("Refactoring output not found.", "Explanation not found.")
    fence = "```"
    code_marker = "Refactored Code:"
    explanation_marker = "Explanation:"

    # The prompt's instructions quote both markers and end with a fenced code
    # block, so the markers are only searched after that first fenced block.
    opening_fence = generated_text.find(fence)
    if opening_fence == -1:
        return not_found
    closing_fence = generated_text.find(fence, opening_fence + len(fence))
    if closing_fence == -1:
        return not_found
    post_code_block_text = generated_text[closing_fence + len(fence):]

    code_marker_idx = post_code_block_text.find(code_marker)
    if code_marker_idx == -1:
        return not_found
    code_start = code_marker_idx + len(code_marker)

    # Look for the explanation only after the code marker, so an earlier
    # "Explanation:" cannot produce an empty code slice.
    explanation_idx = post_code_block_text.find(explanation_marker, code_start)
    if explanation_idx == -1:
        return not_found

    refactored_code = post_code_block_text[code_start:explanation_idx].strip()
    explanation = post_code_block_text[explanation_idx + len(explanation_marker):].strip()
    return refactored_code, explanation

Clean the code: strip the Markdown code fences so the model doesn't get confused when seeing "```" in its inputs.

In [10]:
def clean_code(code):
    """Return ``code`` with Markdown code-fence markers removed and outer whitespace trimmed."""
    without_fences = code
    # The language-tagged fence is removed first; stripping the bare fence
    # first would leave a stray "python" behind.
    for fence_marker in ("```python", "```"):
        without_fences = without_fences.replace(fence_marker, "")
    return without_fences.strip()

In [11]:
# Smoke test: run a single fenced snippet through clean_code and the
# refactor/explain pipeline, and print both parts of the result.
original_code_example = (
    "```python def max_sum(numOnes: int, numZeros: int, numNegOnes: int, k: int) -> int: "
    "max_sum = 0 for i in range(k + 1): "
    "ones = min(i, numOnes) neg_ones = min(k - i, numNegOnes) "
    "max_sum = max(max_sum, ones - neg_ones) return max_sum ```"
)
cleaned_code = clean_code(original_code_example)
try:
    refactored_code, explanation = refactor_and_explain_code(
        model,
        tokenizer,
        cleaned_code,
    )
    print("Refactored Code:\n", refactored_code)
    print("\nExplanation:\n", explanation)
except Exception as err:
    # Report the failure instead of aborting the notebook run.
    print(f"Error processing code: {err}")

Refactored Code:
 ```python
def max_sum(num_ones: int, num_zeros: int, num_neg_ones: int, k: int) -> int:
    max_sum = 0
    for i in range(k + 1):
        ones = min(i, num_ones)
        neg_ones = min(k - i, num_neg_ones)
        max_sum = max(max_sum, ones - neg_ones)
    return max_sum
```

Explanation:
 The refactored code improves readability and maintainability by adhering to Python's PEP 8 style guide, which recommends using lowercase letters and underscores for variable and function names. This makes the code easier to read and understand, especially for those who are not familiar with Python's naming conventions. 

Additionally, the refactored code includes comments to explain what each part of the function does. This makes it easier for other developers to understand the purpose of the function and how it works. 

Finally, the refactored code uses more descriptive variable names (num_ones, num_zeros, num_neg_ones) instead of abbreviations (numOnes, numZeros, numNegOnes). Th

In [7]:
# Alternative dataset, currently disabled:
#tr_Leetcode = datasets.load_dataset('TigerResearch/tigerbot-kaggle-leetcodesolutions-en-2k')
# LeetCode dataset loaded by name via datasets.load_dataset; process_dataset
# iterates over its 'train' split.
dataset = datasets.load_dataset('RayBernard/leetcode')

Downloading readme:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/6.53M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2359 [00:00<?, ? examples/s]

In [12]:
def process_dataset(dataset, model, tokenizer, split='train', report_every=10):
    """Augment every example of a dataset split with refactored code and an explanation.

    Each example's 'output' field is cleaned with ``clean_code`` and passed to
    ``refactor_and_explain_code``; the results are collected together with the
    example's 'instruction', 'input' and 'text' fields.

    Args:
        dataset: Mapping from split name to an iterable of examples; each
            example must provide the keys 'instruction', 'input', 'output'
            and 'text'.
        model: Model forwarded to ``refactor_and_explain_code``.
        tokenizer: Tokenizer forwarded to ``refactor_and_explain_code``.
        split: Name of the split to process (default ``'train'``).
        report_every: Print progress, timing and the latest result after this
            many examples; a falsy value disables progress output.

    Returns:
        A list of dicts with the keys 'instruction', 'input', 'original_code',
        'refactored_output', 'refactor_explanation' and 'text'.
    """
    augmented_data = []
    run_start = time.time()    # Start of the whole run; never reset
    batch_start = run_start    # Start of the current reporting batch

    for index, example in enumerate(dataset[split]):
        original_code = clean_code(example['output'])
        refactored_code, explanation = refactor_and_explain_code(model, tokenizer, original_code)
        augmented_data.append({
            'instruction': example['instruction'],
            'input': example['input'],
            'original_code': original_code,
            'refactored_output': refactored_code,
            'refactor_explanation': explanation,
            'text': example['text']
        })

        # Periodic progress report: cumulative time for the cumulative count,
        # and separately the time taken by the most recent batch.
        processed = index + 1
        if report_every and processed % report_every == 0:
            now = time.time()
            print(
                f"Processed {processed} entries in {now - run_start:.2f} seconds "
                f"(last {report_every} entries took {now - batch_start:.2f} seconds)."
            )
            print("Refactored Code:\n", refactored_code)
            print("Explanation:\n", explanation)
            print("\n-------------------------------------------------------------------------------------------------------------------------\n")
            batch_start = now  # Only the batch timer is reset

    return augmented_data

# Run the augmentation over the dataset. This is slow: the recorded run
# logged about 300 seconds per 10 entries.
augmented_dataset = process_dataset(dataset, model, tokenizer)
# Save the refactored examples to a CSV file (index=False omits the row-index column)
df = pd.DataFrame(augmented_dataset)
df.to_csv('refactored_dataset.csv', index=False)

Processed 10 entries in 300.65 seconds.
Refactored Code:
 ```python
def is_match(s: str, p: str) -> bool:
    m, n = len(s), len(p)
    dp = [[False] * (n + 1) for _ in range(m + 1)]
    dp[0][0] = True

    for j in range(2, n + 1):
        if p[j - 1] == '*' and dp[0][j - 2]:
            dp[0][j] = True

    for i in range(1, m + 1):
        for j in range(1, n + 1):
            if p[j - 1] == s[i - 1] or p[j - 1] == '.':
                dp[i][j] = dp[i - 1][j - 1]
            elif p[j - 1] == '*':
                dp[i][j] = dp[i][j - 2] or (dp[i - 1][j] and (s[i - 1] == p[j - 2] or p[j - 2] == '.'))

    return dp[m][n]
```
Explanation:
 The refactored code is the same as the original code. The only difference is that the range of the inner loop has been changed from `range(1, n + 1)` to `range(1, n + 1)`. This change doesn't affect the functionality of the code, but it makes the code more readable and maintainable.

In the original code, the range of the inner loop is `range(1, n +

NameError: name 'refactored_data' is not defined

Save in JSON format 

In [17]:
import json

# Serialize the augmented examples to a JSON string, then write it to disk
augmented_json = json.dumps(augmented_dataset)
with open('augmented_dataset.json', 'w') as json_file:
    json_file.write(augmented_json)