# Setup

### Install the client library

In [1]:
!pip install -q google-generativeai

### Import libraries

In [1]:
import os
import google.generativeai as genai

Get your own API Key from: https://aistudio.google.com/app/apikey

Step 1: Export your API Key as an environment variable

- Windows: setx GEMINI_API_KEY "yourkey"

- Mac/Linux: export GEMINI_API_KEY="yourkey"

Step 2: Restart VSCode or whatever code environment you're using

In [2]:
genai.configure(api_key=os.environ['GEMINI_API_KEY'])

You can check you existing tuned models with the `genai.list_tuned_model` method.

In [3]:
for i, m in zip(range(5), genai.list_tuned_models()):
  old_model = m.name
  print(m.name)

tunedModels/generate-num-6887


# Preparing Training Data

In [4]:
import pandas as pd

data_path = "c_train0.jsonl"
tests_path = "tests_all.jsonl"

df = pd.read_json(data_path, lines=True)
tests_df = pd.read_json(tests_path, lines=True)

In [5]:
df.head()

Unnamed: 0,id,buggy_submission_id,fixed_submission_id,problem_id,user_id,buggy_code,fixed_code,labels,change_count,line_hunks,errors
0,180,271,272,p00000,u772959354,#include <stdio.h>\n\nint main(void) {\n int ...,#include <stdio.h>\n\nint main(void) {\n int ...,"[literal.string.change, call.arguments.change,...",1,1.0,
1,183,275,276,p00000,u690485037,#include <stdio.h>\n#define NINE 9\n\nint main...,#include <stdio.h>\n#define NINE 9\n\nint main...,"[literal.string.change, call.arguments.change,...",1,1.0,
2,184,277,278,p00000,u226511948,"#include <stdio.h>\n\nint main() {\n\n int i,...","#include <stdio.h>\n\nint main() {\n\n int i,...","[literal.string.change, call.arguments.change,...",3,1.0,
3,185,279,278,p00000,u226511948,"#include <stdio.h>\n\nint main() {\n\n int i,...","#include <stdio.h>\n\nint main() {\n\n int i,...","[literal.number.change, assignment.value.chang...",4,2.0,
4,187,281,278,p00000,u226511948,"#include <stdio.h>\n\nint main() {\n\n int i,...","#include <stdio.h>\n\nint main() {\n\n int i,...","[literal.number.change, assignment.value.chang...",1,1.0,


In [6]:
len(df)

22888

In [11]:
# Filter out rows where fixed code length > 5000 because the free Gemini Api has this length limit on the output.
filtered_df = df[df['fixed_code'].str.len() <= 5000]

# Merge the two DataFrames on 'problem_id' for faster access to 'input' and 'output'
merged_df = filtered_df.merge(tests_df[['problem_id', 'input', 'output']], on='problem_id', how='left')

# Create the 'text_input' and 'output' columns directly within the DataFrame
merged_df['text_input'] = (
    "Fix the given buggy code given the code's input and expected output:\n\n"
    "Buggy Code:\n" + merged_df['buggy_code'] + "\n\n"
    "Code Input:\n" + merged_df['input'] + "\n\n"
    "Code Expected Output:\n" + merged_df['output']
)
merged_df['output'] = merged_df['fixed_code']

# Filter out rows where prompt length > 40000 because the free Gemini Api has this length limit on the input.
filtered_merged_df = merged_df[merged_df['text_input'].str.len() <= 40000]

# Sample n rows per type of problem (problem_id)
n = 4
sampled_df = filtered_merged_df.groupby('problem_id', group_keys=False).apply(lambda x: x.sample(min(len(x), n), random_state=42),
                                                                              include_groups=False)

# Select relevant columns and save as CSV
sampled_df[['text_input', 'output']].to_csv('training_data.csv', index=False)

In [12]:
training_df = pd.read_csv('training_data.csv')

In [13]:
training_df.head()

Unnamed: 0,text_input,output
0,Fix the given buggy code given the code's inpu...,"\n#include <stdio.h>\n\nint main() {\n int i,..."
1,Fix the given buggy code given the code's inpu...,"#include <stdio.h>\nint main() {\n int i, j;\..."
2,Fix the given buggy code given the code's inpu...,"#include <stdio.h>\n\nint main() {\n int i, j..."
3,Fix the given buggy code given the code's inpu...,#include <stdio.h>\n\nint main(void) {\n int ...
4,Fix the given buggy code given the code's inpu...,#include <math.h>\n#include <stdio.h>\n\nint m...


In [14]:
len(training_df)

4514

# Training your model

Train it on this website: https://aistudio.google.com/app/tune

## Evaluate your model

You can use the `genai.generate_content` method and specify the name of your model to test your model performance.

In [None]:
model = genai.GenerativeModel(model_name=old_model)

In [45]:
test_df = pd.read_json("c_test0.jsonl", lines=True)

In [53]:
test_df.head()

Unnamed: 0,id,buggy_submission_id,fixed_submission_id,problem_id,user_id,buggy_code,fixed_code,labels,change_count,line_hunks,errors
0,209,310,311,p00000,u940395729,"#include <stdio.h>\nint main() {\n int i, j;\...","#include <stdio.h>\nint main() {\n int i, j;\...","[literal.string.change, call.arguments.change,...",1,1,
1,220,330,331,p00000,u673315234,#include <stdio.h>\n\ntypedef int S4;\n\n#defi...,#include <stdio.h>\n\ntypedef int S4;\n\n#defi...,"[expression.operator.compare.change, control_f...",2,1,
2,284,423,424,p00000,u611853667,"main(n, m) {\n for (; n - 9; (m %= 9) || n++)...","main(n, m) {\n for (m = 0; n <= 9; (m %= 9) |...","[control_flow.loop.for.initializer.change, con...",4,1,
3,286,426,427,p00000,u611853667,"#include <stdio.h>\n\nint main() {\n int a, b...","#include <stdio.h>\n\nint main() {\n int a, b...","[expression.operator.compare.change, control_f...",1,1,
4,374,540,541,p00000,u904505033,"#include <stdio.h>\nmain() {\n int i, j, ans;...","#include <stdio.h>\nmain() {\n int i, j, ans;...","[literal.number.change, control_flow.loop.for....",5,2,


In [74]:
bug_id = 13103
buggy_code = test_df[test_df['id'] == bug_id]['buggy_code'].iloc[0]
fixed_code = test_df[test_df['id'] == bug_id]['fixed_code'].iloc[0]
problem_id = test_df[test_df['id'] == bug_id]['problem_id'].iloc[0]

input = tests_df[tests_df['problem_id'] == problem_id]['input'].iloc[0]
output = tests_df[tests_df['problem_id'] == problem_id]['output'].iloc[0]

In [75]:
prompt = f"""Fix the given buggy code given the code's input and expected output:
    
Buggy Code:
{buggy_code}

Code Input:
{input}

Code Output:
{output}
"""

In [76]:
print(prompt)

Fix the given buggy code given the code's input and expected output:
    
Buggy Code:
#include <math.h>
#include <stdio.h>

#define MAX 999999

int Primes[MAX + 1];

int IsPrime(int n) {
  if (n < 2) {
    return 0;
  }

  else if (n == 2) {
    return 1;
  }

  else if (n % 2 == 0) {
    return 0;
  }

  else {
    int i = 3;
    for (; i < sqrt(n); i += 2) {
      if (n % i == 0) {
        return 0;
      }
    }
  }

  return 1;
}

int main(void) {
  int i;
  for (i = 1; i <= MAX; ++i) {
    Primes[i] += IsPrime(i) + Primes[i - 1];
  }

  int N;
  while (scanf("%d", &N) != EOF) {
    printf("%d\n", Primes[N]);
  }

  return 0;
}


Code Input:
10
10
126

Code Output:
4
4
30




In [None]:
result = model.generate_content(prompt)
print(result.text)

'#include <math.h>\n#include <stdio.h>\n\n#define MAX 999999\n\nint Primes[MAX + 1];\n\nint IsPrime(int n) {\n  if (n < 2) {\n    return 0;\n  }\n\n  else if (n == 2) {\n    return 1;\n  }\n\n  else if (n % 2 == 0) {\n    return 0;\n  }\n\n  else {\n    int i = 3;\n    for (; i <= sqrt(n); i += 2) {\n      if (n % i == 0) {\n        return 0;\n      }\n    }\n  }\n\n  return 1;\n}\n\nint main(void) {\n  int i;\n  for (i = 1; i <= MAX; ++i) {\n    Primes[i] += IsPrime(i) + Primes[i - 1];\n  }\n\n  int N;\n  while (scanf("%d", &N) != EOF) {\n    printf("%d\\n", Primes[N]);\n  }\n\n  return 0;\n}\n'

In [79]:
print(fixed_code)

#include <math.h>
#include <stdio.h>

#define MAX 999999

int Primes[MAX + 1];

int IsPrime(int n) {
  if (n < 2) {
    return 0;
  }

  else if (n == 2) {
    return 1;
  }

  else if (n % 2 == 0) {
    return 0;
  }

  else {
    int i = 3;
    for (; i <= sqrt(n); i += 2) {
      if (n % i == 0) {
        return 0;
      }
    }
  }

  return 1;
}

int main(void) {
  int i;
  for (i = 1; i <= MAX; ++i) {
    Primes[i] += IsPrime(i) + Primes[i - 1];
  }

  int N;
  while (scanf("%d", &N) != EOF) {
    printf("%d\n", Primes[N]);
  }

  return 0;
}



## Delete the model

You can clean up your tuned model list by deleting models you no longer need. Use the `genai.delete_tuned_model` method to delete a model. If you canceled any tuning jobs, you may want to delete those as their performance may be unpredictable.

In [None]:
genai.delete_tuned_model(old_model)

The model no longer exists:

In [None]:
try:
  m = genai.get_tuned_model(old_model)
  print(m)
except Exception as e:
  print(f"{type(e)}: {e}")

<class 'google.api_core.exceptions.NotFound'>: 404 GET https://generativelanguage.googleapis.com/v1beta/tunedModels/generate-num-8122?%24alt=json%3Benum-encoding%3Dint: Tuned model tunedModels/generate-num-8122 does not exist.


