# SAS Knowledge Enhancement Using CodeLlama & ChatGPT-4o
This notebook demonstrates how to enhance CodeLlama's SAS programming knowledge
by leveraging ChatGPT-4o as a teacher model in a knowledge distillation pipeline.

In [None]:
# Step 1: Install Required Libraries
# Make sure to install the necessary libraries before running this notebook.
!pip install transformers datasets openai evaluate peft

In [None]:
# Step 2: Load CodeLlama Model and Tokenizer
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

In [None]:
# Load the pretrained CodeLlama model and its tokenizer from the Hugging Face Hub.
# The Transformers-format checkpoints are published with the "-hf" suffix.
model_name = "codellama/CodeLlama-7b-hf"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

In [None]:
# Wrap the loaded model and tokenizer in a text-generation pipeline so later
# cells can generate completions with a single call.
llm_pipeline = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
)

In [None]:
# Step 3: Load SAS Validation Dataset
from datasets import load_dataset

In [None]:
# Read the SAS validation examples from the local JSON Lines file and expose
# them under the 'validation' split.
validation_data = load_dataset(
    path='json',
    data_files=dict(validation='sas_validation.jsonl'),
)

In [None]:
# Step 4: Baseline Evaluation of CodeLlama's SAS Knowledge
import evaluate

In [None]:
# Load the BLEU metric; it is used below as the similarity measure between
# generated and expected code.
bleu = evaluate.load(path="bleu")

In [None]:
# Function to evaluate the model's performance
def evaluate_model(validation_data):
    """Generate a completion for every validation prompt and score it with BLEU.

    Args:
        validation_data: Mapping with a 'validation' split whose items provide
            'prompt' and 'expected_output' string fields.

    Returns:
        A list of dicts, one per item, with the keys 'prompt', 'expected',
        'generated' and 'bleu_score'.
    """
    results = []
    for item in validation_data['validation']:
        prompt = item['prompt']
        expected = item['expected_output']

        # return_full_text=False makes the pipeline return only the newly
        # generated text. By default the prompt is echoed back in front of the
        # completion, which would be scored against the reference as well.
        generated = llm_pipeline(
            prompt, max_length=512, return_full_text=False
        )[0]['generated_text']

        if generated.strip() and expected.strip():
            bleu_value = bleu.compute(
                predictions=[generated], references=[expected]
            )['bleu']
        else:
            # BLEU is undefined for an empty hypothesis or reference; record
            # the worst score instead of risking a division by zero in the metric.
            bleu_value = 0.0

        results.append({
            'prompt': prompt,
            'expected': expected,
            'generated': generated,
            'bleu_score': bleu_value
        })
    return results

In [None]:
# Score the baseline model on the validation split; each entry of the result
# carries the prompt, the expected output, the generation and its BLEU score.
eval_results = evaluate_model(validation_data=validation_data)

In [None]:
# Step 5: Identify Knowledge Gaps
# Keep only the evaluation records whose BLEU score falls under the 0.7 cutoff.
gaps = list(filter(lambda record: record['bleu_score'] < 0.7, eval_results))

In [None]:
# Step 6: Query ChatGPT-4o for Corrections
import openai

In [None]:
# Read the OpenAI API key from the environment instead of hardcoding it in the
# notebook, so the secret is never saved or committed with the file.
# Export OPENAI_API_KEY before starting the kernel; a missing variable raises
# KeyError here rather than failing later on the first API call.
import os

openai.api_key = os.environ["OPENAI_API_KEY"]

In [None]:
# Function to query ChatGPT-4o
def query_gpt4o(prompt):
    """Send a single user message to the gpt-4o chat model and return its reply.

    Args:
        prompt: Text sent as the content of the user message.

    Returns:
        The content of the first choice's message.
    """
    messages = [{"role": "user", "content": prompt}]
    if hasattr(openai, "chat"):
        # openai>=1.0 removed `openai.ChatCompletion`; chat requests go
        # through `openai.chat.completions` instead.
        response = openai.chat.completions.create(model="gpt-4o", messages=messages)
    else:
        # Older SDK releases only expose the legacy entry point.
        response = openai.ChatCompletion.create(model="gpt-4o", messages=messages)
    # Attribute access works on both the legacy and the current response
    # objects; subscripting the message does not work on the current one.
    return response.choices[0].message.content

In [None]:
# Ask ChatGPT-4o to review every low-scoring generation and collect the
# answers as prompt/completion training pairs.
corrected_data = []
for gap in gaps:
    correction_prompt = f"""
    Review the following SAS code generated by an AI model. Correct any errors or suggest improvements:
    ---
    {gap['generated']}
    ---
    Original question:
    {gap['prompt']}
    """
    # Pair the original question with the teacher model's reply.
    corrected_data.append({
        'prompt': gap['prompt'],
        'completion': query_gpt4o(correction_prompt),
    })

In [None]:
# Step 7: Fine-Tune CodeLlama with Corrected Data
from transformers import Trainer, TrainingArguments
from datasets import Dataset

In [None]:
# Turn the list of prompt/completion dicts into a Hugging Face Dataset.
corrected_dataset = Dataset.from_list(mapping=corrected_data)

In [None]:
# Tokenization
def tokenize(batch):
    """Tokenize a batch of examples for causal-LM fine-tuning.

    Args:
        batch: Batched mapping with a 'prompt' column and, optionally, a
            'completion' column (lists of strings).

    Returns:
        The tokenizer output, padded/truncated to 512 tokens, with an added
        'labels' entry so the Trainer can compute a language-modeling loss.
    """
    # Padding requires a pad token; fall back to EOS when the tokenizer does
    # not define one.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    prompts = batch['prompt']
    if 'completion' in batch:
        # Train on prompt + corrected answer (terminated by EOS); tokenizing
        # the prompt alone would discard the corrections entirely.
        eos = tokenizer.eos_token or ""
        texts = [
            prompt + "\n" + completion + eos
            for prompt, completion in zip(prompts, batch['completion'])
        ]
    else:
        # Prompt-only data: keep the text as-is.
        texts = prompts

    encoded = tokenizer(texts, truncation=True, padding='max_length', max_length=512)

    # Labels mirror the input ids; padded positions get -100 so they are
    # ignored by the loss.
    encoded['labels'] = [
        [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
        for ids, mask in zip(encoded['input_ids'], encoded['attention_mask'])
    ]
    return encoded

In [None]:
# Apply the tokenizer to the corrected examples in batches.
tokenized_data = corrected_dataset.map(function=tokenize, batched=True)

In [None]:
# Fine-tuning parameters
# No evaluation strategy is set: the Trainers in this notebook are built
# without an eval_dataset, and requesting per-epoch evaluation without one
# makes the Trainer raise.
training_args = TrainingArguments(
    output_dir="./sas_finetuned_model",  # checkpoints are written here
    per_device_train_batch_size=4,
    num_train_epochs=2,
    learning_rate=2e-5,
    save_total_limit=2  # keep at most two checkpoints on disk
)

In [None]:
# Build the trainer for the first fine-tuning pass over the corrected examples.
trainer = Trainer(model=model, args=training_args, train_dataset=tokenized_data)

In [None]:
# Start fine-tuning: run the first training pass on the ChatGPT-corrected
# examples, using the trainer and training arguments built above.
trainer.train()

In [None]:
# Step 8: Final Fine-Tuning with Business-Specific SAS Code
# Load the business-specific SAS dataset
business_data = load_dataset('json', data_files={'train': 'business_sas_data.jsonl'})
# load_dataset returns a DatasetDict keyed by split name when data_files is a
# dict. Trainer expects a single Dataset, so tokenize the 'train' split itself.
tokenized_business_data = business_data['train'].map(tokenize, batched=True)

In [None]:
# Build the trainer for the second pass, reusing the same model and training
# arguments but training on the business-specific data.
final_trainer = Trainer(model=model, args=training_args, train_dataset=tokenized_business_data)

In [None]:
# Start the final training phase: run the second fine-tuning pass, this time
# on the tokenized business-specific SAS data.
final_trainer.train()

In [None]:
# Step 9: Save the Final Model
# Write the model and its tokenizer to the same directory so they can be
# reloaded together.
final_model_dir = "./final_business_sas_model"
model.save_pretrained(final_model_dir)
tokenizer.save_pretrained(final_model_dir)

This notebook demonstrates one pass of an evaluate, correct, and fine-tune workflow for improving CodeLlama's SAS knowledge:
ChatGPT-4o supplies corrections for low-scoring outputs, and a final fine-tuning stage on business-specific SAS code adds domain knowledge.