In [32]:
from dotenv import load_dotenv
import os

load_dotenv()

True

## GPT-4o Judge

In [2]:
from openai import OpenAI

# Look up the OpenAI key in the process environment; abort right away when it is absent
# so later API calls do not fail with a less obvious error.
openai_api_key = os.getenv('OPENAI_API_KEY')
if openai_api_key is None:
    raise ValueError("API Key is not set.")

# Module-level client used by the OpenAI-backed get_completion defined below.
# Per the original note, passing api_key explicitly is the default and can be omitted.
client = OpenAI(api_key=openai_api_key)

def get_completion(prompt, model="gpt-3.5-turbo"):
    """
    Send one user message to the OpenAI chat completions API and return the reply text.

    Parameters:
    prompt (str): Text sent as the content of the single user message.
    model (str): Name of the chat model to query. Defaults to "gpt-3.5-turbo".

    Returns:
    The content of the first choice's message in the API response.
    """
    # NOTE(review): the section heading names a GPT 4o judge while the default model
    # here is gpt-3.5-turbo -- pass `model` explicitly if a different judge is intended.
    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": prompt,
            }
        ],
        # Forward the caller's choice; this used to be hard-coded, which silently
        # ignored the `model` argument.
        model=model,
    )
    return chat_completion.choices[0].message.content

## Llama 3

In [33]:
#https://github.com/ollama/ollama-python?tab=readme-ov-file
import ollama

def get_completion(prompt, model='llama3'):
    """
    Send one user message to a local model through ollama.chat and return the reply text.

    NOTE: this definition has the same name as the OpenAI-backed get_completion
    earlier in the notebook, so whichever cell ran last is the one used by
    is_code / code_language.

    Parameters:
    prompt (str): Text sent as the content of the single user message.
    model (str): Name of the Ollama model to query. Defaults to 'llama3'.

    Returns:
    The value stored under response['message']['content'].
    """
    response = ollama.chat(
        model=model,
        messages=[
            {
                'role': 'user',
                'content': prompt,
            },
        ],
    )
    return response['message']['content']

In [34]:
def is_code(row):
    """
    Ask the judge model whether a question-answer row contains code.

    Parameters:
    row (pd.Series): String fields of one example; they are joined with newlines
        via row.str.cat and embedded in the prompt.

    Returns:
    str: "Yes" or "No" when the reply's first word is recognisably one of those
    (ignoring case, surrounding whitespace and punctuation/markdown markers);
    otherwise the reply with surrounding whitespace removed.
    """
    text = row.str.cat(sep='\n')
    prompt = f"""
            I need you to analyze a question answer sequence of text and determine whether it contains code written 
            in any programming language. Please respond with "Yes" if the text contains code and "No" if it does not. 
            All langauages are valid, including markup languages. However, pseudocode is not valid. Do not get confused
            by dialogue, math or code-like text.
            Input:
                {text}
            Your Response:
                       
            """
    answer = get_completion(prompt)

    # Chat models often wrap the verdict ("Yes.", " yes\n", "**No** because ...").
    # The caller compares the upper-cased result to exactly "YES", so reduce the
    # reply to its leading verdict word before returning it.
    words = answer.split()
    first_word = words[0].strip('"\'`*.,;:!?()[]').lower() if words else ''
    if first_word == 'yes':
        return "Yes"
    if first_word == 'no':
        return "No"
    # Unrecognised reply: hand it back (trimmed) so the caller can still inspect it.
    return answer.strip()

In [35]:
def code_language(row):
    """
    Ask the judge model which programming language appears in a question-answer row.

    Parameters:
    row (pd.Series): String fields of one example; they are joined with newlines
        and embedded in the prompt.

    Returns:
    The model's reply exactly as returned by get_completion.
    """
    qa_text = row.str.cat(sep="\n")
    prompt = f"""Analyze the given question-answer sequence containing code. Identify the programming 
    language used in the code. Respond with only the name of the programming language.
    Input
        {qa_text}
    Your response:
        
    """
    return get_completion(prompt)

## Main

In [38]:
import pandas as pd
import csv

def append_dataframe_to_csv(df, file_path):
    """
    Appends a DataFrame to a CSV file, handling complex text with new lines, quotes, and commas.

    Every field is quoted (csv.QUOTE_ALL) and the index is not written. The header
    row is emitted only when the target file does not exist yet or is empty;
    otherwise the rows are appended below the existing content.

    Parameters:
    df (pd.DataFrame): The DataFrame to append.
    file_path (str): The path to the CSV file.
    """
    # Decide on the header from the file's state instead of opening it for reading:
    # no unused handle is kept open during the write, and an existing-but-empty
    # file still receives its header row.
    needs_header = not os.path.exists(file_path) or os.path.getsize(file_path) == 0
    # Append mode creates the file when it is missing, so one call covers both cases.
    df.to_csv(file_path, mode='a', index=False, header=needs_header, quoting=csv.QUOTE_ALL)

In [39]:
from datasets import load_dataset

# Load the training split and keep only the three text columns, in prompt order.
dataset_name = "yahma/alpaca-cleaned"
dataset = load_dataset(dataset_name)
new_order = ['instruction', 'input', 'output']
df = pd.DataFrame(dataset['train'])[new_order]
# Label columns, filled in row by row below.
df['code'] = None
df['code_language'] = None

# NOTE: results are appended to csv_path, so remove the file before a fresh run
# to avoid duplicated rows.
csv_path = 'temp.csv'
chunk_size = 500


def _flush_rows(rows):
    """Write the buffered row dicts to csv_path as one batch; does nothing for an empty buffer."""
    if rows:
        append_dataframe_to_csv(pd.DataFrame(rows, columns=df.columns), csv_path)


# Buffer labelled rows as plain dicts and build a DataFrame once per batch, rather
# than growing a DataFrame with pd.concat on every iteration.
pending_rows = []
try:
    for i, row in df.iterrows():
        verdict = is_code(row).upper()
        # Only ask for the language when the judge reported code.
        language = code_language(row) if verdict == "YES" else "N/A"
        df.at[i, 'code'] = verdict
        df.at[i, 'code_language'] = language

        pending_rows.append({**row.to_dict(), 'code': verdict, 'code_language': language})

        # Write to CSV every chunk_size rows.
        if len(pending_rows) >= chunk_size:
            # Detach the batch first so a failed write is not retried by the
            # `finally` clause below.
            batch, pending_rows = pending_rows, []
            _flush_rows(batch)
finally:
    # Runs after normal completion and also on errors or a manual interrupt, so
    # rows that were already labelled are not lost.
    _flush_rows(pending_rows)


KeyboardInterrupt: 

## Analysis

In [None]:
# Summarise the labelled results: total rows, rows judged to contain code, and
# the distribution of reported languages among those rows.
df = pd.read_csv('code_no_code.csv')
num_rows = len(df)
print("Total number of rows:", num_rows)

df_code = df[df['code'] == 'YES']
# Count the filtered frame; counting `df` here made "rows without code" always 0.
num_code_rows = len(df_code)
print(f"Number of rows with code: {num_code_rows}")
print(f"Number of rows without code: {num_rows - num_code_rows}")

# The labelling cell names this column 'code_language'; fall back to 'language'
# for files that use the older/alternative name.
language_column = 'code_language' if 'code_language' in df_code.columns else 'language'
language_counts = df_code[language_column].value_counts()
print(f"Language counts: {language_counts}")