In [None]:
from ollama import chat
from ollama import ChatResponse
import os
import sys
import re
import pandas as pd
import tqdm as tqdm
from transformers import AutoModel, AutoTokenizer
import torch
import pickle
from sklearn.neural_network import MLPClassifier
# Directory locations used by this notebook. All paths are relative, so they
# resolve against the kernel's working directory.
# NOTE(review): other cells mix "data/..." / "datasets/..." with "../data/..." /
# "../datasets/..." — confirm which working directory the notebook expects.

# Presumably the root of the Project CodeNet dataset checkout — TODO confirm.
codenet_dir = "datasets/Project_CodeNet"
# Presumably where human-written samples are stored — TODO confirm.
data_output_dir = "data/human-written"
# Directory holding the pickled problem table ("codenet-full-1.pbm.pkl") read below.
problem_output_dir = "data/ai-code"

# Testing Chat

In [None]:
# Read one student C submission into `code`; the deepseek chat cell appends it
# to its prompt.
# read() returns the text verbatim. Joining readlines() with "\n" would double
# every line break, because readlines() keeps each line's trailing newline.
# The `with` block closes the handle as soon as the text has been read.
with open("../datasets/VedranLjubovic/chosen/Z1-Z1-4647.c", "r") as file:
    code = file.read()

In [None]:
# Load the pickled problem table and print the 'question' stored under index label 39.
problems_pickle = problem_output_dir + "/codenet-full-1.pbm.pkl"
codeProblems = pd.read_pickle(problems_pickle)
selected_row = codeProblems.loc[39]
problem = selected_row['question']
print(problem)

In [None]:
# Ask the llama3.1 model for a C solution to the selected problem statement.
# The system message restricts the reply to C code; the user message carries
# the problem text loaded in the previous cell.
system_message = {
    'role': 'system',
    'content': 'Write only C language code for the given coding question',
}
user_message = {
    'role': 'user',
    'content': problem,
}
response = chat(
    model='llama3.1',  # original inline note here read "14b"
    messages=[system_message, user_message],
)
print(response)

In [None]:
# Ask deepseek-r1:14b to reverse-engineer an assignment statement (as a JSON
# object with 'Assignment' and 'Summary' fields) from the student code in `code`.
assignment_prompt = """Generate a first-year university assignment question that aligns with the following code written by a student. The code may deviate from the original assignment requirements, so avoid focusing on specific details like variable names. Instead, derive a general problem statement that captures the core concept and intended learning outcomes of the assignment.

Write the assignment question in clear, structured English, formatted into paragraphs, providing clear learning outcomes. Additionally, provide a brief summary of the assignment to give an overview of its purpose.
Please respond with only a single JSON object and no additional text. The JSON object should contain two fields: 'Assignment' and 'Summary'. The 'Assignment' field should contain the assignment question, and the 'Summary' field should contain a brief summary of the assignment."""

response = chat(model='deepseek-r1:14b', messages=[
        {
            'role': 'user',
            # A blank line separates the instructions from the submission;
            # without it the final sentence of the prompt runs straight into
            # the first line of the student's code.
            'content': assignment_prompt + "\n\n" + code,
        },
    ])

In [None]:
# Show the full reply object returned by the most recent chat() call.
print(response)

# Testing Processing Data (regex)

In [None]:
# Canned reply used to exercise the regex cells: a <think>...</think> block
# followed by free text that contains two '---' markers.
sampleResponse = """
<think>
Okay, so I have to generate a university assignment question based on the provided code. Let me first try to understand what the code is doing.

Looking at the code, it seems like it's reading multiple scores for different students: Tarik, Bojan, and Mirza. Each has an "I parcijalni ispit" (first partial exam), "II parcijalni ispit" (second partial exam), Prisustvo (attendance), Zadace (assignment), and Zavrsni ispit (final exam). The code calculates total scores for each student by summing these components. Then, it computes overall grades based on these totals using specific grade thresholds.

The main function calculates three variables: bb for Bojan's total, bm for Mirza's, and bt for Tarik's. Each of these is checked against ranges to determine the final grade (ob, om, ot respectively). The code then checks if all three students have passed by seeing if their grades are above 5. It also handles cases where some but not all have passed or where they all pass with possibly different grades.

So, the assignment question should reflect this structure: reading scores for each exam point, calculating totals, determining final grades based on those totals, and then evaluating the results to see how many students have passed and their relative performance.

I need to make sure the assignment is clear in English, broken into paragraphs, and surrounded by 'EOF'. It should include reading specific variables, checking validity of inputs, calculating totals, assigning grades, and analyzing the results.    
</think>
asdfjadslkjf
---a
asdfasd
asdfasdf
---
sdfds
"""

# Minimal string used by the second regex experiment.
sample2 = """hello?"""

In [None]:
# Group 1 captures the <think> body; group 2 captures the text between the
# first two '---' markers after it. DOTALL lets '.' span line breaks.
think_then_fenced = re.compile(r'<think>(.*?)</think>.*?---(.*?)---', re.DOTALL)
matches = think_then_fenced.search(sampleResponse)
print(matches.group(2))

In [None]:
# Find the first 'h' and capture the run of 'e'/'l' characters that follows it,
# then print all captured groups as a tuple.
h_then_el_run = re.compile(r'h([el]*)')
matches = h_then_el_run.search(sample2)
print(matches.groups())

# Testing Data Importing (pandas)

In [None]:
# init: start from an empty frame that only declares the two columns
# the following cells read and write.
column_names = ['question', 'identifier']
source = pd.DataFrame(columns=column_names)

In [None]:
# Show the current contents of the `source` frame.
print(source)

In [None]:
# inserting: write the new row under the temporary label -1, bump every label
# by one, then re-sort by label so the new row (now labelled 0) comes first.
source.loc[-1] = ['test', 'id2']
source.index += 1
source = source.sort_index()


In [None]:
# Overwrite 'question' on every row whose identifier is 'id2', then show those rows.
is_id2 = source['identifier'] == 'id2'
source.loc[is_id2, 'question'] = "something2"
print(source.loc[is_id2])

In [None]:
# Prints True when no row carries the identifier 'id3'.
id3_rows = source.loc[source['identifier'] == 'id3']
print(id3_rows.empty)


In [None]:
# DataFrame.insert() adds a *column* and needs (loc, column, value), so calling
# it with a position and a list of row values raises TypeError. To add the
# record as a row, reuse the steps from the "inserting" cell: write it under
# the temporary label -1, shift all labels up by one and re-sort so it lands first.
source.loc[-1] = ['testingg', 'id5']
source.index = source.index + 1
source = source.sort_index()


# Testing (transformers)

In [None]:
# Load the prepared code table from its pickle and print it for inspection.
prepared_code_path = "data/prepared/code.pkl"
merged = pd.read_pickle(prepared_code_path)
print(merged)

In [None]:

# Build a stand-in 'embeddings' column next to the labels: each row's value is
# simply that row's index label.
print(f"Loaded code. Count: {merged.shape}")

# Take the labels straight from the index. Looping with iterrows() and using
# each label as a *list position* only works for a default 0..n-1 index, and
# it materialises every row without ever reading it.
embeddings = list(merged.index)

output = merged['label'].to_frame()
output['embeddings'] = embeddings
print(output)

# Testing (embeddings)

In [None]:
# Load the prepared embeddings table from its pickle.
embeddings_pickle = "data/prepared/embeddings.pkl"
embed = pd.read_pickle(embeddings_pickle)

In [None]:
# Show the object loaded from the embeddings pickle.
print(embed)

# Testing Questions

In [None]:
# Load the pickled questions table and print it for inspection.
questions_pickle = "data/ai-code/questions.pkl"
questions = pd.read_pickle(questions_pickle)
print(questions)

# Testing Models

In [None]:
# Load the pickled model bundle from disk into `model`.
# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load model files that come from a trusted source.
model_path = "data/models-nn.file"
# The context manager closes the handle once unpickling is done instead of
# leaving it open for the lifetime of the kernel.
with open(model_path, "rb") as file:
    model = pickle.load(file)

In [None]:
# Print the parameters of the first object stored under 'code_' in the loaded
# bundle, followed by those of a freshly constructed MLPClassifier for comparison.
stored_estimator = model['code_'][0]
print(stored_estimator.get_params())
default_estimator = MLPClassifier()
print(default_estimator.get_params())

In [None]:
# Load the pickled test programs table and print it for inspection.
programs_pickle = "datasets/test/programs.pkl"
oj = pd.read_pickle(programs_pickle)
print(oj)

In [None]:
# Load the pickled generated-code table, print it, then print the first entry
# of its 'code' column on its own.
# NOTE(review): this rebinds `code`, which earlier held the text of a C source
# file; cells that expect the string must not be re-run after this one.
generated_code_pickle = "data/ai-code/codenet-14b.code.pkl"
code = pd.read_pickle(generated_code_pickle)
print(code)

thing = code['code'].tolist()
first_entry = thing[0]
print(first_entry)

In [None]:
# Load the pickled CodeBERT embeddings table and print it for inspection.
codebert_embeddings_pickle = "../data/prepared/codenet-codebert.emb.pkl"
code2 = pd.read_pickle(codebert_embeddings_pickle)
print(code2)

In [None]:
def _flatten_to_list(embedding):
    """Return the embedding flattened to a plain list; None is passed through unchanged."""
    if embedding is None:
        return None
    return embedding.flatten().tolist()


# Replace every stored embedding with its flattened list form, then show the result.
code2['code_embeddings'] = code2['code_embeddings'].apply(_flatten_to_list)
print(code2)

In [None]:
# Load the pickled Gemini embeddings table and print it for inspection.
gemini_embeddings_pickle = 'data/ai-code/codenet-gemini-1.emb.pkl'
code_file = pd.read_pickle(gemini_embeddings_pickle)
print(code_file)

In [None]:
# Count how many rows are labelled 'ai' and how many 'human' in the
# 'actual label' column, and print both counts.
actual_labels = code_file['actual label']
ai_stat = int((actual_labels == 'ai').sum())
human_stat = int((actual_labels == 'human').sum())
print(ai_stat, human_stat)

In [None]:
# Load the 'neulab/codebert-c' tokenizer and model.
# Pick the GPU when one is available and fall back to the CPU otherwise, so
# loading the model does not crash on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# NOTE(review): trust_remote_code=True allows code shipped with the model
# repository to run locally — confirm it is actually required for this model.
tokenizer = AutoTokenizer.from_pretrained('neulab/codebert-c', trust_remote_code=True)
# NOTE: this rebinds `model`, which earlier held the unpickled model bundle.
model: AutoModel = AutoModel.from_pretrained('neulab/codebert-c', trust_remote_code=True).to(device)

In [None]:
# Sample C function used as tokenizer input.
c_snippet = """

void printAPoint (struct Point arr5 [MAX]) {
    for (int i = 1; i <= MAX; i++) {
        printf("Point %d: \n", i);
        printf("X-Coordinate = %d \n", arr5[i-1].x);
        printf("Y-Coordinate = %d \n", arr5[i-1].y);

        // formatting stuff
        if (i != MAX) {
            printf("\n");
        }
    }
}"""

# Split the snippet into tokens, then map each token to its vocabulary id.
tokens = tokenizer.tokenize(c_snippet)
token_ids = tokenizer.convert_tokens_to_ids(tokens)
# NOTE(review): this prints len(tokenizer), not len(tokens) — confirm which
# of the two was meant to be inspected here.
print(len(tokenizer))


In [None]:
# Run the token ids through the model and print the first element of its
# output as a NumPy array.
# Place the input on whatever device the model lives on instead of assuming
# 'cuda', and add a leading batch dimension of size 1.
input_ids = torch.tensor(token_ids, device=model.device).unsqueeze(0)

# Inference only: no_grad() skips building the autograd graph, so the result
# can be converted to NumPy without a separate detach().
with torch.no_grad():
    context_embeddings = model(input_ids)[0]

print(context_embeddings.cpu().numpy())