In [1]:
from ollama import chat
from ollama import ChatResponse
import os
import sys
import re
import pandas as pd
import tqdm as tqdm
from transformers import AutoModel, AutoTokenizer
import torch
import pickle
from sklearn.neural_network import MLPClassifier


# Testing Chat

In [3]:
file = open("../datasets/VedranLjubovic/chosen/Z1-Z1-4647.c", "r")
code = "\n".join(file.readlines())

In [4]:
response = chat(model='deepseek-r1:14b', messages=[
        {
            'role': 'user',
            'content': """Generate a first-year university assignment question that aligns with the following code written by a student. The code may deviate from the original assignment requirements, so avoid focusing on specific details like variable names. Instead, derive a general problem statement that captures the core concept and intended learning outcomes of the assignment.

Write the assignment question in clear, structured English, formatted into paragraphs, providing clear learning outcomes. Additionally, provide a brief summary of the assignment to give an overview of its purpose.
Please respond with only a single JSON object and no additional text. The JSON object should contain two fields: 'Assignment' and 'Summary'. The 'Assignment' field should contain the assignment question, and the 'Summary' field should contain a brief summary of the assignment.""" + code,
        },
    ])

In [5]:
print(response)

model='deepseek-r1:14b' created_at='2025-03-04T03:25:59.1902979Z' done=True done_reason='stop' total_duration=131339204800 load_duration=13572400 prompt_eval_count=1768 prompt_eval_duration=3025000000 eval_count=1117 eval_duration=128299000000 message=Message(role='assistant', content="<think>\nOkay, so the user provided a C program that calculates final grades for three students based on their scores. I need to generate an assignment question that aligns with this code but without focusing on specific details like variable names. \n\nFirst, I should figure out what the main functionality of the code is. It looks like it's taking various inputs such as partial exam scores, attendance, homework, and final exam for each student. Then, it calculates a total score for each and determines their final grade based on that total.\n\nThe program also has some logic to check how many students passed (score above 55) and whether any have the same grades or all different. So, the core concept is a

# Testing Processing Data (regex)

In [77]:
sampleResponse = """
<think>
Okay, so I have to generate a university assignment question based on the provided code. Let me first try to understand what the code is doing.

Looking at the code, it seems like it's reading multiple scores for different students: Tarik, Bojan, and Mirza. Each has an "I parcijalni ispit" (first partial exam), "II parcijalni ispit" (second partial exam), Prisustvo (attendance), Zadace (assignment), and Zavrsni ispit (final exam). The code calculates total scores for each student by summing these components. Then, it computes overall grades based on these totals using specific grade thresholds.

The main function calculates three variables: bb for Bojan's total, bm for Mirza's, and bt for Tarik's. Each of these is checked against ranges to determine the final grade (ob, om, ot respectively). The code then checks if all three students have passed by seeing if their grades are above 5. It also handles cases where some but not all have passed or where they all pass with possibly different grades.

So, the assignment question should reflect this structure: reading scores for each exam point, calculating totals, determining final grades based on those totals, and then evaluating the results to see how many students have passed and their relative performance.

I need to make sure the assignment is clear in English, broken into paragraphs, and surrounded by 'EOF'. It should include reading specific variables, checking validity of inputs, calculating totals, assigning grades, and analyzing the results.    
</think>
asdfjadslkjf
---a
asdfasd
asdfasdf
---
sdfds
"""

sample2 = """hello?"""

In [92]:
matches = re.search(r'<think>(.*?)</think>.*?---(.*?)---', sampleResponse, re.DOTALL)
print(matches[2])

a
asdfasd
asdfasdf



In [70]:
matches = re.search(r'h([el]*)', sample2)
print(matches.groups())

('ell',)


# Testing Data Importing (pandas)

In [None]:
# init
source = pd.DataFrame(columns=['question', 'identifier'])

In [None]:
print(source)

    question identifier
0  something        id2


In [None]:
# inserting
source.loc[-1] = ['test', 'id2']
source.index = source.index + 1  # shifting index
source = source.sort_index()  # sorting by index


In [None]:
source.loc[(source['identifier'] == 'id2'), 'question'] = "something2"
print(source.loc[(source['identifier'] == 'id2')])

     question identifier
0  something2        id2


In [None]:
print(source.loc[(source['identifier'] == 'id3')].empty)


True


In [None]:
source.insert(-1,['testingg', 'id5'])


TypeError: DataFrame.insert() missing 1 required positional argument: 'value'

# Testing (transformers)

In [17]:
merged = pd.read_pickle("data/prepared/code.pkl")
print(merged)

                                                  code  label
0    #include <stdio.h>\n\n#include <string.h>\n\n#...  human
1    #include <stdio.h>\n\n#include <string.h>\n\n#...  human
2    #include <stdio.h>\n\n#include <math.h>\n\n#in...  human
3    #include <stdio.h>\n\n#include <ctype.h>\n\n#i...  human
4    #include <stdio.h>\n\n#include <stdlib.h>\n\n#...  human
..                                                 ...    ...
101  \n#include <stdio.h>\n#include <stdlib.h>\n\ni...    llm
102  pp\n#include <iostream>\n#include <sstream>\n#...    llm
103  \nint a, b, c;\nscanf("%d%d%d", &a, &b, &c);\n...    llm
104  \n#include <stdio.h>\n#include <stdlib.h>\n#in...    llm
105  \n#include <stdio.h>\n\nint main() {\n    char...    llm

[106 rows x 2 columns]


In [18]:

print(f"Loaded code. Count: {merged.shape}")
embeddings = merged['code'].tolist()
output = merged['label'].to_frame()
for index, row in merged.iterrows():
  embeddings[index] = index

output['embeddings'] = embeddings
print(output)

Loaded code. Count: (106, 2)
     label  embeddings
0    human           0
1    human           1
2    human           2
3    human           3
4    human           4
..     ...         ...
101    llm         101
102    llm         102
103    llm         103
104    llm         104
105    llm         105

[106 rows x 2 columns]


# Testing (embeddings)

In [22]:
embed = pd.read_pickle("data/prepared/embeddings.pkl")

In [23]:
print(embed)

    actual label                                    code_embeddings
0          human  [tensor(0.0473), tensor(0.0058), tensor(-0.105...
1          human  [tensor(0.0279), tensor(0.0271), tensor(-0.131...
2          human  [tensor(0.0167), tensor(0.0246), tensor(-0.155...
3          human  [tensor(0.0357), tensor(0.0082), tensor(-0.140...
4          human  [tensor(0.0058), tensor(-0.0067), tensor(-0.14...
..           ...                                                ...
101          llm  [tensor(-0.0508), tensor(-0.0017), tensor(-0.1...
102          llm  [tensor(-0.0224), tensor(0.0025), tensor(-0.05...
103          llm  [tensor(-0.0389), tensor(0.0448), tensor(-0.05...
104          llm  [tensor(-0.0577), tensor(0.0204), tensor(-0.16...
105          llm  [tensor(-0.0310), tensor(0.0079), tensor(-0.07...

[106 rows x 2 columns]


# testing Questions

In [None]:
questions = pd.read_pickle("data/ai-code/questions.pkl")
print(questions)

# Testing Models

In [7]:
model_path = "data/models-nn.file"
file = open(model_path, "rb")
model = pickle.load(file)

In [11]:
print(model['code_'][0].get_params())
print(MLPClassifier().get_params())

{'activation': 'relu', 'alpha': 0.0001, 'batch_size': 'auto', 'beta_1': 0.9, 'beta_2': 0.999, 'early_stopping': False, 'epsilon': 1e-08, 'hidden_layer_sizes': (100,), 'learning_rate': 'constant', 'learning_rate_init': 0.001, 'max_fun': 15000, 'max_iter': 200, 'momentum': 0.9, 'n_iter_no_change': 10, 'nesterovs_momentum': True, 'power_t': 0.5, 'random_state': None, 'shuffle': True, 'solver': 'adam', 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': False, 'warm_start': False}
{'activation': 'relu', 'alpha': 0.0001, 'batch_size': 'auto', 'beta_1': 0.9, 'beta_2': 0.999, 'early_stopping': False, 'epsilon': 1e-08, 'hidden_layer_sizes': (100,), 'learning_rate': 'constant', 'learning_rate_init': 0.001, 'max_fun': 15000, 'max_iter': 200, 'momentum': 0.9, 'n_iter_no_change': 10, 'nesterovs_momentum': True, 'power_t': 0.5, 'random_state': None, 'shuffle': True, 'solver': 'adam', 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': False, 'warm_start': False}


In [3]:
oj = pd.read_pickle("datasets/test/programs.pkl")
print(oj)

           0                                                  1   2
0          0  int main()\n{\n\tint a;\n\tint bai,wushi,ershi...  97
1          1  int main()\n{\n    int m,x100,x50,x20,x10,x5,x...  97
2          2  int main()\n{\n    int n,i,shuzu[111],count1=0...  97
3          3  int main()\n{\n\tint n,a1=0,a2=0,a3=0,a4=0,a5=...  97
4          4  int main()\n{\n\tint n,a,b,c,d,e,f;\n\ta=0;b=0...  97
...      ...                                                ...  ..
51996  51996  int topjudge(int a[][20],int m,int n,int p,int...  72
51997  51997  int main()\n{\n    int m,n;\n    cin>>m>>n;\n ...  72
51998  51998  void sort (int *data,int n)\n{\n     int i,j,*...  72
51999  51999  void main()\n{\n\tint m,n,i,j,a[20][20],b[20][...  72
52000  52000  int main()\n{\n\tint s[100][100],m,n,i,j,k,l;\...  72

[52001 rows x 3 columns]


In [17]:
code = pd.read_pickle("data/ai-code/codenet-14b.code.pkl")
print(code)
thing = code['code'].tolist()
print(thing[0])

  identifier                                               code actual label
0     p04030  #include <stdio.h>\n\nint main() {\n    char b...           ai
1     p03242  #include <stdio.h>\n\nint main() {\n    int n;...           ai
2     p03001  #include <stdio.h>\n#include <stdlib.h>\n#incl...           ai
3     p02971                                               None    ai-failed
4     p02407  #include <stdio.h>\n\nint main() {\n    int n;...           ai
5     p02400  #include <stdio.h>\n\nint main() {\n    double...           ai
6     p02256  #include <stdio.h>\n\nint main() {\n    int a,...           ai
7     p00002  #include <stdio.h>\n\nint main() {\n    int a,...           ai
#include <stdio.h>

int main() {
    char buffer[20];
    int top = 0;

    char c;
    while ((c = getchar()) != '\n' && c != EOF) {
        if (c == '0' || c == '1') {
            if (top < 20) {
                buffer[top++] = c;
            }
        } else {
            if (top > 0) {
                

In [27]:
code2 = pd.read_pickle("data/prepared/codenet.test.emb.pkl")
print(code2)

   actual label                                    code_embeddings
51        human  [-0.0010866094, -0.014708949, -0.056743935, -0...
61        human  [-0.022720896, 0.02549858, -0.063253075, 0.005...
36        human  [-0.037212994, 0.079306334, -0.048354577, 0.05...
93          llm  [-0.0019667156, 0.03843533, -0.06323702, 0.007...
73        human  [-0.040071465, 0.0051495354, -0.056042094, -0....
70        human  [-0.026629213, 0.022276266, -0.030905021, -0.0...
62        human  [-0.032478448, 0.032909617, -0.059994373, -0.0...
30        human  [-0.053434443, 0.042813323, -0.07148988, 0.072...
45        human  [-0.031213911, -0.0077929166, -0.050019678, 0....
2         human  [-0.023581663, 0.03130884, -0.06990072, -0.013...
