<a href="https://colab.research.google.com/github/UmeshGayashan/Software_Marks_Giving/blob/main/Properly_Submission_Files.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 1. Mount Google Drive

In [1]:
import os
from glob import glob
from bs4 import BeautifulSoup

from google.colab import drive
drive.mount('/content/drive')

DATASET_PATH = '/content/drive/MyDrive/SS_Dataset'

Mounted at /content/drive


## 2. Extract problem statements from Problems/*.html

In [2]:
def extract_problem_text(html_file):
    with open(html_file, 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f, 'html.parser')
        text = soup.get_text(separator="\n", strip=True)
        # Extract first 150 words
        words = text.split()
        return ' '.join(words[:150])

problem_texts = {}  # {problem_id: first150words}
problems_dir = os.path.join(DATASET_PATH, "Problems")
html_files = glob(os.path.join(problems_dir, "p*.html"))

for html_file in html_files:
    problem_id = os.path.splitext(os.path.basename(html_file))[0]   # 'p00000'
    problem_texts[problem_id] = extract_problem_text(html_file)

print(f"Extracted {len(problem_texts)} problem statements.")

Extracted 10 problem statements.


## 3. Gather up to 3 .c files for every problem

In [3]:
samples_per_problem = 3
all_samples = {}  # {problem_id: [file1, file2, file3] }

problem_dirs = [d for d in os.listdir(DATASET_PATH) if d.startswith('p') and os.path.isdir(os.path.join(DATASET_PATH, d))]
for pid in problem_dirs:
    c_folder = os.path.join(DATASET_PATH, pid, 'C')
    if os.path.isdir(c_folder):
        c_files = sorted(glob(os.path.join(c_folder, '*.c')))
        all_samples[pid] = c_files[:samples_per_problem]

# Print the samples with the first 150 words of the problem
for pid in sorted(all_samples.keys()):
    print(f"\nProblem: {pid}")
    # Print first 150 words (if available)
    problem_preview = problem_texts.get(pid, "[No statement found]")
    print(f"Statement (first 150 words):\n{problem_preview}\n")
    for i, fpath in enumerate(all_samples[pid], 1):
        print(f"  Sample {i}: {fpath}")



Problem: p00000
Statement (first 150 words):
QQ Write a program which prints multiplication tables in the following format: 1x1=1 1x2=2 . . 9x8=72 9x9=81 Input No input. Output 1x1=1 1x2=2 . . 9x8=72 9x9=81 Template for C #include<stdio.h> int main(){ return 0; } Template for C++ #include<iostream> using namespace std; int main(){ return 0; } Template for Java class Main{ public static void main(String[] a){ } }

  Sample 1: /content/drive/MyDrive/SS_Dataset/p00000/C/s000369988.c
  Sample 2: /content/drive/MyDrive/SS_Dataset/p00000/C/s000552118.c
  Sample 3: /content/drive/MyDrive/SS_Dataset/p00000/C/s000899239.c

Problem: p00001
Statement (first 150 words):
List of Top 3 Hills There is a data which provides heights (in meter) of mountains. The data is only for ten mountains. Write a program which prints heights of the top three mountains in descending order. Input Height of mountain 1 Height of mountain 2 Height of mountain 3 . . Height of mountain 10 Constraints 0 ≤ height of mounta

In [4]:
import os
import pandas as pd
from glob import glob
from bs4 import BeautifulSoup # HTML parsing
from tqdm import tqdm # Progress bars

from transformers import AutoTokenizer, AutoModel # Embedding models (CodeBERT) - To get meaningful vector representations of code and text using pre-trained machine learning models.
import torch
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error


In [5]:
samples = []
MAX_SAMPLES_PER_PROBLEM = 50
for pid in problem_dirs:
    c_folder = os.path.join(DATASET_PATH, pid, 'C')
    if os.path.isdir(c_folder):
        c_files = sorted(glob(os.path.join(c_folder, '*.c')))
        for c_file in c_files[:MAX_SAMPLES_PER_PROBLEM]:
            with open(c_file, 'r', encoding='utf-8', errors='ignore') as f:
                code = f.read()
            samples.append({
                'problem_id': pid,
                'problem_text': problem_texts.get(pid, ""),
                'code': code,
                'code_path': c_file
            })

import pandas as pd
df = pd.DataFrame(samples)
print(df.head())


  problem_id                                       problem_text  \
0     p00001  List of Top 3 Hills There is a data which prov...   
1     p00001  List of Top 3 Hills There is a data which prov...   
2     p00001  List of Top 3 Hills There is a data which prov...   
3     p00001  List of Top 3 Hills There is a data which prov...   
4     p00001  List of Top 3 Hills There is a data which prov...   

                                                code  \
0  #include <stdio.h>\nint main(void)\n{\n\tint a...   
1  #include <stdio.h>\nint main(void){\n    // He...   
2  #include <stdio.h>\n#define N 10\nvoid sort(in...   
3  i,x[4];c(int*a){a=*1[&a]-*a;}main(j){for(;j--+...   
4  C(int*a){return*a-*1[&a];}h[11];main(i){for(;i...   

                                           code_path  
0  /content/drive/MyDrive/SS_Dataset/p00001/C/s00...  
1  /content/drive/MyDrive/SS_Dataset/p00001/C/s00...  
2  /content/drive/MyDrive/SS_Dataset/p00001/C/s00...  
3  /content/drive/MyDrive/SS_Dataset/p00

In [6]:
# Load CodeBERT
tok = AutoTokenizer.from_pretrained("microsoft/codebert-base")
model = AutoModel.from_pretrained("microsoft/codebert-base")

def get_embedding(text, max_length=256):
    # Handles text truncation and cuda if available
    inputs = tok(text, return_tensors='pt', truncation=True, max_length=max_length)
    with torch.no_grad():
        outputs = model(**inputs)
        return outputs.last_hidden_state.mean(dim=1).cpu().numpy().flatten()

# Compute embeddings, save as new columns
df['problem_emb'] = None
df['code_emb'] = None

for i, row in tqdm(df.iterrows(), total=len(df)):
    df.at[i, 'problem_emb'] = get_embedding(row['problem_text'])
    df.at[i, 'code_emb'] = get_embedding(row['code'])

# Merge (concatenate) embeddings for model input
import numpy as np
X = np.vstack([np.concatenate([row['problem_emb'], row['code_emb']]) for _, row in df.iterrows()])


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/498 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

  return forward_call(*args, **kwargs)


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

100%|██████████| 450/450 [12:10<00:00,  1.62s/it]


# Import Needed Libraries

In [13]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
import joblib


# Generate Random "Good" Marks (14–15)

In [14]:
# For reproducibility
np.random.seed(42)

# Assign a random mark between 14 (inclusive) and 15 (exclusive) to each answer
df['mark'] = np.random.uniform(14, 15, size=len(df))


# Prepare Input Features and Target

In [15]:
# X = np.vstack([np.concatenate([row['problem_emb'], row['code_emb']]) for _, row in df.iterrows()])
y = df['mark'].values


# Train/Test Split

In [16]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)


# Train the RandomForest Model

In [17]:
rf = RandomForestRegressor(n_estimators=100, random_state=1)
rf.fit(X_train, y_train)


# Export the Model for Download

In [18]:
# Save the trained model to your Google Drive for download
model_path = '/content/drive/MyDrive/SS_Dataset/marks_model_rf.joblib'
joblib.dump(rf, model_path)
print(f"Model saved at: {model_path}")

Model saved at: /content/drive/MyDrive/SS_Dataset/marks_model_rf.joblib
