In [6]:
!pip install catboost
!pip install openpyxl

Collecting openpyxl
  Downloading openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting et-xmlfile (from openpyxl)
  Downloading et_xmlfile-2.0.0-py3-none-any.whl.metadata (2.7 kB)
Downloading openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m250.9/250.9 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading et_xmlfile-2.0.0-py3-none-any.whl (18 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-2.0.0 openpyxl-3.1.5


In [11]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from catboost import CatBoostRegressor

# === File paths for each test case ===
file_paths = {
    "Whole Code Block": "/content/code_with_error_embeddings_with_marks.xlsx",
    "Chunked Code Block": "/content/chunked_code_embeddings_with_marks.xlsx",
    "Logic Chunked": "/content/Chunked_Embedding_Python_code(logic).xlsx"
}

# === Dictionary to store results ===
results = {}

# === Loop through each test case ===
for test_name, file_path in file_paths.items():
    print(f"\n=== Running: {test_name} ===")

    # Check if file exists
    if not os.path.exists(file_path):
        print(f"❌ File not found: {file_path}")
        continue

    try:
        # Load dataset (fast engine, limit to numeric data)
        df = pd.read_excel(file_path, engine='openpyxl')

        # Keep only numeric columns (drop raw code strings etc.)
        df_numeric = df.select_dtypes(include=[np.number])

        if df_numeric.shape[1] < 2:
            print(f"❌ Error: Not enough numeric columns in {test_name} to train a model.")
            continue

        # Split into features and target
        X = df_numeric.iloc[:, :-1].values
        y = df_numeric.iloc[:, -1].values

        # Train-test split
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # Initialize and train CatBoost model
        model = CatBoostRegressor(iterations=100, learning_rate=0.1, verbose=0, random_seed=42)
        model.fit(X_train, y_train)

        # Predict and evaluate
        y_pred = model.predict(X_test)
        r2 = r2_score(y_test, y_pred)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))

        # Store results
        results[test_name] = {"R2": r2, "RMSE": rmse}
        print(f"{test_name} → R2: {r2:.4f}, RMSE: {rmse:.4f}")

    except Exception as e:
        print(f"❌ Error processing {test_name}: {e}")

# === Final Summary ===
print("\n=== Final Evaluation Summary ===")
for name, metrics in results.items():
    print(f"{name}: R2 = {metrics['R2']:.4f}, RMSE = {metrics['RMSE']:.4f}")



=== Running: Whole Code Block ===
Whole Code Block → R2: -0.0965, RMSE: 2.0728

=== Running: Chunked Code Block ===
Chunked Code Block → R2: 0.3341, RMSE: 1.6081

=== Running: Logic Chunked ===
Logic Chunked → R2: 0.9987, RMSE: 0.0043

=== Final Evaluation Summary ===
Whole Code Block: R2 = -0.0965, RMSE = 2.0728
Chunked Code Block: R2 = 0.3341, RMSE = 1.6081
Logic Chunked: R2 = 0.9987, RMSE = 0.0043
