In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [None]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_error

# Excel workbooks to evaluate, keyed by a short dataset identifier
file_paths = dict(
    error_embeddings="avg_mark_code_with_error_embeddings.xlsx",
    chunked_embeddings="avg_mark_chunked_code_embeddings.xlsx",
    chunked_logic="avg_Chunked_Embedding_Python_code(logic).xlsx",
)

# Train CatBoost on one dataset and score it on the held-out leading rows
def process_dataset(file_path, name, top_n=5):
    """Fit a CatBoost regressor on one Excel dataset and score the leading rows.

    The first ``top_n`` rows are held out as the test set and every remaining
    row is used for training. Only numeric columns (other than the target
    ``"Average Total (Rounded)"``) are used as features.

    Parameters
    ----------
    file_path : str
        Path of the workbook passed to ``pd.read_excel``.
    name : str
        Label written to the ``Dataset`` column of the result.
    top_n : int, default 5
        Number of leading rows to hold out for testing.

    Returns
    -------
    pd.DataFrame
        One row per held-out sample with the columns ``Dataset``, ``Index``,
        ``Y_Actual``, ``Y_Predicted`` and ``Absolute_Error``.

    Raises
    ------
    ValueError
        If ``top_n`` is not positive or leaves no rows to train on.
    KeyError
        If the target column is missing from the workbook.
    """
    df = pd.read_excel(file_path)

    # Define target and features
    target = "Average Total (Rounded)"
    y = df[target]
    X = df.drop(columns=[target]).select_dtypes(include=np.number)

    # Fail early with a clear message instead of fitting on an empty frame
    if top_n <= 0 or top_n >= len(df):
        raise ValueError(
            f"top_n must be between 1 and {len(df) - 1} for a dataset "
            f"with {len(df)} rows, got {top_n}"
        )

    # Positional split: leading rows are the test set, the rest is training data
    X_train, y_train = X.iloc[top_n:], y.iloc[top_n:]
    X_test, y_test = X.iloc[:top_n], y.iloc[:top_n]

    # Train CatBoost model (fixed seed for repeatable fits)
    model = CatBoostRegressor(verbose=0, random_state=42)
    model.fit(X_train, y_train)

    # Predict
    y_pred = model.predict(X_test)

    # Results
    return pd.DataFrame({
        "Dataset": name,
        "Index": X_test.index,
        "Y_Actual": y_test.values,
        "Y_Predicted": y_pred,
        "Absolute_Error": np.abs(y_test.values - y_pred),
    })

# Score every dataset and stack the per-sample results into one table
# with a fresh 0..n-1 index
final_results = pd.concat(
    [
        process_dataset(file_paths[key], label)
        for key, label in (
            ("error_embeddings", "Error Embeddings"),
            ("chunked_embeddings", "Chunked Embeddings"),
            ("chunked_logic", "Chunked Logic"),
        )
    ],
    ignore_index=True,
)

# Show results
print(final_results)

# Optional: mean absolute error per dataset
summary = (
    final_results
    .groupby("Dataset")["Absolute_Error"]
    .mean()
    .reset_index()
    .rename(columns={"Absolute_Error": "Mean_Absolute_Error"})
)
print("\nSummary (Lower Error is Better):")
print(summary)


               Dataset  Index  Y_Actual  Y_Predicted  Absolute_Error
0     Error Embeddings      0         8     6.651331        1.348669
1     Error Embeddings      1         8     6.568666        1.431334
2     Error Embeddings      2         8     6.840986        1.159014
3     Error Embeddings      3         8     7.066110        0.933890
4     Error Embeddings      4         8     7.428260        0.571740
5   Chunked Embeddings      0         8     7.290345        0.709655
6   Chunked Embeddings      1         8     7.120361        0.879639
7   Chunked Embeddings      2         8     7.896743        0.103257
8   Chunked Embeddings      3         8     6.974581        1.025419
9   Chunked Embeddings      4         8     7.626843        0.373157
10       Chunked Logic      0         8     7.336868        0.663132
11       Chunked Logic      1         8     7.095735        0.904265
12       Chunked Logic      2         8     7.866401        0.133599
13       Chunked Logic      3     

In [None]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor
from sklearn.model_selection import GridSearchCV

# Excel workbooks to compare, keyed by the label used for each prediction column
file_paths = dict([
    ("Error Embeddings", "avg_mark_code_with_error_embeddings.xlsx"),
    ("Chunked Embeddings", "avg_mark_chunked_code_embeddings.xlsx"),
    ("Chunked Logic", "avg_Chunked_Embedding_Python_code(logic).xlsx"),
])

# Held-out predictions, filled in by train_and_predict as {row index: {label: prediction}}
prediction_dict = dict()
# Actual marks of the held-out rows; set from the first dataset that is processed
actual_marks = None

# Tune and fit CatBoost on one dataset, recording its held-out predictions
def train_and_predict(file_path, label, top_n=5):
    """Grid-search a CatBoost regressor on one dataset and record test predictions.

    The first ``top_n`` rows are held out; the remaining rows are used for a
    3-fold ``GridSearchCV`` over ``depth``, ``learning_rate`` and
    ``iterations`` scored by negative mean absolute error. The best estimator
    predicts the held-out rows and each prediction is stored in the
    module-level ``prediction_dict`` as ``prediction_dict[row_index][label]``.

    Parameters
    ----------
    file_path : str
        Path of the workbook passed to ``pd.read_excel``.
    label : str
        Key under which this dataset's predictions are stored.
    top_n : int, default 5
        Number of leading rows to hold out for testing.

    Returns
    -------
    pd.Series
        Actual target values of the held-out rows.

    Raises
    ------
    ValueError
        If ``top_n`` is not positive or leaves no rows to train on.
    KeyError
        If the target column is missing from the workbook.
    """
    df = pd.read_excel(file_path)

    target = "Average Total (Rounded)"
    y = df[target]
    # Features: every numeric column except the target
    X = df.drop(columns=[target]).select_dtypes(include=np.number)

    # Fail early with a clear message instead of searching on an empty frame
    if top_n <= 0 or top_n >= len(df):
        raise ValueError(
            f"top_n must be between 1 and {len(df) - 1} for a dataset "
            f"with {len(df)} rows, got {top_n}"
        )

    # Positional split: leading rows are the test set, the rest is training data
    X_train, y_train = X.iloc[top_n:], y.iloc[top_n:]
    X_test, y_test = X.iloc[:top_n], y.iloc[:top_n]

    model = CatBoostRegressor(verbose=0, random_state=42)
    param_grid = {
        'depth': [4, 6],
        'learning_rate': [0.05, 0.1],
        'iterations': [50, 100]
    }

    grid = GridSearchCV(model, param_grid, cv=3, scoring='neg_mean_absolute_error', n_jobs=-1)
    grid.fit(X_train, y_train)
    best_model = grid.best_estimator_

    y_pred = best_model.predict(X_test)

    # Side effect: accumulate predictions across datasets, keyed by row index then label
    for idx, pred in zip(X_test.index, y_pred):
        prediction_dict.setdefault(idx, {})[label] = pred

    return y_test

# Run for all files and store predictions
for label, path in file_paths.items():
    actual = train_and_predict(path, label)
    if actual_marks is None:
        # NOTE(review): actual marks come from the first dataset only; this
        # assumes every workbook lists the same rows in the same order — confirm
        actual_marks = actual

# Combine into a final comparison table (one row per held-out sample,
# one column per dataset label)
comparison_df = pd.DataFrame.from_dict(prediction_dict, orient="index")

# Remember which columns hold model predictions BEFORE adding Y_Actual, so the
# spread below measures disagreement between models only. Taking max/min over
# every column would mix the actual mark into the spread.
prediction_columns = list(comparison_df.columns)
comparison_df["Y_Actual"] = actual_marks
comparison_df["Prediction_Variation"] = (
    comparison_df[prediction_columns].max(axis=1)
    - comparison_df[prediction_columns].min(axis=1)
)

# Sort by variation in predictions
sorted_by_variation = comparison_df.sort_values(by="Prediction_Variation", ascending=False)

# Pick top 3 variation rows + high, low, mid Y_actual
top_variation_rows = sorted_by_variation.head(3)
high_score_row = comparison_df[comparison_df["Y_Actual"] == comparison_df["Y_Actual"].max()].head(1)
low_score_row = comparison_df[comparison_df["Y_Actual"] == comparison_df["Y_Actual"].min()].head(1)
# Row whose actual mark is closest to the mean; idxmin returns an index LABEL,
# so it is safe to use with .loc (argsort returns positions, which are not)
mid_score_label = (comparison_df["Y_Actual"] - comparison_df["Y_Actual"].mean()).abs().idxmin()
mid_score_row = comparison_df.loc[[mid_score_label]]

# Combine unique rows: de-duplicate by row label so that two different samples
# with identical values are both kept, while a sample picked twice appears once
final_rows = pd.concat([top_variation_rows, high_score_row, low_score_row, mid_score_row])
final_rows = final_rows[~final_rows.index.duplicated(keep="first")]

# Final table for analysis
print("\n🔍 Selected Rows for Analysis (Top Variation + High/Low/Moderate Marks):")
print(final_rows)





🔍 Selected Rows for Analysis (Top Variation + High/Low/Moderate Marks):
   Error Embeddings  Chunked Embeddings  Chunked Logic  Y_Actual  \
1          6.980912            7.096906       7.068449         8   
0          7.027394            7.061922       7.055458         8   
2          7.035543            7.348223       7.232830         8   

   Prediction_Variation  
1              1.019088  
0              0.972606  
2              0.964457  


In [None]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor

# Excel workbooks to compare, keyed by the label used for each prediction column
file_paths = {
    label: workbook
    for label, workbook in (
        ("Error Embeddings", "avg_mark_code_with_error_embeddings.xlsx"),
        ("Chunked Embeddings", "avg_mark_chunked_code_embeddings.xlsx"),
        ("Chunked Logic", "avg_Chunked_Embedding_Python_code(logic).xlsx"),
    )
}

# Held-out predictions, filled in by run_fast_model as {row index: {label: prediction}}
prediction_dict = dict()
# Actual marks of the held-out rows; set from the first dataset that is processed
actual_marks = None

# Fast CatBoost run per dataset (fixed small configuration, no search)
def run_fast_model(file_path, label, top_n=5):
    """Fit a small fixed-configuration CatBoost model and record test predictions.

    The first ``top_n`` rows are held out; the remaining rows train a
    ``CatBoostRegressor`` with ``depth=6``, ``learning_rate=0.1`` and
    ``iterations=50``. Each held-out prediction is stored in the module-level
    ``prediction_dict`` as ``prediction_dict[row_index][label]``.

    Parameters
    ----------
    file_path : str
        Path of the workbook passed to ``pd.read_excel``.
    label : str
        Key under which this dataset's predictions are stored.
    top_n : int, default 5
        Number of leading rows to hold out for testing.

    Returns
    -------
    pd.Series
        Actual target values of the held-out rows.

    Raises
    ------
    ValueError
        If ``top_n`` is not positive or leaves no rows to train on.
    KeyError
        If the target column is missing from the workbook.
    """
    df = pd.read_excel(file_path)

    target = "Average Total (Rounded)"
    y = df[target]
    # Features: every numeric column except the target
    X = df.drop(columns=[target]).select_dtypes(include=np.number)

    # Fail early with a clear message instead of fitting on an empty frame
    if top_n <= 0 or top_n >= len(df):
        raise ValueError(
            f"top_n must be between 1 and {len(df) - 1} for a dataset "
            f"with {len(df)} rows, got {top_n}"
        )

    # Positional split: leading rows are the test set, the rest is training data
    X_train, y_train = X.iloc[top_n:], y.iloc[top_n:]
    X_test, y_test = X.iloc[:top_n], y.iloc[:top_n]

    model = CatBoostRegressor(
        depth=6,
        learning_rate=0.1,
        iterations=50,
        verbose=0,
        random_state=42
    )
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Side effect: accumulate predictions across datasets, keyed by row index then label
    for idx, pred in zip(X_test.index, y_pred):
        prediction_dict.setdefault(idx, {})[label] = pred

    return y_test

# Run all datasets
for label, path in file_paths.items():
    actual = run_fast_model(path, label)
    if actual_marks is None:
        # NOTE(review): actual marks come from the first dataset only; this
        # assumes every workbook lists the same rows in the same order — confirm
        actual_marks = actual

# Build final DataFrame (one row per held-out sample, one column per dataset label)
comparison_df = pd.DataFrame.from_dict(prediction_dict, orient="index")

# Remember which columns hold model predictions BEFORE adding Y_Actual, so the
# spread below measures disagreement between models only. Taking max/min over
# every column would mix the actual mark into the spread.
prediction_columns = list(comparison_df.columns)
comparison_df["Y_Actual"] = actual_marks
comparison_df["Prediction_Variation"] = (
    comparison_df[prediction_columns].max(axis=1)
    - comparison_df[prediction_columns].min(axis=1)
)

# Identify points of interest
top_variation = comparison_df.sort_values(by="Prediction_Variation", ascending=False).head(3)
high_actual = comparison_df[comparison_df["Y_Actual"] == comparison_df["Y_Actual"].max()].head(1)
low_actual = comparison_df[comparison_df["Y_Actual"] == comparison_df["Y_Actual"].min()].head(1)
# Row whose actual mark is closest to the mean; idxmin returns an index LABEL,
# so it is safe to use with .loc (argsort returns positions, which are not)
mid_actual_label = (comparison_df["Y_Actual"] - comparison_df["Y_Actual"].mean()).abs().idxmin()
mid_actual = comparison_df.loc[[mid_actual_label]]

# Combine all selected rows: de-duplicate by row label so that two different
# samples with identical values are both kept, while a sample picked twice appears once
final_selection = pd.concat([top_variation, high_actual, low_actual, mid_actual])
final_selection = final_selection[~final_selection.index.duplicated(keep="first")]
print("\n🔍 Selected Points (Top Variations + High/Low/Mid Actual):")
print(final_selection)



🔍 Selected Points (Top Variations + High/Low/Mid Actual):
   Error Embeddings  Chunked Embeddings  Chunked Logic  Y_Actual  \
2          6.930458            7.270196       7.241787         8   
0          6.992375            7.124624       7.134850         8   
1          6.992922            7.124464       7.121782         8   

   Prediction_Variation  
2              1.069542  
0              1.007625  
1              1.007078  


In [None]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor
from sklearn.preprocessing import StandardScaler

# Excel workbooks to compare, keyed by the label used for each prediction column
file_paths = dict(zip(
    ("Error Embeddings", "Chunked Embeddings", "Chunked Logic"),
    (
        "avg_mark_code_with_error_embeddings.xlsx",
        "avg_mark_chunked_code_embeddings.xlsx",
        "avg_Chunked_Embedding_Python_code(logic).xlsx",
    ),
))

# Function to train a model and get predictions on first N samples
def get_catboost_predictions(file_path, model_name, top_n=30):
    """Fit CatBoost on one dataset and predict its first ``top_n`` rows.

    The first ``top_n`` rows are held out; the remaining rows are used for
    training. Features are the numeric columns other than the target
    ``"Average Total (Rounded)"`` and are standardised with a
    ``StandardScaler`` fitted on the training rows only, so no statistics of
    the held-out rows reach the preprocessing step.

    Parameters
    ----------
    file_path : str
        Path of the workbook passed to ``pd.read_excel``.
    model_name : str
        Name of the prediction column in the returned frame.
    top_n : int, default 30
        Number of leading rows to hold out for testing.

    Returns
    -------
    pd.DataFrame
        One row per held-out sample with the columns ``Index``, ``Y_Actual``
        and ``model_name``.

    Raises
    ------
    ValueError
        If ``top_n`` is not positive or leaves no rows to train on.
    KeyError
        If the target column is missing from the workbook.
    """
    df = pd.read_excel(file_path)
    target_col = "Average Total (Rounded)"

    y = df[target_col]
    X = df.drop(columns=[target_col]).select_dtypes(include=np.number)

    # Fail early with a clear message instead of fitting on an empty frame
    if top_n <= 0 or top_n >= len(df):
        raise ValueError(
            f"top_n must be between 1 and {len(df) - 1} for a dataset "
            f"with {len(df)} rows, got {top_n}"
        )

    # Split first (explicitly by position), then scale
    X_train, X_test = X.iloc[top_n:], X.iloc[:top_n]
    y_train, y_test = y.iloc[top_n:], y.iloc[:top_n]

    # Fit the scaler on training rows only and reuse it for the test rows;
    # fitting on all rows would leak held-out statistics into preprocessing
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    model = CatBoostRegressor(verbose=0, random_state=42)
    model.fit(X_train_scaled, y_train)

    preds = model.predict(X_test_scaled)

    return pd.DataFrame({
        "Index": df.index[:top_n],
        "Y_Actual": y_test.values,
        model_name: preds
    })

# Score the first 30 rows of every dataset; one result frame per label
results = [
    get_catboost_predictions(path, name, top_n=30)
    for name, path in file_paths.items()
]

# Join the per-dataset frames side by side on the shared sample keys
merged_df = results[0]
for df in results[1:]:
    merged_df = merged_df.merge(df, on=["Index", "Y_Actual"])

# Spread between the highest and lowest model prediction for each sample
model_columns = ["Error Embeddings", "Chunked Embeddings", "Chunked Logic"]
model_predictions = merged_df[model_columns]
merged_df["Prediction_Variation"] = (
    model_predictions.max(axis=1) - model_predictions.min(axis=1)
)

# Pick one sample each: widest spread, highest mark, lowest mark, mark nearest the median
top_var = merged_df.sort_values("Prediction_Variation", ascending=False).iloc[:1]
high = merged_df.sort_values("Y_Actual", ascending=False).iloc[:1]
low = merged_df.sort_values("Y_Actual", ascending=True).iloc[:1]
median_actual = merged_df["Y_Actual"].median()
distance_from_median = (merged_df["Y_Actual"] - median_actual).abs()
mid = merged_df.iloc[distance_from_median.argsort()[:1]]

# Stack the picks, drop samples chosen more than once, renumber the rows
selected_samples = pd.concat([top_var, high, low, mid])
selected_samples = selected_samples.drop_duplicates()
selected_samples = selected_samples.reset_index(drop=True)

# Display selected samples
print("📌 Selected Points (High/Low/Mid Actual + Max Variation):")
print(selected_samples)


📌 Selected Points (High/Low/Mid Actual + Max Variation):
   Index  Y_Actual  Error Embeddings  Chunked Embeddings  Chunked Logic  \
0     17         7          7.204461            6.415261       6.355226   
1      0         8          7.155315            7.180914       7.169314   
2      7         6          6.941093            6.508374       6.421728   
3      5         7          7.237700            7.425139       7.514281   

   Prediction_Variation  
0              0.849236  
1              0.025599  
2              0.519365  
3              0.276581  
