In [102]:
from sentence_transformers import SentenceTransformer
import numpy as np
import pandas as pd
import random
import json
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error
from warnings import filterwarnings
# filterwarnings('ignore')

In [63]:
# Load data
train = json.load(open("data/train_data.json"))
test = json.load(open("data/test_data.json"))
metric_embs = np.load("data/metric_name_embeddings.npy")
metric_map = json.load(open("data/metric_names.json"))

In [None]:
# Model for embeddings
model = SentenceTransformer("l3cube-pune/indic-sentence-similarity-sbert")

# Prepare training data
X, y = [], []
for r in train:
    txt = f"{r['system_prompt']} [SEP] {r['user_prompt']} [SEP] {r['response']}"
    text_emb = model.encode(txt, normalize_embeddings=True)
    metric_emb = metric_embs[metric_map.index(r['metric_name'])]
    X.append(np.concatenate([text_emb, metric_emb]))
    y.append(r['score'])

X, y = np.array(X, dtype=np.float32), np.array(y, dtype=np.float32)

# Train-validation split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Prepare test data
X_test= []
for r in test:
    txt = f"{r['system_prompt']} [SEP] {r['user_prompt']} [SEP] {r['response']}"
    text_emb = model.encode(txt, normalize_embeddings=True)
    metric_emb = metric_embs[metric_map.index(r['metric_name'])]
    X_test.append(np.concatenate([text_emb, metric_emb]))

X_test = np.array(X_test, dtype=np.float32)

In [89]:
def degrade_response(resp):
    if random.random() < 0.5:
        return " ".join(resp.split()[:max(1, len(resp.split())//5)])
    elif random.random() > 0.5:
        words = resp.split()
        random.shuffle(words)
        return " ".join(words)
    else:
        return "I don't know."

augmented = []
for r in train:
    if random.random() < 0.3:  # 30% augmentation
        new_r = r.copy()
        if r['response'] is not None: 
            new_r['response'] = degrade_response(r['response']) 
            new_r['score'] = random.uniform(0.5, 2.0)
        augmented.append(new_r)

train_aug = train + augmented

In [None]:
# Model for embeddings
model = SentenceTransformer("l3cube-pune/indic-sentence-similarity-sbert", device='cpu')

def encode_in_batches(texts, batch_size=16):
    embeddings = []
    for i in tqdm(range(0, len(texts), batch_size)):
        batch = texts[i:i+batch_size]
        emb = model.encode(batch, normalize_embeddings=True, convert_to_numpy=True, show_progress_bar=True)
        embeddings.append(emb)
    return np.vstack(embeddings)

# Prepare training data

texts = [f"{r['system_prompt']} [SEP] {r['user_prompt']} [SEP] {r['response']}" for r in train_aug]
text_embs = encode_in_batches(texts, batch_size=16)

metric_embs_mapped = np.array([metric_embs[metric_map.index(r['metric_name'])] for r in train_aug])
X = np.concatenate([text_embs, metric_embs_mapped], axis=1)
y = np.array([r['score'] for r in train_aug])

X, y = np.array(X, dtype=np.float32), np.array(y, dtype=np.float32)

# Train-validation split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Prepare test data
X_test= []
for r in test:
    txt = f"{r['system_prompt']} [SEP] {r['user_prompt']} [SEP] {r['response']}"
    text_emb = model.encode(txt, normalize_embeddings=True)
    metric_emb = metric_embs[metric_map.index(r['metric_name'])]
    X_test.append(np.concatenate([text_emb, metric_emb]))

X_test = np.array(X_test, dtype=np.float32)

Batches: 100%|██████████| 1/1 [00:07<00:00,  7.52s/it]
Batches: 100%|██████████| 1/1 [00:05<00:00,  5.96s/it]
Batches: 100%|██████████| 1/1 [00:05<00:00,  5.89s/it]
Batches: 100%|██████████| 1/1 [00:05<00:00,  5.88s/it]
Batches: 100%|██████████| 1/1 [00:05<00:00,  5.93s/it]
Batches: 100%|██████████| 1/1 [00:05<00:00,  5.95s/it]
Batches: 100%|██████████| 1/1 [00:05<00:00,  5.93s/it]
Batches: 100%|██████████| 1/1 [00:06<00:00,  6.09s/it]
Batches: 100%|██████████| 1/1 [00:05<00:00,  5.94s/it]
Batches: 100%|██████████| 1/1 [00:05<00:00,  5.97s/it]
Batches: 100%|██████████| 1/1 [00:06<00:00,  6.18s/it]
Batches: 100%|██████████| 1/1 [00:05<00:00,  5.25s/it]
Batches: 100%|██████████| 1/1 [00:05<00:00,  5.89s/it]
Batches: 100%|██████████| 1/1 [00:06<00:00,  6.07s/it]
Batches: 100%|██████████| 1/1 [00:05<00:00,  5.94s/it]
Batches: 100%|██████████| 1/1 [00:06<00:00,  6.08s/it]
Batches: 100%|██████████| 1/1 [00:05<00:00,  5.93s/it]
Batches: 100%|██████████| 1/1 [00:05<00:00,  5.90s/it]
Batches: 1

In [43]:
# Compute bin-based sample weights
bins = np.linspace(0, 10, 21)  # 0-0.5,0.5-1,...,10
bin_idx = np.digitize(y_train, bins) - 1
counts = np.bincount(bin_idx, minlength=len(bins))
weights = 1.0 / (counts[bin_idx] + 1e-6)
weights /= weights.mean()  # normalize mean weight to 1.0

In [44]:
from sklearn.linear_model import Ridge

# Train Ridge regressor
reg = Ridge(alpha=1.0)
reg.fit(X_train, y_train, sample_weight=weights)
pred = reg.predict(X_val)
print("RMSE:", root_mean_squared_error(y_val, pred))

RMSE: 2.190683126449585


In [None]:
from sklearn.linear_model import HuberRegressor

# Train Huber regressor
reg = HuberRegressor(alpha=1e-3, epsilon=1.35, max_iter=1000)
reg.fit(X_train, y_train)
pred = reg.predict(X_val)
print("RMSE:", root_mean_squared_error(y_val, pred))

RMSE: 0.9291471011207187


In [45]:
from sklearn.isotonic import IsotonicRegression

# Calibrate using Isotonic Regression
pred_val = reg.predict(X_val)
iso = IsotonicRegression(out_of_bounds='clip')
iso.fit(pred_val, y_val)
calibrated_val = iso.transform(pred_val)
print("Calibrated RMSE:", root_mean_squared_error(y_val, calibrated_val))

Calibrated RMSE: 0.9182661175727844


## Final CSV for Submission

In [None]:
pred_test = reg.predict(X_test)
pred_test_cal = iso.transform(pred_test)

# Save predictions in csv
df = pd.DataFrame({'ID': [i+1 for i in range(len(test))], 'score': pred_test_cal})
df.to_csv("me22b214.csv", index=False)