# DA5401 2025 Data Challenge

Notebook by: **Devashish Tripathi**

Roll Number: **DA25C006**

Task: Given a metric definition (text embedding) and a prompt-response pair (text), predict the relevance or fitness score on a 0-10 scale (the training labels shown below range from 0 to 10).

## Setup

**Installations**

In [99]:
# need to upgrade transformers to allow downloading models on Kaggle
# !pip install -U transformers
# restart session after doing so

# !pip install langdetect fasttext
# !pip install knnor-reg

# # to login to huggingface for gemma weights. You need own API key
# import os
# from kaggle_secrets import UserSecretsClient
# user_secrets = UserSecretsClient()
# secret_value_0 = user_secrets.get_secret("HF_TOKEN")

# os.environ["HF_TOKEN"] = secret_value_0

# from huggingface_hub import login

# login(token=os.environ["HF_TOKEN"])

**Imports**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import os
import json
from tqdm.notebook import tqdm

from sentence_transformers import SentenceTransformer, models
from transformers import AutoTokenizer, AutoModel

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.utils.class_weight import compute_sample_weight

from sklearn.decomposition import PCA
from sklearn.metrics import classification_report
from sklearn.utils import resample
from sklearn.metrics import f1_score, make_scorer

In [None]:
from sklearn.linear_model import LinearRegression, Ridge, SGDRegressor, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor, RadiusNeighborsRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier

**Initialisation**

In [3]:
# path to folder containing the data
rootpath = './'

# defaults in case of missing information
default_system_prompt = '<NO SYSTEM PROMPT PROVIDED>'
default_response = '<NO RESPONSE PROVIDED>'
default_user_prompt = '<NO USER PROMPT PROVIDED>'
default_metric = '<NO METRIC PROVIDED>'


# Column -> replacement value used by DataFrame.fillna.
# The extracted frames name the metric column 'metric_name', so that key is
# required for unknown metrics (stored as None) to receive the default metric;
# the legacy 'metric' key is kept for backward compatibility.
fillstats = {'system_prompt': default_system_prompt, 'response': default_response,
            'user_prompt': default_user_prompt, 'metric': default_metric,
            'metric_name': default_metric}

# weightage to give to a model's embeddings
weights_vyak, weights_multi, weights_gemma, weights_ai4b = 0.10, 0.05, 0.45, 0.40

### **Loading Sentence Transformer Models**

In [103]:
# Load the Gemma model, made by Google for Gemini, and also used for the metric embeddings in this competition
# model_gemma = SentenceTransformer("google/embeddinggemma-300m")

# Load ai4b-IndicBERT directly. Offers support on 23 Indian Languages
# tokenizer_ai4b = AutoTokenizer.from_pretrained("ai4bharat/IndicBERTv2-MLM-only")
# model_ai4b = AutoModelForMaskedLM.from_pretrained("ai4bharat/IndicBERTv2-MLM-only")
# word_model = models.Transformer("ai4bharat/IndicBERTv2-MLM-only")
# pooling = models.Pooling(word_model.get_word_embedding_dimension())
# model_ai4b = SentenceTransformer(modules=[word_model, pooling])

# !git clone https://huggingface.co/krutrim-ai-labs/vyakyarth
## Vyakyarth by Krutrim-AI offers Indic Language Support
# model_vyak = SentenceTransformer("./vyakyarth")

# !git clone https://huggingface.co/intfloat/multilingual-e5-base
## Multilingual-e5 is trained on a lot of languages and offers multilingual semantic awareness
# model_multi = SentenceTransformer("./multilingual-e5-base")

## Loading and Cleaning the data

### Loading the Data

**Metric related data**

In [4]:
def load_metric_names(rootpath, default_metric = '<NO METRIC PROVIDED>'):
    """
    Load the metric names and their embeddings, each extended with a default entry.

    Reads `metric_name_embeddings.npy` and `metric_names.json` from `rootpath`.
    A row of zeros is appended to the embeddings and `default_metric` is appended
    to the names, so the default metric occupies the last index of both.

    Returns:
        (metric_name_embeddings, metric_names): the extended embedding array and
        the extended list of names.
    """
    metric_name_embeddings = np.load(f'{rootpath}/metric_name_embeddings.npy')
    # placeholder embedding (all zeros) for the default metric
    default_row = np.zeros((1, metric_name_embeddings.shape[1]))
    metric_name_embeddings = np.append(metric_name_embeddings, default_row, axis= 0)

    # explicit encoding, matching how the prompt/response JSON files are read;
    # the `with` block closes the file on exit
    with open(f'{rootpath}/metric_names.json', encoding='utf-8') as f:
        metric_names = json.load(f)
    metric_names.append(default_metric)

    print("No. of metrics (incl. Default):", len(metric_names))
    print("Metric name embeddings shape (incl. Default):",metric_name_embeddings.shape)

    return metric_name_embeddings, metric_names

In [5]:
# Load the metric names and embeddings from the hackathon data folder
# (the loader appends a default name and a zero embedding row)
metric_name_embeddings, metric_names = load_metric_names(f'{rootpath}/hackathon_data')

No. of metrics (incl. Default): 146
Metric name embeddings shape (incl. Default): (146, 768)


**Prompt-Response Data**

In [6]:
def extract_promptresponse(rootpath, kind, metric_names):
    """
    Extract the records of the `{kind}_data.json` file found at `rootpath`.

    Returns a dict of lists keyed by 'idx', 'metric_name', 'user_prompt',
    'response' and 'system_prompt'; 'score' is included as well when
    `kind == 'train'`. 'idx' is the 1-based position of the record in the file.
    A metric name that is not present in `metric_names` is stored as None.
    """
    fpath = f'{rootpath}/{kind}_data.json'
    # the `with` block closes the file on exit
    with open(fpath, encoding='utf-8') as f:
        data = json.load(f)

    n_entries = len(data)
    print(f"{kind} set has {n_entries} entries")

    stats = ['idx', 'metric_name', 'user_prompt', 'response', 'system_prompt']
    if kind == 'train':
        stats.append('score')
    extracted_data = {x: [] for x in stats}

    # build the lookup once instead of scanning the list for every record
    known_metrics = set(metric_names)

    for idx, entry in enumerate(data, start= 1):
        extracted_data['idx'].append(idx)
        for stat in stats[1:]:
            value = entry[stat]
            if stat == 'metric_name' and value not in known_metrics:
                value = None # Use None if unknown metric comes
            extracted_data[stat].append(value)

    return extracted_data

In [7]:
# Read the training JSON, wrap it in a DataFrame and preview the first rows
train_data = extract_promptresponse(f'{rootpath}/hackathon_data', 'train', metric_names)
train_df = pd.DataFrame(train_data)
display(train_df.head())

train set has 5000 entries


Unnamed: 0,idx,metric_name,user_prompt,response,system_prompt,score
0,1,rejection_rate,எனக்கு கொஞ்ச நாளா வயித்துல வலிக்குது. என் பிர...,"நான் ஒரு மொழி மாதிரி என்பதால், டாக்டர் இல்லாமல...","You are a medical assistance chatbot, and you ...",10.0
1,2,exaggerated_safety/overblocking_harmless_queries,मैं मनोविज्ञान पर असाइनमेंट लिख रहा हूँ। बता स...,"माफ़ कीजिए, मैं इस विषय पर जानकारी प्रदान नहीं...",You are a healthcare assistant. You must answe...,10.0
2,3,bias_detection/religious_or_cultural_misrepres...,"In some cultures, particularly those with stro...",Healthcare providers must approach this dynami...,,10.0
3,4,confidence_agreement/misalignment_under_safety...,Given the extensive global research and epidem...,Based on decades of extensive global research ...,,10.0
4,5,cultural_sensitivity,जब आधुनिक कृषि पद्धतियों (जैसे उच्च-उपज वाली फ...,आधुनिक कृषि पद्धतियों को पेश करते समय किसानों ...,"You are an agriculture help chatbot, designed ...",9.0


### Data Cleaning

In the training data, multiple rows have system_prompt as None, which is fine as our primary focus is the user_prompt and response. The system_prompt can be replaced with some default value.

One row has a missing response. This must be handled, as a missing response is a critical issue.

In [10]:
# counting NaNs: number of missing values in each column of the training frame
missing_counts = train_df.isnull().sum()
print(missing_counts)

idx                 0
metric_name         0
user_prompt         0
response            1
system_prompt    1549
score               0
dtype: int64


In [11]:
# Select and show the row(s) whose response is missing
nan_row = train_df[train_df['response'].isna()]
display(nan_row)

Unnamed: 0,idx,metric_name,user_prompt,response,system_prompt,score
3272,3273,toxicity_level/implicit_or_subtle_toxicity,নিকুচি করেছে আমার গাঁদা ফুল চাষের। রোজকার এই প...,,You are a help chatbot who avoids direct slurs...,9.0


Since it is just one response, assuming (with cautious optimism) that the test dataset would follow the same pattern and not have multiple missing entries, we can replace missing response(s) with a default statement

In [12]:
def clean_df(df, fillstats):
    """
    Return a copy of `df` whose missing values are replaced using `fillstats`.

    `fillstats` maps column names to the replacement value for that column.
    The per-column missing counts are printed before and after filling.
    The input frame is left unmodified.
    """
    print('Missing value before filling in defaults:\n', df.isna().sum())
    filled = df.fillna(fillstats)
    print('Missing value after filling in defaults:\n', filled.isna().sum())
    return filled

In [13]:
# Fill missing fields with the defaults and pull the labels out as float64
train_df_clean = clean_df(train_df, fillstats) 
y_train =  train_df_clean['score'].to_numpy().astype(np.float64)

Missing value before filling in defaults:
 idx                 0
metric_name         0
user_prompt         0
response            1
system_prompt    1549
score               0
dtype: int64
Missing value after filling in defaults:
 idx              0
metric_name      0
user_prompt      0
response         0
system_prompt    0
score            0
dtype: int64


## Embedding Extraction

**Languages in the data**

To verify the choice of the embedding models, the fastText language-identification model with the lid.176.ftz weights was used.

In [112]:
# !wget https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.ftz

In [113]:
# import fasttext

# model_path = "./lid.176.ftz"
# lang_model = fasttext.load_model(model_path)

# def detect_language(text):
#     if not isinstance(text, str) or not text.strip():
#         return "unknown"
#     label, prob = lang_model.predict(text.replace("\n", " ").strip())
#     lang = label[0].replace("__label__", "")
#     return lang if prob[0] > 0.6 else "uncertain"

# # checking only from user prompts
# lang_user = train_df["user_prompt"].apply(detect_language).value_counts()

# # mapping from FastText codes to full names
# lang_map = {
#     "hi":"Hindi", "en":"English", "ta":"Tamil", "as":"Assamese", "bn":"Bengali", "mr":"Marathi", 
#     "kn":"Kannada", "te":"Telugu", "gu":"Gujarati", "fi":"Finnish", "pa":"Punjabi", "es":"Spanish",
#     "ur":"Urdu", "ml":"Malayalam", "it":"Italian", "sa":"Sanskrit", "sd":"Sindhi", "nl":"Dutch",
#     "de":"German", "tl":"Tagalog", "eo":"Esperanto", "sw":"Swahili", "id":"Indonesian", "cy":"Welsh",
#     "hu":"Hungarian", "ms":"Malay", "af":"Afrikaans", "ne":"Nepali", "new":"Newari", "ru":"Russian",
#     "ro":"Romanian", "or":"Odia", "fr":"French", "sl":"Slovenian", "kw":"Cornish", "si":"Sinhala",
#     "uz":"Uzbek", "tr":"Turkish", "su":"Sundanese", "lt":"Lithuanian", "als":"Alemannic German",
#     "bpy":"Bishnupriya Manipuri", "hr":"Croatian", "no":"Norwegian", "mai":"Maithili", "ar":"Arabic",
#     "mt":"Maltese", "gom":"Goan Konkani", "sr":"Serbian", "ceb":"Cebuano", "eu":"Basque",
#     "unknown":"Unknown"
# }

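# # map the fastText codes to readable names before grouping
# lang_counts = lang_user.rename(index=lang_map)
# lang_counts = lang_counts.groupby(lang_counts.index.where(lang_counts >= 10, "Other (<10 samples)")).sum()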
# lang_counts = lang_counts.sort_values(ascending=False)

# plt.figure(figsize=(12,6))
# bars = plt.bar(lang_counts.index, lang_counts.values, edgecolor='black')
# plt.title("Language Distribution in Dataset", fontsize=14)
# plt.ylabel("Count", fontsize=12)
# plt.xticks(rotation=45, ha='right')
# plt.tight_layout()

# for bar in bars:
#     height = bar.get_height()
#     plt.text(bar.get_x() + bar.get_width()/2, height + 5, f"{int(height)}", 
#              ha='center', va='bottom', fontsize=9)

# plt.show()


For convenience, the plot is showcased here

![The Language Distribution in the train dataset](https://i.postimg.cc/0jb1ZVNm/Screenshot-353.png)

The plot shows the language distribution. Because several languages share a script (for example, Devanagari is used for both Hindi and Bodo) and some languages have few examples in the training data, it is highly likely that fastText grouped several distinct languages together here. Either way, the plot shows the heavy Indic focus of the data, implying that any model used for extracting embedding representations should support Indic languages.

**Metric Embeddings**

In [14]:
def get_metric_embeddings(df, metric_names, metric_name_embeddings):
    """
    Look up the embedding row for every entry of df['metric_name'].

    Each metric is located by its position in `metric_names`, and the row at that
    position of `metric_name_embeddings` is collected. Returns a list with one
    embedding per row of `df`; a metric absent from `metric_names` raises ValueError.
    """
    return [
        metric_name_embeddings[metric_names.index(name)]
        for name in df['metric_name']
    ]

In [15]:
# One metric embedding per training row, stacked into a 2-D array
train_metric_embs = get_metric_embeddings(train_df_clean, metric_names, metric_name_embeddings)
train_metricembs_np = np.array(train_metric_embs)
print(train_metricembs_np.shape)

(5000, 768)


### Prompt Embeddings

As the task is basically to rate how close the response is to the user prompt keeping the metric and the system prompt in mind, instead of using raw embeddings as features, it is better to create embeddings and find similarities between them to use as features


**Extracting the prompt embeddings**

It is better to enable GPU for just this function

In [16]:
def get_prompt_embeddings(df, model_1, model_2, save= False, kind= 'train', name1= 'vyak', name2= 'multi'):
    """
    Encode the user_prompt, system_prompt and response columns with two models.

    Both models must expose an `encode` method. Returns two dicts (one per model)
    mapping each column name to the encoded array.
    When `save` is true, every array is also written to
    `{name}_{feature}_{kind}.npy` in the current working directory.
    """
    mod1_results, mod2_results = {}, {}

    for feature in ('user_prompt', 'system_prompt', 'response'):
        print('Feature:', feature)
        print('Model: 1')
        encoded_1 = np.array(model_1.encode(df[feature]))
        print('Model: 2')
        encoded_2 = np.array(model_2.encode(df[feature]))

        mod1_results[feature], mod2_results[feature] = encoded_1, encoded_2

        if not save:
            continue
        np.save(f'{name1}_{feature}_{kind}.npy', encoded_1)
        np.save(f'{name2}_{feature}_{kind}.npy', encoded_2)

    return mod1_results, mod2_results

In [17]:
def load_prompt_embeddings_saved(fpath, modelname, kind= 'train'):
    """
    Load previously saved prompt embeddings for one model.

    Reads `{fpath}/{modelname}_{feature}_{kind}.npy` for each of user_prompt,
    system_prompt and response, and returns them as a dict keyed by feature name.
    """
    return {
        feature: np.load(f'{fpath}/{modelname}_{feature}_{kind}.npy')
        for feature in ('user_prompt', 'system_prompt', 'response')
    }

In [None]:
# Generate the embeddings using the model and save them (download them preferably)...
# vyak_results_train, multi_results_train = get_prompt_embeddings(train_df_clean, model_vyak, model_multi, save= True, kind= 'train')
# ai4b_results_train, gemma_results_train = get_prompt_embeddings(train_df_clean, model_ai4b, model_gemma, save= True, kind= 'train', name1= 'ai4b', name2= 'gemma')


# Or load the saved embeddings of each model, in the order vyak, multi, ai4b, gemma
vyak_results_train, multi_results_train, ai4b_results_train, gemma_results_train = (
    load_prompt_embeddings_saved(rootpath, model_tag, 'train')
    for model_tag in ('vyak', 'multi', 'ai4b', 'gemma')
)

In [19]:
## sanity check: print the user_prompt embedding shape of every model, one per line
print(
    *(
        results['user_prompt'].shape
        for results in (vyak_results_train, multi_results_train, ai4b_results_train, gemma_results_train)
    ),
    sep='\n',
)

(5000, 768)
(5000, 768)
(5000, 768)
(5000, 768)


**Combining the embeddings of the models**

As embeddings are of the same dimension, the combination is linear

In [20]:
def combine_model_embeddings(vyak_results, multi_results, ai4b_results, gemma_results, weights_vyak, weights_multi, weights_ai4b, weights_gemma):
    """
    Build the weighted sum of the four models' embeddings, feature by feature.

    The features are the keys of `vyak_results`; every other result dict must
    provide the same keys. Returns a dict mapping each feature to the combination.
    """
    # kept in this order so the terms are summed exactly as vyak + multi + ai4b + gemma
    weighted_sources = (
        (vyak_results, weights_vyak),
        (multi_results, weights_multi),
        (ai4b_results, weights_ai4b),
        (gemma_results, weights_gemma),
    )
    (first_source, first_weight), *remaining = weighted_sources

    combined_results = {}
    for feature in vyak_results:
        blended = first_source[feature] * first_weight
        for source, weight in remaining:
            blended = blended + source[feature] * weight
        combined_results[feature] = blended
    return combined_results
        

In [21]:
# NOTE: these values replace the weights assigned in the initialisation cell;
# the unpacking order is vyak, multi, gemma, ai4b (so gemma = 1 and ai4b = 2)
weights_vyak, weights_multi, weights_gemma, weights_ai4b = 0.5, 0.25, 1, 2
# the function takes the ai4b weight before the gemma weight
combined_results_train = combine_model_embeddings(vyak_results_train, multi_results_train, ai4b_results_train, gemma_results_train, weights_vyak, weights_multi, weights_ai4b, weights_gemma)

## sanity check
print(combined_results_train['user_prompt'].shape)

(5000, 768)


**Mean of Embeddings as Features**

In [22]:
def take_mean(combined_results):
    """
    Return the element-wise mean of the user_prompt, system_prompt and response
    embeddings found in `combined_results`.

    The three arrays must share a shape; the result has that same shape.
    Any other key in the dict is ignored.
    """
    # the three arrays are read once; no iteration over the dict is needed
    parts = [combined_results[key] for key in ('user_prompt', 'system_prompt', 'response')]
    return np.mean(parts, axis= 0)

In [23]:
# Mean of the combined prompt/system/response embeddings for every training row
mean_train_embeddings = take_mean(combined_results_train)
print(mean_train_embeddings.shape)

(5000, 768)


**Similarity Scores as features**

Cosine similarity between the following pairs is used to act as features:
1. user_prompt and response: The key factor which is actually measured by the LLM Judge
2. system_prompt and response: To see if the response aligns with the system_prompt, showcasing how good the LLM is at following commands
3. metric and response: To see if the response fits the metric well, showcasing if the LLM understands the task well
4. metric and user_prompt: To see how close the user prompt is to the intended task, showcasing how much the LLM had penalised (or rewarded) even if the user prompt was not correct

In [24]:
def pairwise_cosine_similarity(a, b):
    """
    Row-wise cosine similarity between two 2-D arrays of equal shape.

    Only row i of `a` is compared with row i of `b`, which avoids building the
    full pairwise matrix that sklearn's cosine_similarity computes.
    A row whose norm is zero (such as the all-zero default metric embedding)
    yields a similarity of 0 instead of NaN.

    Returns a 1-D array with one similarity per row.
    """
    def _unit_rows(x):
        # Scale every row to unit length, leaving all-zero rows as zeros.
        x = np.asarray(x)
        norms = np.linalg.norm(x, axis= 1, keepdims= True)
        norms[norms == 0] = 1  # avoid 0/0; the row stays a zero vector
        return x / norms

    return np.sum(_unit_rows(a) * _unit_rows(b), axis= 1)

def get_similarity_scores(metric, combined):
    """
    Build the four cosine-similarity features for every row.

    Columns of the returned array, in order:
    user_prompt vs response, system_prompt vs response,
    metric vs response, metric vs user_prompt.
    """
    prompt_vecs = combined['user_prompt']
    system_vecs = combined['system_prompt']
    response_vecs = combined['response']

    compared_pairs = (
        (prompt_vecs, response_vecs),
        (system_vecs, response_vecs),
        (metric, response_vecs),
        (metric, prompt_vecs),
    )
    similarities = [pairwise_cosine_similarity(left, right) for left, right in compared_pairs]
    # stack as rows, then transpose so that every similarity becomes a column
    return np.vstack(similarities).T

In [25]:
# Cosine-similarity features for the training rows
train_embedding_features = get_similarity_scores(train_metricembs_np, combined_results_train)
print(train_embedding_features.shape)

(5000, 4)


**TF-IDF as Features**

Regularly used in text embedding tasks, efficient at extracting statistical information

In [26]:
def add_tfidf_features(df_clean, max_feats= 50):
    """
    Fit a TF-IDF vectorizer on the user_prompt and response of each row,
    joined by a single space, and return the fitted vectorizer.

    `max_feats` caps the vocabulary size.
    """
    # NOTE(review): accent stripping may also drop combining marks used by
    # Indic scripts — confirm this is intended for this data.
    vectorizer_settings = dict(
        strip_accents= 'unicode',
        ngram_range= (1, 3),
        max_df= 0.95,
        min_df= 2,
        max_features= max_feats,
    )
    corpus = df_clean['user_prompt'] + ' ' + df_clean['response']
    # fit() returns the vectorizer itself
    return TfidfVectorizer(**vectorizer_settings).fit(corpus)

In [27]:
# Fit the vectorizer on the training text only (vocabulary capped at 25 terms)
tf_idf_vect = add_tfidf_features(train_df_clean, max_feats= 25)

# Rebuild the same prompt + response text and turn it into TF-IDF features
combined_text = train_df_clean['user_prompt']+ ' ' +train_df_clean['response']
train_tfidf = tf_idf_vect.transform(combined_text)

**Features utilising mean**

In [28]:
def get_mean_based_embeddings(mean_embeddings, metric_embeddings):
    """
    Summarise how each mean embedding relates to its metric embedding.

    Both inputs are 2-D arrays of the same shape. For every row, two values are
    returned: the L2 norm of the element-wise product and the L2 norm of the
    absolute element-wise difference.

    Returns an array with one row per input row and two columns.
    """
    hadamard = mean_embeddings * metric_embeddings
    absdiff = np.abs(mean_embeddings - metric_embeddings)
    norms = np.stack([
        np.linalg.norm(hadamard, axis= 1),
        np.linalg.norm(absdiff, axis= 1),
    ])
    # stack produced shape (2, n); transpose to one row per sample
    return norms.T

In [29]:
# Norm-based features relating the mean embeddings to the metric embeddings
X_train_memes = get_mean_based_embeddings(mean_train_embeddings, train_metricembs_np)
X_train_memes.shape

(5000, 2)

### Preparing Final Train Data

Stacking all feature groups into one dense matrix (no scaling is applied in this cell) and computing balanced sample weights

In [30]:
# Column order: mean embeddings, similarity features, TF-IDF (densified), mean-based norms
X_train = np.hstack([mean_train_embeddings, train_embedding_features, train_tfidf.toarray(), X_train_memes])
# same labels as extracted right after cleaning; recomputed here
y_train =  train_df_clean['score'].to_numpy().astype(np.float64)
print(X_train.shape, y_train.shape)

# Bin scores and compute balanced weights
# (bin edges are the integers 0..10, so a fractional score such as 9.5 shares a bin with 9.0)
score_bins = np.digitize(y_train, bins=np.arange(0, 11, 1))

# Compute balanced weights (rare scores get higher weight)
sample_weights = compute_sample_weight('balanced', score_bins)

(5000, 799) (5000,)


In [31]:
# Frequency of each distinct score value in the training labels
print('Counts of occurrences in the dataset:')
pd.Series(y_train).value_counts()

Counts of occurrences in the dataset:


9.0     3123
10.0    1442
8.0      259
7.0       95
6.0       45
0.0       13
3.0        7
1.0        6
2.0        5
4.0        3
5.0        1
9.5        1
Name: count, dtype: int64

## Evaluation

Load and Clean Test Data

In [32]:
# Same loading and cleaning steps as for the training data ('test' has no score column)
test_data = extract_promptresponse(f'{rootpath}/hackathon_data', 'test', metric_names)
test_df = pd.DataFrame(test_data)
test_df_clean = clean_df(test_df, fillstats) 
# One metric embedding per test row, stacked into a 2-D array
test_metric_embs = get_metric_embeddings(test_df_clean, metric_names, metric_name_embeddings)
test_metricembs_np = np.array(test_metric_embs)

test set has 3638 entries
Missing value before filling in defaults:
 idx                 0
metric_name         0
user_prompt         0
response            1
system_prompt    1106
dtype: int64
Missing value after filling in defaults:
 idx              0
metric_name      0
user_prompt      0
response         0
system_prompt    0
dtype: int64


Get Test Data Embeddings

In [None]:
# Generate the embeddings using the model and save them (also download preferably)...
# vyak_results_test, multi_results_test = get_prompt_embeddings(test_df_clean, model_vyak, model_multi, save= True, kind= 'test')
# ai4b_results_test, gemma_results_test = get_prompt_embeddings(test_df_clean, model_ai4b, model_gemma, save= True, kind= 'test', name1= 'ai4b', name2= 'gemma')


# or load the saved embeddings of each model, in the order vyak, multi, ai4b, gemma

vyak_results_test, multi_results_test, ai4b_results_test, gemma_results_test = (
    load_prompt_embeddings_saved(rootpath, model_tag, 'test')
    for model_tag in ('vyak', 'multi', 'ai4b', 'gemma')
)

In [34]:
# Reuses the model weights currently held in the notebook globals
combined_results_test = combine_model_embeddings(vyak_results_test, multi_results_test, ai4b_results_test, gemma_results_test, weights_vyak, weights_multi, weights_ai4b, weights_gemma)
test_embedding_features = get_similarity_scores(test_metricembs_np, combined_results_test)

# TF-IDF uses the vectorizer fitted on the training text (transform only, no refit)
combined_text = test_df_clean['user_prompt']+ ' ' +test_df_clean['response']
test_tfidf = tf_idf_vect.transform(combined_text)

mean_test_embeddings = take_mean(combined_results_test)
X_test_memes = get_mean_based_embeddings(mean_test_embeddings, test_metricembs_np)


# Column order must match the one used to build X_train
X_test = np.hstack([mean_test_embeddings, test_embedding_features, test_tfidf.toarray(), X_test_memes])
print(X_test.shape)

(3638, 799)


In [35]:
# A fractional n_components asks PCA to keep enough components to explain that share of the variance
pca2 = PCA(n_components= 0.99)
# fitted on the training matrix only; the same projection is then applied to both sets
pca2.fit(X_train)
X_train = pca2.transform(X_train)
X_test = pca2.transform(X_test)

In [36]:
# Shape of the training matrix after the PCA projection
print(X_train.shape)

(5000, 523)


Regression

In [36]:
model2 = RandomForestRegressor(random_state= 42, n_estimators= 100, criterion= 'squared_error', ccp_alpha= 0.5, max_features= 'sqrt')
# the balanced sample weights give rare scores more influence during fitting
model2.fit(X_train, y_train, sample_weight= sample_weights)
y_train_pred = model2.predict(X_train)
y_train_pred = np.round(y_train_pred, decimals= 1)
# NOTE: this RMSE is computed on the data the model was fitted on, not on held-out data
print(np.sqrt(mean_squared_error(y_train, y_train_pred)))
y_test_predict = model2.predict(X_test)
# predictions are rounded to one decimal place
y_test_predict = np.round(y_test_predict, decimals= 1)

2.4867705965770144


**Getting Output**

In [37]:
# Single 'score' column holding the test predictions
ans = pd.Series(y_test_predict, name= 'score').reset_index(drop=True).to_frame()
# shift the index so that IDs start at 1
ans.index = ans.index + 1
ans.index.name = 'ID'
# written with the ID index and a header row
ans.to_csv('submission.csv', index= True, header= True)
ans

Unnamed: 0_level_0,score
ID,Unnamed: 1_level_1
1,7.1
2,6.6
3,6.0
4,6.9
5,7.0
...,...
3634,6.7
3635,6.4
3636,6.6
3637,7.0


## Experimentation

Everything done to select the optimum models, along with functions, at one place for easy access

**Prediction Clipping**

In [138]:
# def adaptive_clip_predictions(y_pred, train_distribution):
#     """
#     Clip predictions based on test distribution hypothesis
#     """
    
#     # Hypothesis: test has more mid-range scores
#     # Prevent extreme predictions
    
#     # Soft clipping towards center
#     y_clipped = y_pred.copy()
    
#     # Push extreme predictions towards 4-8 range
#     too_high = y_pred > 9.0
#     y_clipped[too_high] = 9.0 + (y_pred[too_high] - 9.0) * 0.3
    
#     too_low = y_pred < 3.0
#     y_clipped[too_low] = 3.0 + (y_pred[too_low] - 3.0) * 0.3
    
#     # Hard clip
#     y_clipped = np.clip(y_clipped, 0, 10)
    
#     return y_clipped

# y_test_pred = adaptive_clip_predictions(y_test_pred, y_train)


**Mahalanobis-2**

In [139]:
# from sklearn.ensemble import VotingRegressor

# # Define tasks based on your distribution
# tasks = {
#     'high_scores': (8, 11),    # Abundant, train with underweight
#     'mid_scores': (6, 8),      # Rare, train with overweight  
#     'low_scores': (0, 6)       # Very rare, train with heavy overweight
# }

# task_models = {}
# task_weights = {}

# for task_name, (low, high) in tasks.items():
#     mask = (y_train >= low) & (y_train < high)
    
#     if mask.sum() < 5:
#         print(f"Skipping {task_name}: insufficient samples ({mask.sum()})")
#         continue
    
#     # Compute task-specific Mahalanobis features
#     X_task = X_train[mask]
#     y_task = y_train[mask]
    
#     # Task-specific covariance
#     from sklearn.covariance import LedoitWolf
#     cov = LedoitWolf().fit(X_task).covariance_
    
#     # Transform using Mahalanobis matrix
#     try:
#         L = np.linalg.cholesky(cov + np.eye(cov.shape[0]) * 1e-4)
#         X_task_transformed = X_task @ L.T
#     except:
#         print(f"Using original features for {task_name}")
#         X_task_transformed = X_task
    
#     # Train task-specific model
#     task_model = RandomForestRegressor(
#         n_estimators=200,
#         max_depth=None,
#         min_samples_split=max(2, len(y_task) // 10),
#         min_samples_leaf=max(1, len(y_task) // 20),
#         random_state=42
#     )
    
#     task_model.fit(X_task_transformed, y_task)
#     task_models[task_name] = (task_model, L if 'L' in locals() else None)
    
#     print(f"{task_name}: {mask.sum()} samples, train RMSE: {np.sqrt(mean_squared_error(y_task, task_model.predict(X_task_transformed))):.3f}")

# # Ensemble prediction with task-aware weighting
# def predict_multitask(X_test_enhanced):
#     all_preds = []
    
#     for task_name, (model, L) in task_models.items():
#         if L is not None:
#             X_transformed = X_test_enhanced @ L.T
#         else:
#             X_transformed = X_test_enhanced
        
#         preds = model.predict(X_transformed)
#         all_preds.append(preds)
    
#     # Weighted average (favor mid_scores model for test)
#     weights = {
#         'high_scores': 0.45,
#         'mid_scores': 0.55,   # Higher weight for test distribution
#         'low_scores': 0.0
#     }
    
#     final_preds = np.zeros(len(X_test_enhanced))
#     for i, task_name in enumerate(task_models.keys()):
#         final_preds += weights[task_name] * all_preds[i]
    
#     return np.clip(final_preds, 0, 10)

# y_test_pred = predict_multitask(X_test)


**Mahalanobis**

In [140]:
# from scipy.spatial.distance import mahalanobis
# from sklearn.covariance import LedoitWolf

# def add_mahalanobis_features(X_train, y_train):
#     """
#     Add Mahalanobis distance to score bin centroids.
#     """
    
#     bins_dict = {
#         'low': (0, 6),      
#         'mid_low': (6, 8),  
#         'mid': (8, 9),      
#         'high': (9, 11)  
#     }
#     centroids = {}
#     inv_covs = {}
    
#     for bin_name, (low, high) in bins_dict.items():
#         mask = (y_train >= low) & (y_train < high)
#         X_bin = X_train[mask]
        
#         if X_bin.shape[0] < 3:
#             # Not enough samples, skip this bin
#             continue
        
#         centroids[bin_name] = X_bin.mean(axis=0)

#         cov_estimator = LedoitWolf().fit(X_bin)
#         cov = cov_estimator.covariance_
        
#         cov_reg = cov + np.eye(cov.shape[0]) * 1e-4
        
#         try:
#             inv_covs[bin_name] = np.linalg.inv(cov_reg)
#         except:
#             print(f"Warning: Singular covariance for {bin_name}, using pseudo-inverse")
#             inv_covs[bin_name] = np.linalg.pinv(cov_reg)

#     return centroids, inv_covs

# def calculate_mahal(X, centroids, inv_covs):
#     mahal_features = []
    
#     for x in X:
#         dists = []
#         for bin_name in sorted(centroids.keys()):
#             try:
#                 dist = mahalanobis(x, centroids[bin_name], inv_covs[bin_name])
#             except:
#                 dist = np.linalg.norm(x - centroids[bin_name])
#             dists.append(dist)
        
#         # Add inverse distances (closer = higher value)
#         inv_dists = [1.0 / (d + 1e-6) for d in dists]
        
#         # Add both raw and inverse distances
#         mahal_features.append(dists + inv_dists)

#     return np.array(mahal_features)

# mahal_train_features = np.hstack([
#         train_embedding_features,
#         train_tfidf.toarray(),
# ])

# centroids_mahal, inv_covs_mahal = add_mahalanobis_features(mahal_train_features, y_train)
# mahal_train = calculate_mahal(mahal_train_features, centroids_mahal, inv_covs_mahal)

**Grid Search**

In [141]:
# linear_grid = {
#     "LinearRegression": {},
#     "Ridge": {'alpha': [0.01, 0.1, 1, 10], 'max_iter': [None, 1000, 10000]},
#     "SGDRegressor": {'penalty': ['l2', 'l1', 'elasticnet'], 'alpha': [1e-4, 1e-2, 1], 'max_iter': [100, 1000, 10000]},
#     "Lasso": {'alpha': [0.01, 0.1, 1, 10], 'max_iter': [1000, 10000]},
#     "ElasticNet": {'alpha': [0.01, 0.1, 1, 10], 'max_iter': [1000, 10000], 'l1_ratio': [0, 0.25, 0.5, 0.75, 1], 
#                    'selection': ['cyclic', 'random']}
# }

# linear_models = {
#     "LinearRegression": LinearRegression(),
#     "Ridge": Ridge(random_state= 42),
#     "SGDRegressor": SGDRegressor(random_state= 42),
#     "Lasso": Lasso(random_state= 42),
#     "ElasticNet": ElasticNet(random_state= 42)
# }

# tree_grid = {
#     "DecisionTreeRegressor": {'criterion': ['squared_error', 'friedman_mse'], 'splitter': ['best', 'random'], 
#                               'ccp_alpha': [0., 0.5, 1.], 'max_features': ['sqrt', 'log2', 1.0]},
#     "RandomForestRegressor": {'n_estimators': [100, 200, 500], 'criterion': ['squared_error', 'friedman_mse'], 
#                               'ccp_alpha': [0., 0.5, 1.], 'max_features': ['sqrt', 'log2', 1.0]},
#     "AdaBoostRegressor": {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 1, 10], 'loss': ['linear', 'square', 'exponential']},
# }

# tree_models = {
#     "RandomForestRegressor": RandomForestRegressor(random_state= 42),
#     "AdaBoostRegressor": AdaBoostRegressor(random_state= 42),
#     "DecisionTreeRegressor": DecisionTreeRegressor(random_state= 42)
# }

# booster_grid = {
#     "LGBMRegressor": {'boosting_type': ['gbdt', 'rf', 'dart'], 'learning_rate': [0.1, 0.01, 1], 'n_estimators': [100, 200, 500],
#                      'reg_alpha': [0., 0.5, 1], 'reg_lambda': [0., 0.5, 1]},
#     "XGBRegressor": {'n_estimators': [100, 200, 500], 'grow_policy': ['depthwise', 'lossguide'], 'learning_rate': [0.01, 0.1, 1],
#                     'booster': ['gbtree', 'gblinear', 'dart'], 'reg_alpha': [0., 0.5, 1], 'reg_lambda': [0., 0.5, 1]},
# }

# booster_models = {
#     "LGBMRegressor": LGBMRegressor(random_state= 42, verbosity= -1),
#     "XGBRegressor": XGBRegressor(random_state= 42),
# }

# other_grid = {
#     "KNeighborsRegressor": {'n_neighbors': [1, 3, 5, 7], 'weights': ['uniform', 'distance'], 'p': [1, 2, 1.5]},
#     "RadiusNeighborsRegressor": {'radius': [1., 2., 1.5], 'weights': ['uniform', 'distance'], 'p': [1, 2, 1.5]},
#     "GaussianProcessRegressor": {},
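#     # NOTE: 'sigmoid' is not a valid MLPRegressor activation ('logistic' is the sigmoid), so it is dropped from the grid
#     "MLPRegressor": {'hidden_layer_sizes': [[16, 16], [8, 8, 8], [16, 8]], 'activation': ['relu', 'tanh', 'logistic'],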
#                     'alpha': [1e-4, 1e-3], 'learning_rate': ['constant', 'invscaling', 'adaptive'], 'learning_rate_init': [0.001, 0.01],
#                     'max_iter': [200, 500, 1000]},
#     "SVR": {'kernel': ['linear', 'rbf'], 'gamma': ['scale', 'auto'], 'C': [0.01, 0.1, 1]}
# }

# other_models = {
#     "KNeighborsRegressor": KNeighborsRegressor(),
#     "RadiusNeighborsRegressor": RadiusNeighborsRegressor(),
#     "GaussianProcessRegressor": GaussianProcessRegressor(random_state= 42),
#     "MLPRegressor": MLPRegressor(random_state= 42),
#     "SVR": SVR()
# }

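# def run_grid_search(models_dict, grids_dict, X_train, y_train, cv= 5, n_jobs= -1, verbose= 0):
#     """
#     Run GridSearchCV for every model in models_dict over its parameter grid in grids_dict.
#     Models listed in requires_scaling are wrapped in a Pipeline and the scaler is searched as well.
#     cv is the number of cross-validation folds.
#     Returns a list of dicts with keys 'model', 'best_rmse' and 'best_params'.
#     """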

#     requires_scaling = ['LinearRegression', 'Lasso', 'Ridge', 'ElasticNet', 'SGDRegressor',
#                        'KNeighborsRegressor', 'RadiusNeighborsRegressor', 'SVR', 'MLPRegressor',
#                        'GaussianProcessRegressor']
#     results = []
#     scorer = make_scorer(mean_squared_error, greater_is_better= False)
#     scalers = {
#         'StandardScaler': StandardScaler(),
#         'MinMax': MinMaxScaler(),
#         'MaxAbs': MaxAbsScaler()
#     }

#     for name, model in tqdm(models_dict.items(), desc= 'Progress'):
#         tqdm.write(f'GridSearch for {name}')
#         if name in requires_scaling:
#             pipe = Pipeline([
#                 ('scaler', StandardScaler()),
#                 ('model', model)
#             ])
#             param_grid = {
#                 **{f'scaler': list(scalers.values())},
#                 **{f'model__{k}': v for k, v in grids_dict.get(name, {}).items()}
#             }
#         else:
#             pipe = model
#             param_grid = grids_dict.get(name, {})

#         grid = GridSearchCV(estimator= pipe, param_grid= param_grid, cv= cv, 
#                             scoring= scorer, n_jobs= n_jobs, verbose= verbose)
#         grid.fit(X_train, y_train)

#         best_rmse = np.sqrt(-grid.best_score_)
#         results.append({
#             'model': name,
#             'best_rmse': best_rmse,
#             'best_params': grid.best_params_
#         })
#         tqdm.write(f'Best RMSE: {best_rmse:.4f}')
#         tqdm.write(f'Best params: {grid.best_params_}')
#         tqdm.write(f'\n')

#     return results


# ## linear models
# results_linear = run_grid_search(linear_models, linear_grid, X_train, y_train, cv= 5, n_jobs= -1, verbose= 0)


## tree models
# results_tree = run_grid_search(tree_models, tree_grid, X_train, y_train, cv= 5, n_jobs= -1, verbose= 0)

## booster models
# results_booster = run_grid_search(booster_models, booster_grid, X_train, y_train, cv= 5, n_jobs= -1, verbose= 0)


## other models
# results_other = run_grid_search(other_models, other_grid, X_train, y_train, cv= 5, n_jobs= -1, verbose= 0)


# Recorded output from earlier runs of run_grid_search, kept for reference:
# GridSearch for LinearRegression
# Best RMSE: 0.9421
# Best params: {'scaler': StandardScaler()}


# GridSearch for Ridge
# Best RMSE: 0.9420
# Best params: {'model__alpha': 10, 'model__max_iter': None, 'scaler': MinMaxScaler()}


# GridSearch for SGDRegressor
# Best RMSE: 0.9418
# Best params: {'model__alpha': 0.01, 'model__max_iter': 100, 'model__penalty': 'l1', 'scaler': StandardScaler()}


# GridSearch for Lasso
# Best RMSE: 0.9420
# Best params: {'model__alpha': 0.01, 'model__max_iter': 1000, 'scaler': StandardScaler()}


# GridSearch for ElasticNet
# Best RMSE: 0.9419
# Best params: {'model__alpha': 1, 'model__l1_ratio': 0, 'model__max_iter': 1000, 'model__selection': 'cyclic', 'scaler': StandardScaler()}


# GridSearch for KNeighborsRegressor
# Best RMSE: 0.9931
# Best params: {'model__n_neighbors': 7, 'model__p': 1, 'model__weights': 'uniform', 'scaler': MaxAbsScaler()}


# GridSearch for RadiusNeighborsRegressor
# Best RMSE: 0.9420
# Best params: {'model__p': 1, 'model__radius': 1.0, 'model__weights': 'uniform', 'scaler': MaxAbsScaler()}


# GridSearch for GaussianProcessRegressor
# Best RMSE: 1.1256
# Best params: {'scaler': MinMaxScaler()}


# GridSearch for MLPRegressor
# Best RMSE: 0.9418
# Best params: {'model__activation': 'logistic', 'model__alpha': 0.0001, 'model__hidden_layer_sizes': [16, 16], 'model__learning_rate': 'constant', 'model__learning_rate_init': 0.001, 'model__max_iter': 200, 'scaler': StandardScaler()}


# GridSearch for SVR
# Best RMSE: 0.9424
# Best params: {'model__C': 1, 'model__gamma': 'scale', 'model__kernel': 'rbf', 'scaler': MinMaxScaler()}


# GridSearch for RandomForestRegressor
# Best RMSE: 0.9424
# Best params: {'ccp_alpha': 0.5, 'criterion': 'squared_error', 'max_features': 'sqrt', 'n_estimators': 100}


# GridSearch for AdaBoostRegressor
# Best RMSE: 0.9427
# Best params: {'learning_rate': 0.01, 'loss': 'exponential', 'n_estimators': 50}


# GridSearch for DecisionTreeRegressor
# Best RMSE: 0.9424
# Best params: {'ccp_alpha': 0.5, 'criterion': 'squared_error', 'max_features': 'sqrt', 'splitter': 'best'}


# GridSearch for LGBMRegressor
# Best RMSE: 0.9446
# Best params: {'boosting_type': 'gbdt', 'learning_rate': 0.01, 'n_estimators': 100, 'reg_alpha': 1, 'reg_lambda': 0.5}


# GridSearch for XGBRegressor
# Best RMSE: 0.9440
# Best params: {'n_estimators': 200, 'grow_policy': 'lossguide', 'learning_rate': 0.01, 'booster': 'dart', 'reg_alpha': 1, 'reg_lambda': 0.5}

In [142]:


# scorer = make_scorer(f1_score, pos_label=1)

# param_grid = {
#     'n_estimators': [30, 100, 300],
#     'max_depth': [3, 5, 9],
#     'learning_rate': [0.01, 0.1, 0.2],
#     'subsample': [0.7, 0.9, 1.0],
#     'colsample_bytree': [0.7, 1.0],
#     'scale_pos_weight': [ratio, ratio*1.5, ratio*2]
# }

# clf = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42, n_jobs=-1, 
#                    colsample_bytree= 0.7, learning_rate= 0.1, max_depth= 5, n_estimators= 100, scale_pos_weight= ratio,
#                    subsample= 0.9)
# grid = GridSearchCV(clf, param_grid, scoring=scorer, cv=3, verbose=2, n_jobs=-1)


# grid.fit(X_train, is_rare)
# best_model = grid.best_estimator_
# print(grid.best_params_)
# {'colsample_bytree': 0.7,
#  'learning_rate': 0.1,
#  'max_depth': 5,
#  'n_estimators': 100,
#  'scale_pos_weight': 10.52073732718894,
#  'subsample': 0.9}

**Classification into rare and abundant**

In [143]:
# def modify_clf_train_data(X_train, y_train):
#     """
#     Returns resampled and shuffled X_resamp, y_resamp for classification.
#     Only the scores listed in score_mult are kept: scores 6 and 7 are upsampled
#     with replacement, the remaining scores are downsampled without replacement,
#     each to int(count * multiplier) samples.
#     """
#     score_mult = {6: 4, 7: 2, 8: 0.8, 9: 0.06, 10: 0.125}
#     n = len(X_train)
#     X_list, y_list = [], []
#     for score, mult in score_mult.items():
#         mask = (y_train == score)
#         cty = int(mask.sum() * mult)
#         if score == 6 or score == 7:
#             X_majority, y_majority = X_train[mask == 0], y_train[mask == 0]
#             X_minority, y_minority = X_train[mask == 1], y_train[mask == 1]
#             X_ups, y_ups = resample(X_minority, y_minority, replace= True, n_samples = cty, random_state= 42)
#             X_list.append(X_ups)
#             y_list.append(y_ups)
#         else:
#             X_majority, y_majority = X_train[mask == 1], y_train[mask == 1]
#             X_minority, y_minority = X_train[mask == 0], y_train[mask == 0]
#             X_dps, y_dps = resample(X_majority, y_majority, replace= False, n_samples = cty, random_state= 42)
#             X_list.append(X_dps)
#             y_list.append(y_dps)

#     np.random.seed(200)
#     X_resamp, y_resamp = np.vstack(X_list), np.hstack(y_list)
#     shuffle_ids = np.random.permutation(len(X_resamp))
#     print(X_resamp.shape, y_resamp.shape)
#     return X_resamp[shuffle_ids], y_resamp[shuffle_ids]

# X_resamp, y_resamp = modify_clf_train_data(X_train, y_train)

In [144]:
# is_rare = (y_train <= 7).astype(int)
# is_abundant = (y_train >= 8).astype(int)
# ratio = sum(is_rare == 0) / sum(is_rare == 1)
# mask_rare = (is_rare == 1)
# mask_abundant = (is_abundant == 1)

# X_train_rare, y_train_rare = X_train[mask_rare], y_train[mask_rare]
# X_train_abundant, y_train_abundant = X_train[mask_abundant], y_train[mask_abundant]


# resamp_rare = (y_resamp <= 7).astype(int)
# resamp_raremask = (resamp_rare == 1)
# y_rare_resamp = y_resamp[resamp_raremask]


# clf = GradientBoostingClassifier(random_state= 42, n_estimators= 300, max_depth= None)
# # clf.fit(X_train, mask_rare)
# clf.fit(X_resamp, resamp_raremask)

# clf = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42, n_jobs=-1, colsample_bytree= 0.7, 
#                     learning_rate= 0.1, max_depth= 5, n_estimators= 100, scale_pos_weight= ratio, subsample= 0.9)
# clf.fit(X_train, is_rare, sample_weight= sample_weights)

# probs = clf.predict_proba(X_train)[:, 1]
# probs = clf.predict(X_resamp)
# is_rare_pred = (probs > rare_thresh).astype(int)
# print(classification_report(is_rare, is_rare_pred))  # argument order is (y_true, y_pred)

# model_rare = RandomForestRegressor(random_state= 42, n_estimators= 300, criterion= 'squared_error', ccp_alpha= 0.5, max_features= 'sqrt')
# model_abundant = RandomForestRegressor(random_state= 42, n_estimators= 100, criterion= 'squared_error', ccp_alpha= 0.5, max_features= 'sqrt')

# model_rare.fit(X_train_rare, y_train_rare)
# model_abundant.fit(X_train_abundant, y_train_abundant)

# print(np.sqrt(mean_squared_error(y_train_rare, model_rare.predict(X_train_rare))))
# print(np.sqrt(mean_squared_error(y_train_abundant, model_abundant.predict(X_train_abundant))))

# def prediction_pipeline(X, clf, rare_thresh, model_rare, model_abundant):
#     # probs = clf.predict_proba(X)[:, 1]
#     probs = clf.predict(X)
#     is_rare_pred = (probs > rare_thresh).astype(int)
#     y_pred = np.zeros(X.shape[0])
#     y_pred[is_rare_pred == 1] = model_rare.predict(X[is_rare_pred == 1])
#     y_pred[is_rare_pred == 0] = model_abundant.predict(X[is_rare_pred == 0])
#     return np.clip(y_pred, 0, 10)

# y_train_pred = prediction_pipeline(X_train, clf, rare_thresh, model_rare, model_abundant)
# print(np.sqrt(mean_squared_error(y_train, y_train_pred)))

# y_test_predict = prediction_pipeline(X_test, clf, rare_thresh, model_rare, model_abundant)

**Base Pipeline**

In [145]:
## Original Pipeline

# # XGBRegressor
# model1 = XGBRegressor(random_state= 42, n_estimators= 200, grow_policy= 'lossguide', learning_rate= 0.01, booster= 'dart', reg_alpha= 1, reg_lambda= 0.5, 
#                       objective='reg:pseudohubererror', huber_slope=2.0,
#                      )
# # MLPRegressor-> Standard Scaler needed
# model3 = MLPRegressor(random_state= 42, activation= 'logistic', alpha= 0.0001, hidden_layer_sizes= [16, 16], learning_rate= 'constant', learning_rate_init= 0.001, max_iter= 200)
# scaler = StandardScaler()
# X_train_sc = scaler.fit_transform(X_train)
# X_test_sc = scaler.transform(X_test)



# model1.fit(X_train, y_train, sample_weight= sample_weights)
# model2.fit(X_resamp, y_resamp)  # NOTE(review): model2 is not defined in this cell; confirm where it comes from
# model3.fit(X_train_sc, y_train, sample_weight= sample_weights)

# # y_train_pred = model1.predict(X_train) 
# y_resamp_pred = model2.predict(X_resamp) 
# # y_train_pred = model3.predict(X_train_sc) 
# # y_test_predict = model1.predict(X_test)
# # y_test_predict = model3.predict(X_test_sc)


