# 1. Problem Information
- **Name:** [**Royal Diamond Store**](https://platform.olimpiada-ai.ro/en/problems/41)
- **Date:** 12/02/2026
- **Type:** Regression

# 2. Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn.linear_model import LinearRegression,Ridge
from sklearn.svm import SVR
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler,FunctionTransformer
from sklearn.pipeline import make_pipeline
from catboost import CatBoostRegressor

# 3. Data preparation

In [2]:
def ProcessData(df):
    """Build the modelling features from a raw diamonds frame.

    The input frame is NOT modified: all work happens on a copy, so the
    function can be re-run on the same raw frame (e.g. when a notebook cell
    is executed twice) and always produces the same result.

    Steps:
      * ``volume``        = x * y * z
      * ``volume_carat``  = volume / carat
      * ``proportion``    = depth / table
      * ``cut``, ``color`` and ``clarity`` are replaced by ordinal integer
        codes; labels missing from the maps below become NaN.
      * ``x``, ``y`` and ``z`` are dropped.
      * ``price`` (when present) is moved to the last column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns x, y, z, carat, depth, table, cut, color
        and clarity. ``price`` is optional.

    Returns
    -------
    pandas.DataFrame
        A new frame with the engineered columns described above.
    """
    cut_map = {"Fair": 1, "Good": 2, "Very Good": 3, "Premium": 4, "Ideal": 5}
    color_map = {"J": 1, "I": 2, "H": 3, "G": 4, "F": 5, "E": 6, "D": 7}
    clarity_map = {"I3": 1,"I2": 2,"I1": 3,"SI2": 4,"SI1": 5,  "VS2": 6,"VS1": 7,"VVS2": 8,"VVS1": 9,"IF": 10,"FL": 11}

    # Work on a copy so the caller's frame keeps its raw columns; mutating it
    # in place made a second call fail (x/y/z already dropped) and turned the
    # already-encoded categorical columns into NaN.
    out = df.copy()

    out['volume'] = out['x']*out['y']*out['z']
    out["volume_carat"] = out["volume"] / out["carat"]
    out['proportion'] = out['depth'] / out['table']
    out['cut'] = out['cut'].map(cut_map)
    out['color'] = out['color'].map(color_map)
    out['clarity'] = out['clarity'].map(clarity_map)

    out = out.drop(columns=['x','y','z'])

    # Keep the target (if any) as the last column.
    columns = [col for col in out.columns if col !='price']
    if 'price' in out.columns:
        columns.append('price')

    return out[columns]

In [3]:
# Read both splits and push them through the same feature-engineering step.
train = ProcessData(pd.read_csv("data/train.csv"))
test = ProcessData(pd.read_csv("data/test.csv"))

# Quick sanity check: dimensions of the processed training set and a preview.
print(train.shape)
train.head(5)

(37758, 11)


Unnamed: 0,SampleID,carat,cut,color,clarity,depth,table,volume,volume_carat,proportion,price
0,19498,1.21,5,3,8,61.3,57.0,201.095892,166.194952,1.075439,8131
1,31230,0.31,5,6,6,62.0,56.0,51.752328,166.942994,1.107143,756
2,22312,1.21,5,6,7,62.4,57.0,195.4746,161.549256,1.094737,10351
3,279,0.81,5,5,4,62.6,55.0,131.253504,162.041363,1.138182,2795
4,6647,0.79,5,2,8,61.7,56.0,129.70881,164.188367,1.101786,4092


In [4]:
# Per-column summary statistics of the processed training set, 3 decimals.
train_summary = train.describe()
train_summary.round(3)

Unnamed: 0,SampleID,carat,cut,color,clarity,depth,table,volume,volume_carat,proportion,price
count,37758.0,37758.0,37758.0,37758.0,37758.0,37758.0,37758.0,37758.0,37758.0,37758.0,37758.0
mean,27023.205,0.8,3.904,4.404,6.051,61.745,57.461,130.141,163.298,1.076,3951.495
std,15562.754,0.476,1.115,1.7,1.645,1.429,2.227,79.134,12.864,0.053,4006.428
min,1.0,0.2,1.0,1.0,3.0,43.0,43.0,0.0,0.0,0.617,326.0
25%,13567.25,0.4,3.0,3.0,5.0,61.0,56.0,65.327,161.559,1.046,956.0
50%,26974.5,0.7,4.0,4.0,6.0,61.8,57.0,114.84,163.473,1.082,2404.0
75%,40527.5,1.04,5.0,6.0,7.0,62.5,59.0,171.109,165.328,1.112,5366.75
max,53940.0,5.01,5.0,7.0,10.0,79.0,95.0,3840.598,1920.299,1.463,18818.0


# 4. Models

In [5]:
# Feature matrix / target. The id and the target are removed by name rather
# than by position so the selection stays correct if the column layout of the
# processed frame ever changes.
X = train.drop(columns=['SampleID', 'price'])
Y = train['price']

# Baseline configuration, scored with 3-fold cross-validated MAE.
pipeline = make_pipeline(StandardScaler(),CatBoostRegressor(random_state=0,max_depth=6,iterations=2000,eval_metric='MAE',verbose=1000))
score = cross_val_score(pipeline,X,Y,cv=3,scoring='neg_mean_absolute_error')

# The scorer returns negated MAE; flip the sign to report a positive error.
print("Score is:",-score.mean())

Learning rate set to 0.038796
0:	learn: 2943.0166125	total: 148ms	remaining: 4m 55s
1000:	learn: 252.5288168	total: 4.2s	remaining: 4.19s
1999:	learn: 229.7169866	total: 8.36s	remaining: 0us
Learning rate set to 0.038796
0:	learn: 2919.8268909	total: 5.79ms	remaining: 11.6s
1000:	learn: 254.5274388	total: 4.17s	remaining: 4.16s
1999:	learn: 231.8751725	total: 8.19s	remaining: 0us
Learning rate set to 0.038796
0:	learn: 2952.4945365	total: 2.63ms	remaining: 5.26s
1000:	learn: 255.9081892	total: 3.55s	remaining: 3.54s
1999:	learn: 232.8038169	total: 7.1s	remaining: 0us
Score is: 277.50115772281407


In [6]:
# Final model trained on the full training set.
# NOTE(review): this configuration (max_depth=12, iterations=4000) differs from
# the one scored with cross_val_score (max_depth=6, iterations=2000), so the
# reported CV score does not describe this model - consider validating it too.
stronger_pipeline = make_pipeline(StandardScaler(),CatBoostRegressor(random_state=0,max_depth=12,iterations=4000,eval_metric='MAE',verbose=1000))
stronger_pipeline.fit(X,Y)

# Pick the test features by the training column names (and in the same order)
# instead of by position, so train and test can never get out of alignment.
prediction = stronger_pipeline.predict(test[X.columns])

Learning rate set to 0.023544
0:	learn: 2982.3923763	total: 55.6ms	remaining: 3m 42s
1000:	learn: 221.2169494	total: 48.8s	remaining: 2m 26s
2000:	learn: 185.7421295	total: 1m 35s	remaining: 1m 35s
3000:	learn: 161.4528252	total: 2m 19s	remaining: 46.4s
3999:	learn: 144.1780998	total: 3m 2s	remaining: 0us


In [7]:
def CaratClassify(val):
    """Return the weight class of a diamond for a carat value.

    "Light" for values below 0.5, "Medium" for values below 1.5 and "Heavy"
    for everything else (including values that compare false against both
    limits, such as NaN).
    """
    # Exclusive upper limits, checked in ascending order.
    upper_limits = ((0.5, "Light"), (1.5, "Medium"))
    for limit, label in upper_limits:
        if val < limit:
            return label
    return "Heavy"
    
# Subtask 1: weight class of every test diamond, derived from its carat.
carat_values = test['carat']
task1 = carat_values.apply(CaratClassify)

# 5. Submission

In [8]:
def _subtask_frame(subtask_id, sample_ids, answers):
    """Return the long-format submission rows for a single subtask.

    Every row carries the same ``subtask_id``; ``sample_ids`` and ``answers``
    supply one id / one answer per test sample.
    """
    return pd.DataFrame({
        "subtaskID": subtask_id,
        "datapointID": sample_ids,
        "answer": answers,
    })


# One frame per subtask, all built by the same helper instead of four
# copy-pasted constructors.
sample_ids = test['SampleID']
df_task1 = _subtask_frame(1, sample_ids, task1)               # carat class
df_task2 = _subtask_frame(2, sample_ids, test['proportion'])  # depth / table
df_task3 = _subtask_frame(3, sample_ids, test['volume'])      # x * y * z
df_task4 = _subtask_frame(4, sample_ids, prediction)          # predicted price

# ignore_index gives the stacked frame a clean 0..n-1 index; without it every
# index label would appear four times (once per subtask).
submission = pd.concat([df_task1, df_task2, df_task3, df_task4], ignore_index=True)
submission.head()

Unnamed: 0,subtaskID,datapointID,answer
0,1,1389,Light
1,1,50053,Medium
2,1,41646,Light
3,1,42378,Light
4,1,17245,Heavy


In [9]:
# Persist the stacked answers; the index carries no information, so omit it.
submission_path = "submission.csv"
submission.to_csv(submission_path, index=False)