# 1. Problem Information
- **Name:** [**Famous Paintings**](https://platform.olimpiada-ai.ro/en/problems/81)
- **Date:** 13/02/2026
- **Type:** Regression

# 2. Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.compose import make_column_selector,make_column_transformer
from sklearn.model_selection import cross_val_score,RandomizedSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression,SGDRegressor,Ridge,PoissonRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler,OrdinalEncoder,OneHotEncoder
from sklearn.cluster import KMeans

# 3. Data preparation

In [2]:
# Shared clustering pipeline: standardise the features, then assign each row to
# one of 5 KMeans clusters. It is fitted and queried by ClusterInfo below.
cluster_info_model = make_pipeline(StandardScaler(),KMeans(n_clusters=5,n_init=40,random_state=0))

def ClusterInfo(df):
    """Return a cluster label for every row of ``df``.

    The labels come from the module-level ``cluster_info_model`` pipeline,
    applied to nine numeric style/appearance columns of ``df``.

    A frame that contains a ``target_price`` column is treated as the training
    frame: the pipeline is (re)fitted on it before predicting. Any other frame
    is only predicted on, so the training frame must be processed first.
    """
    feature_names = [
        "painter_style_score",
        "fake_style_score",
        "complexity",
        "stroke_density",
        "colorfulness",
        "brightness",
        "contrast",
        "num_colors",
        "complexity_x_stroke",
    ]
    features = df[feature_names]

    # The presence of the target column is the only train/test signal used here.
    is_training_frame = 'target_price' in df.columns
    if is_training_frame:
        cluster_info_model.fit(features)

    return cluster_info_model.predict(features)

    
def ProcessData(df):
    """Add the engineered columns to ``df`` in place (returns ``None``).

    Columns written:
      * ``aspect_ratio``  - first number of the "<a>x<b>" ``canvas_size`` string
        divided by the second.
      * ``canvas_size``   - overwritten with the product of those two numbers.
      * ``image_quality`` - 'low'/'medium'/'high' mapped to 0/1/2 (any other
        value becomes NaN).
      * ``AAS``           - 'Autentic' when the rule-based score is at least 5,
        otherwise 'Incert'.
      * ``cluster_group`` - labels returned by ``ClusterInfo(df)``.
    """
    def split_dimensions(raw_size):
        # "<a>x<b>" -> (a, b) as integers
        parts = str(raw_size).split("x")
        return int(parts[0]), int(parts[1])

    dimensions = df['canvas_size'].apply(split_dimensions)
    df['aspect_ratio'] = dimensions.apply(lambda pair: pair[0] / pair[1])
    df['canvas_size'] = dimensions.apply(lambda pair: pair[0] * pair[1])

    quality_codes = {'low': 0, 'medium': 1, 'high': 2}
    df['image_quality'] = df['image_quality'].map(quality_codes)

    # Individual 0/1 indicators that feed the score.
    dense_strokes = df['stroke_density'].gt(0.7).astype(int)
    high_complexity = df['complexity'].gt(0.65).astype(int)
    gold_leaf = df['uses_gold_leaf'].eq(True).astype(int)
    signed = df['has_signature'].eq(True).astype(int)
    rich_palette = (df['num_colors'].gt(65) & df['colorfulness'].gt(0.7)).astype(int)
    poor_lighting = (
        df['contrast'].lt(0.4) | df['brightness'].lt(0.45) | df['brightness'].gt(0.75)
    ).astype(int)

    # Double weight for strokes, complexity and palette; lighting is a penalty.
    score = 2 * (dense_strokes + high_complexity + rich_palette) + gold_leaf + signed - poor_lighting
    df['AAS'] = (score >= 5).map({True: 'Autentic', False: 'Incert'})

    df['cluster_group'] = ClusterInfo(df)
    
# Training frame first: ProcessData fits the clustering pipeline on the frame
# that carries 'target_price', and the test frame relies on that fit.
train = pd.read_csv("data/train.csv")
ProcessData(train)

# Move the target to the last position so the features form one contiguous slice.
ordered_columns = [name for name in train.columns if name != 'target_price']
ordered_columns.append('target_price')
train = train[ordered_columns]

test = pd.read_csv("data/test.csv")
ProcessData(test)

print(train.shape)
train.head(5)

(960, 27)


Unnamed: 0,SampleID,canvas_size,is_oil_painting,brush_type,num_colors,colorfulness,complexity,brightness,contrast,stroke_density,...,auction_house,image_quality,brightness_log,complexity_x_stroke,fake_style_score,painter_style_score,aspect_ratio,AAS,cluster_group,target_price
0,332,3000,True,medium,71,0.61624,0.755582,0.647338,0.587923,0.702118,...,Online,0,0.499161,0.530508,0.349718,0.536437,1.2,Autentic,0,50800
1,410,7200,True,medium,60,0.660715,0.474923,0.538822,0.599076,0.528112,...,Online,0,0.431017,0.250813,0.258722,0.163906,0.888889,Incert,1,37400
2,77,4000,True,fine,64,0.684877,0.380591,0.608029,0.500152,0.508521,...,Sothebys,0,0.475009,0.193539,0.797662,0.137732,1.6,Incert,1,35500
3,869,4000,True,medium,56,0.427938,0.581636,0.562086,0.483896,0.550152,...,Local,0,0.446022,0.319988,0.569981,0.394542,1.6,Incert,3,43200
4,139,10400,True,fine,55,0.481406,0.62978,0.476093,0.493429,0.68171,...,Local,1,0.389398,0.429328,0.723147,0.321858,0.615385,Incert,3,54500


# 4. Models

In [3]:
# Features are everything between the first column (SampleID in the preview
# above) and the last column (target_price, moved there during preparation).
X = train.iloc[:,1:-1]
Y = train['target_price']

# One-hot encode the object-dtype columns and standardise everything else.
# handle_unknown='ignore' encodes a category unseen during fit as all zeros
# instead of raising, so a rare level missing from a CV training fold (or
# appearing only in the test set) cannot crash fit/predict.
transformer = make_column_transformer(
    (OneHotEncoder(handle_unknown='ignore'), make_column_selector(dtype_include=object)),
    remainder=StandardScaler(),
)
pipeline = make_pipeline(transformer, Ridge(random_state=0))

# NOTE(review): Ridge's `tol` is ignored by the direct solvers ('svd',
# 'cholesky'); with the default solver='auto' it may have no effect - confirm.
params = {
    'ridge__alpha': [1, 0.8, 0.5, 0.2, 0.1],
    'ridge__tol': [0.0001, 0.001, 0.01, 0.1],
}

# RandomizedSearchCV samples n_iter (default 10) of the 20 combinations;
# seeding it makes the sampled candidates, and hence the reported best
# score/params, reproducible across kernel restarts.
grid_search = RandomizedSearchCV(
    pipeline,
    params,
    cv=3,
    scoring='neg_mean_absolute_error',
    random_state=0,
)
grid_search.fit(X, Y)

print(grid_search.best_score_)
print(grid_search.best_params_)

-561.448123879838
{'ridge__tol': 0.01, 'ridge__alpha': 0.1}


In [4]:
# Pipeline refitted on the full training data with the best sampled parameters.
best_pipeline = grid_search.best_estimator_
# Select the test features by the training feature names rather than by
# position, so the columns handed to the pipeline are exactly those it was
# fitted on even if the test file's column layout differs from train's.
predictions = best_pipeline.predict(test[X.columns])

# 5. Submission

In [5]:
# Task 2: cluster the test paintings with a pipeline fitted on the test set
# itself (separate from the train-fitted model behind 'cluster_group').
cluster_cols = [
    "painter_style_score",
    "fake_style_score",
    "complexity",
    "stroke_density",
    "colorfulness",
    "brightness",
    "contrast",
    "num_colors",
    "complexity_x_stroke",
]
cluster_task2_model = make_pipeline(
    StandardScaler(),
    KMeans(n_clusters=5, n_init=40, random_state=0),
)

task2_features = test[cluster_cols]
task2 = cluster_task2_model.fit_predict(task2_features)

In [6]:
def _subtask_frame(subtask, answers):
    """Build the SampleID / subtaskID / Answer table for one subtask."""
    return pd.DataFrame({
        "SampleID": test.SampleID,
        "subtaskID": [subtask] * len(test.SampleID),
        "Answer": answers
    })


# Task1: rule-based AAS label, Task2: test-set cluster id, Task3: predicted price.
df_task1 = _subtask_frame('Task1', test.AAS)
df_task2 = _subtask_frame('Task2', task2)
df_task3 = _subtask_frame('Task3', predictions)

# Stack the three answer tables one below the other.
submission = pd.concat([df_task1, df_task2, df_task3])
submission.head()

Unnamed: 0,SampleID,subtaskID,Answer
0,1179,Task1,Incert
1,866,Task1,Autentic
2,102,Task1,Incert
3,440,Task1,Incert
4,59,Task1,Incert


In [7]:
# Write the stacked answers to disk; index=False leaves the row index out of the file.
submission.to_csv("submission.csv", index=False)