Libraries and Dependencies

In [12]:
from typing import Optional
from datetime import datetime
import pandas as pd
import xgboost
import json
from io import StringIO

Definition of the API methods

In [13]:
# Load the saved XGBoost regressor once at module level; predict() reuses this object.
MODEL_PATH = '../data/state_of_the_art/xgb_opt.json'
model = xgboost.XGBRegressor()
model.load_model(MODEL_PATH)

def preprocess_data(raw_data: pd.DataFrame):
    """Filter the raw diamonds table and encode its grading columns.

    A row is kept only when the product ``x * y * z`` is non-zero and
    ``price`` is strictly positive (zero and negative prices are dropped).
    The ``cut``, ``color`` and ``clarity`` columns of the remaining rows are
    converted to ordered categoricals; values outside the listed levels
    become missing.

    Args:
        raw_data: Frame containing at least the columns ``x``, ``y``, ``z``,
            ``price``, ``cut``, ``color`` and ``clarity``.

    Returns:
        A filtered copy of ``raw_data``; the caller's frame is not modified.
    """
    nonzero_dimensions = (raw_data.x * raw_data.y * raw_data.z) != 0
    positive_price = raw_data.price > 0
    processed_data = raw_data.loc[nonzero_dimensions & positive_price].copy()

    # Level order defines the ordering of each categorical; keep it unchanged.
    grade_levels = {
        'cut': ['Fair', 'Good', 'Very Good', 'Ideal', 'Premium'],
        'color': ['D', 'E', 'F', 'G', 'H', 'I', 'J'],
        'clarity': ['IF', 'VVS1', 'VVS2', 'VS1', 'VS2', 'SI1', 'SI2', 'I1'],
    }
    for column, levels in grade_levels.items():
        processed_data[column] = pd.Categorical(
            processed_data[column], categories=levels, ordered=True
        )
    return processed_data

def preprocess_input(raw_input: pd.DataFrame):
    """Encode the grading columns of a request frame as ordered categoricals.

    No rows are filtered. The ``cut``, ``color`` and ``clarity`` columns are
    replaced by ordered categoricals using the fixed level lists below;
    values outside those lists become missing.

    Args:
        raw_input: Frame containing at least the columns ``cut``, ``color``
            and ``clarity``.

    Returns:
        A copy of ``raw_input`` with the three columns converted; the
        caller's frame is not modified.
    """
    level_order = (
        ('cut', ['Fair', 'Good', 'Very Good', 'Ideal', 'Premium']),
        ('color', ['D', 'E', 'F', 'G', 'H', 'I', 'J']),
        ('clarity', ['IF', 'VVS1', 'VVS2', 'VS1', 'VS2', 'SI1', 'SI2', 'I1']),
    )
    processed_input = raw_input.copy()
    for name, levels in level_order:
        encoded = pd.Categorical(processed_input[name], categories=levels, ordered=True)
        processed_input[name] = encoded
    return processed_input

# Download the diamonds CSV and clean it in one step, so `dataset` is only ever
# bound to the preprocessed frame that find_similar() searches.
DATASET_URL = "https://raw.githubusercontent.com/xtreamsrl/xtream-ai-assignment-engineer/main/datasets/diamonds/diamonds.csv"
dataset = preprocess_data(pd.read_csv(DATASET_URL))

def predict(carat: float, cut: str, color: str, clarity: str, depth: float, table: float, x: float, y: float, z: float):
    """Run the loaded model on one diamond and record the call in the JSON log.

    The nine arguments are assembled into a single-row frame, passed through
    ``preprocess_input`` (which encodes ``cut``, ``color`` and ``clarity`` as
    ordered categoricals) and fed to the module-level ``model``.

    Args:
        carat, depth, table, x, y, z: Numeric features of the diamond.
        cut, color, clarity: Grade labels; a label outside the levels known
            to ``preprocess_input`` is encoded as missing rather than rejected.

    Returns:
        The model output for the diamond (presumably its price) as a float
        rounded to two decimals.

    Side effects:
        Appends an entry to ``../data/api_calls/call_log.json``, rewriting
        the whole file. A missing, empty or unparsable log is treated as an
        empty log, so a damaged file cannot make predictions fail.
    """
    diamond = {
        "carat": carat,
        "cut": cut,
        "color": color,
        "clarity": clarity,
        "depth": depth,
        "table": table,
        "x": x,
        "y": y,
        "z": z
    }
    # A one-element list of records builds a single-row frame.
    my_diamond = preprocess_input(pd.DataFrame([diamond]))
    value = round(float(model.predict(my_diamond)[0]), 2)

    log_path = '../data/api_calls/call_log.json'
    try:
        with open(log_path, 'r') as read_file:
            call_log = json.load(read_file)
    except (IOError, ValueError):
        # IOError: the log does not exist yet or cannot be read.
        # ValueError: covers json.JSONDecodeError, e.g. a file left empty or
        # truncated by an interrupted write (opening with 'w' truncates first).
        call_log = []
    if not isinstance(call_log, list):
        # Anything other than a JSON array cannot be appended to; start over.
        call_log = []
    call_log.append({"Date-Time": datetime.now().strftime("%Y/%m/%d, %H:%M:%S"), "method": "predict", "request": diamond, "response": value})
    with open(log_path, 'w') as fout:
        json.dump(call_log, fout)
    return value

def find_similar(n: int, carat: float, cut: str, color: str, clarity: str, depth: Optional[float] = None, table: Optional[float] = None, x: Optional[float] = None, y: Optional[float] = None, z: Optional[float] = None):
    """Return the ``n`` dataset rows closest in carat among same-grade diamonds.

    Rows of the module-level ``dataset`` whose ``cut``, ``color`` and
    ``clarity`` equal the requested values are ranked by the absolute
    difference between their carat and ``carat``; the first ``n`` are returned.

    Args:
        n: Number of rows to return (passed to ``DataFrame.head``).
        carat: Target weight the candidates are ranked against.
        cut, color, clarity: Grades a candidate must match exactly.
        depth, table, x, y, z: Accepted but not used by the search or
            written to the log.

    Returns:
        The selected rows serialised with ``DataFrame.to_json()`` (a JSON
        string keyed by column, then by the original row index).

    Side effects:
        Appends an entry to ``../data/api_calls/call_log.json``, rewriting
        the whole file. A missing, empty or unparsable log is treated as an
        empty log, so a damaged file cannot make lookups fail.
    """
    query = {
        "n": n,
        "carat": carat,
        "cut": cut,
        "color": color,
        "clarity": clarity
    }
    same_grade = (dataset.cut == cut) & (dataset.color == color) & (dataset.clarity == clarity)
    result = dataset.loc[same_grade].copy()
    # Temporary ranking column; dropped again before serialising.
    result['diff'] = (result['carat'] - carat).abs()
    result_sorted = result.sort_values(by='diff')
    response = result_sorted.drop(columns=['diff']).head(n).to_json()

    log_path = '../data/api_calls/call_log.json'
    try:
        with open(log_path, 'r') as read_file:
            call_log = json.load(read_file)
    except (IOError, ValueError):
        # IOError: the log does not exist yet or cannot be read.
        # ValueError: covers json.JSONDecodeError, e.g. a file left empty or
        # truncated by an interrupted write (opening with 'w' truncates first).
        call_log = []
    if not isinstance(call_log, list):
        # Anything other than a JSON array cannot be appended to; start over.
        call_log = []
    call_log.append({"Date-Time": datetime.now().strftime("%Y/%m/%d, %H:%M:%S"), "method": "find-similar", "request": query, "response": response})
    with open(log_path, 'w') as fout:
        json.dump(call_log, fout)
    return response

Test the methods

In [14]:
# Sample diamond used to exercise predict(); one variable per model feature.
my_carat = 1.10
my_cut = "Ideal"
my_color = "H"
my_clarity = "SI2"
my_depth = 62.0
my_table = 55.0
my_x = 6.61
my_y = 6.65
my_z = 4.11

# Arguments are positional, in the order of predict()'s signature; the call also appends to the call log.
predict(my_carat, my_cut, my_color, my_clarity, my_depth, my_table, my_x, my_y, my_z)


4585.49

In [15]:
# Search criteria: grades must match exactly, candidates are ranked by carat distance.
target_carat = 1.10
target_cut = "Ideal"
target_color = "H"
target_clarity = "SI2"
n = 10  # number of rows requested
result = find_similar(n, target_carat, target_cut, target_color, target_clarity)
# find_similar returns a JSON string, so wrap it in StringIO to read it back as a DataFrame for display.
pd.read_json(StringIO(result))

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1.1,Ideal,H,SI2,62.0,55,4733,6.61,6.65,4.11
2246,1.11,Ideal,H,SI2,62.0,54,4578,6.61,6.64,4.11
4954,1.11,Ideal,H,SI2,61.5,55,4883,6.65,6.69,4.1
3621,1.09,Ideal,H,SI2,61.5,55,4212,6.65,6.68,4.1
2632,1.07,Ideal,H,SI2,59.2,57,4314,6.7,6.64,3.95
4393,1.07,Ideal,H,SI2,62.4,55,4840,6.56,6.59,4.1
2044,1.06,Ideal,H,SI2,62.8,57,4452,6.57,6.49,4.1
2299,1.06,Ideal,H,SI2,62.8,57,4402,6.52,6.56,4.11
2682,1.05,Ideal,H,SI2,60.6,57,3234,6.59,6.55,3.98
3754,1.03,Ideal,H,SI2,60.6,56,4283,6.53,6.63,3.99


View the log entry for the most recent call

In [16]:
# The API functions write the log with json.dump as one JSON array, so parse
# the whole file at once. Parsing line by line only works while the array
# happens to sit on a single line.
with open('../data/api_calls/call_log.json') as f:
    call_log = json.load(f)
# Most recent entry (entries are appended in call order).
call_log[-1]

{'Date-Time': '2024/07/04, 15:34:30',
 'method': 'find-similar',
 'request': {'n': 10,
  'carat': 1.1,
  'cut': 'Ideal',
  'color': 'H',
  'clarity': 'SI2'},
 'response': '{"carat":{"0":1.1,"2246":1.11,"4954":1.11,"3621":1.09,"2632":1.07,"4393":1.07,"2044":1.06,"2299":1.06,"2682":1.05,"3754":1.03},"cut":{"0":"Ideal","2246":"Ideal","4954":"Ideal","3621":"Ideal","2632":"Ideal","4393":"Ideal","2044":"Ideal","2299":"Ideal","2682":"Ideal","3754":"Ideal"},"color":{"0":"H","2246":"H","4954":"H","3621":"H","2632":"H","4393":"H","2044":"H","2299":"H","2682":"H","3754":"H"},"clarity":{"0":"SI2","2246":"SI2","4954":"SI2","3621":"SI2","2632":"SI2","4393":"SI2","2044":"SI2","2299":"SI2","2682":"SI2","3754":"SI2"},"depth":{"0":62.0,"2246":62.0,"4954":61.5,"3621":61.5,"2632":59.2,"4393":62.4,"2044":62.8,"2299":62.8,"2682":60.6,"3754":60.6},"table":{"0":55.0,"2246":54.0,"4954":55.0,"3621":55.0,"2632":57.0,"4393":55.0,"2044":57.0,"2299":57.0,"2682":57.0,"3754":56.0},"price":{"0":4733,"2246":4578,"4954"