Libraries and Dependencies

In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

import MyPipe as mp

Function definitions

In [None]:
def preprocess_data(raw_data: pd.DataFrame):
    """Filter out invalid diamonds and one-hot encode the categorical columns.

    A row is kept only when the product ``x * y * z`` is non-zero and
    ``price`` is strictly positive. The ``depth``, ``table``, ``y`` and ``z``
    columns are then dropped, and ``cut``, ``color`` and ``clarity`` are
    replaced by dummy columns (the first level of each is dropped).

    Parameters
    ----------
    raw_data : pd.DataFrame
        Frame containing at least the columns ``x``, ``y``, ``z``, ``price``,
        ``depth``, ``table``, ``cut``, ``color`` and ``clarity``.

    Returns
    -------
    pd.DataFrame
        A new frame with the filtered rows and encoded columns; ``raw_data``
        itself is left untouched.
    """
    # The product is zero as soon as any single dimension is zero.
    nonzero_dims = (raw_data.x * raw_data.y * raw_data.z) != 0
    positive_price = raw_data.price > 0
    valid_rows = raw_data.loc[nonzero_dims & positive_price]

    # Columns the model does not use.
    reduced = valid_rows.drop(columns=['depth', 'table', 'y', 'z'])

    return pd.get_dummies(reduced, columns=['cut', 'color', 'clarity'], drop_first=True)

def split_data(data: pd.DataFrame, test_size=0.2, random_state=42, apply_ylog = False):
    """Split a preprocessed frame into train/test features and ``price`` targets.

    Parameters
    ----------
    data : pd.DataFrame
        Frame holding a ``price`` column (the target); every other column is
        used as a feature.
    test_size, random_state
        Forwarded unchanged to ``train_test_split``.
    apply_ylog : bool
        When truthy, both target splits are returned as ``np.log(price)``.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test)``.
    """
    features = data.drop(columns='price')
    target = data.price

    x_train, x_test, y_train, y_test = train_test_split(
        features, target, test_size=test_size, random_state=random_state
    )

    if not apply_ylog:
        return x_train, x_test, y_train, y_test

    # Only the targets are transformed; the features are returned as split.
    return x_train, x_test, np.log(y_train), np.log(y_test)
    
def plot_gof(y_true: pd.Series, y_pred: pd.Series):
    """Draw a goodness-of-fit plot of predicted against actual values.

    Each observation is drawn as a dot at ``(y_true, y_pred)``; a thick black
    ``y = x`` reference line is overlaid, so dots lying on the line are exact
    predictions. The figure is shown immediately and nothing is returned.

    Parameters
    ----------
    y_true : pd.Series
        Actual target values (x axis).
    y_pred : pd.Series
        Predicted target values (y axis).
    """
    axes = plt.gca()

    # Predictions as unconnected dots.
    axes.plot(y_true, y_pred, '.')
    # Identity line: plotting y_true against itself traces y = x.
    axes.plot(y_true, y_true, linewidth=3, color='black')

    axes.set_xlabel('Actual')
    axes.set_ylabel('Predicted')
    plt.show()

C1.1 Example of how to use the MyPipe class on the whole dataset (no data acquisition)

In [None]:
# Download the raw diamonds table from the xtream assignment repository on GitHub
diamonds = pd.read_csv("https://raw.githubusercontent.com/xtreamsrl/xtream-ai-assignment-engineer/main/datasets/diamonds/diamonds.csv")

# Filter invalid rows, drop unused columns and one-hot encode the categoricals,
# then split into train/test sets; apply_ylog=True makes the targets log(price)
data = preprocess_data(diamonds)
x_train, x_test, y_train, y_test = split_data(data, apply_ylog=True)

In [None]:
# Create the pipeline, hand it the preprocessed dataset and fit it on the training split
# NOTE(review): define_data is defined in MyPipe — presumably it stores the frame on the pipeline; confirm
my_pipeline = mp.MyPipe()
my_pipeline.define_data(data)
my_pipeline.fit(x_train,y_train)

# The targets were log-transformed by split_data(apply_ylog=True), so both the
# true values and the predictions are exponentiated before being passed to
# evaluate_performance
pred = my_pipeline.predict(x_test)
performance = my_pipeline.evaluate_performance(np.exp(y_test), np.exp(pred))

In [None]:
# Display the value returned by evaluate_performance for the fit above
performance

In [None]:
# Predicted vs. actual plot; np.exp undoes the log transform applied to the targets
plot_gof(np.exp(y_test), np.exp(pred))

C1.2 Simulate data acquisition

In [None]:
# Every (re)fitted pipeline is pickled into this directory as lin_<k>.pkl.
MODEL_DIR = '../data/models_history/linear_model'
# Make sure the target directory exists, in case dump() does not create
# missing directories itself (e.g. on a fresh checkout).
os.makedirs(MODEL_DIR, exist_ok=True)

half = len(data.index) // 2
data0 = data.iloc[:half] # Suppose we know half of the data at the beginning of the procedure
batch_size = 100 # Number of new diamonds in each batch of new data

# List of the data coming in batch at every update: consecutive positional
# slices of `batch_size` rows taken from the second half of the data; the
# last slice holds whatever rows remain (it may be shorter than `batch_size`).
data_new = [
    data.iloc[start:start + batch_size]
    for start in range(half, len(data.index), batch_size)
]

# Define the first model with half of the data
x_train, x_test, y_train, y_test = split_data(data0, apply_ylog=True)

my_pipeline = mp.MyPipe()
my_pipeline.define_data(data0)
my_pipeline.fit(x_train,y_train)

# Targets are log(price) (apply_ylog=True), so exponentiate before evaluating.
pred = my_pipeline.predict(x_test)
performance = my_pipeline.evaluate_performance(np.exp(y_test), np.exp(pred))
my_pipeline.dump(f'{MODEL_DIR}/lin_0.pkl') # save the pipeline to file

# Feed the remaining data one batch at a time: after each batch the pipeline's
# accumulated data is re-split, the model is refitted and evaluated, and a new
# snapshot is written to disk.
for n, current_data in enumerate(data_new):
    print(f'Batch {n+1} of {len(data_new)}')
    my_pipeline.augment_data(current_data)
    x_train, x_test, y_train, y_test = split_data(my_pipeline.data, apply_ylog=True)
    my_pipeline.fit(x_train, y_train)
    pred = my_pipeline.predict(x_test)
    performance = my_pipeline.evaluate_performance(np.exp(y_test), np.exp(pred))
    my_pipeline.dump(f'{MODEL_DIR}/lin_{n+1}.pkl') # save the pipeline to file
    

In [None]:
# NOTE(review): plot_history is defined in MyPipe; presumably it charts the
# performance recorded by the evaluations above, with a trend line — confirm
my_pipeline.plot_history(trendline=True)