## Imports

In [None]:
import warnings
import duckdb
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.neighbors import NearestNeighbors
import umap.umap_ as umap

## Functions

In [20]:
def encoder(df):
    """Return a copy of ``df`` with every object-dtype column label-encoded.

    A fresh ``LabelEncoder`` is fitted on each object column on every call,
    so the integer codes depend on the values present in that particular
    frame. Frames encoded by separate calls only share the same mapping when
    their columns contain the same set of values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` in which object-dtype columns hold integer codes and
        all other columns are unchanged.
    """
    encoded = df.copy()

    # Decide which columns to encode from the untouched input frame.
    object_columns = [name for name in df.columns if df[name].dtype == 'object']
    for name in object_columns:
        encoded[name] = LabelEncoder().fit_transform(df[name])
    return encoded

def remove_duplicates(df):
    """Return ``df`` without fully duplicated rows.

    Uses ``DataFrame.drop_duplicates`` with its defaults: the first occurrence
    of each row is kept and the surviving rows keep their original index
    labels. The input frame is not modified.
    """
    return df.drop_duplicates()

## Import data

In [26]:
# Open the DuckDB database file that holds the training tables
con = duckdb.connect("../data/train/diamonds_train.db")

# Join the properties table with its cut/color/clarity/city lookups and with
# the transactional and dimensions tables to build one flat feature + price table
query_full = """
SELECT
    --tra.index_id,
    cut.cut,
    col.color,
    cla.clarity,
    tra.price,
    cit.city,
    tra.carat,
    dim.depth,
    dim.table,
    dim.x,
    dim.y,
    dim.z
FROM diamonds_properties AS pro
JOIN diamonds_cut AS cut ON pro.cut_id = cut.cut_id
JOIN diamonds_color AS col ON pro.color_id = col.color_id
JOIN diamonds_clarity AS cla ON pro.clarity_id = cla.clarity_id
JOIN diamonds_transactional as tra ON pro.index_id = tra.index_id
JOIN diamonds_city AS cit ON tra.city_id = cit.city_id
JOIN diamonds_dimensions AS dim ON pro.index_id = dim.index_id
"""

# Run the query and label-encode the object-dtype columns of the result
diamond_train_df = encoder(con.execute(query_full).df())

# Separate the target (price) from the feature columns
y_train = diamond_train_df['price']
diamond_train_df_2 = diamond_train_df.drop(columns='price')
diamond_train_df_2.head()

Unnamed: 0,cut,color,clarity,city,carat,depth,table,x,y,z
0,3,6,5,2,1.21,62.4,58.0,6.83,6.79,4.25
1,4,4,5,3,0.32,63.0,57.0,4.35,4.38,2.75
2,0,3,4,4,0.71,65.5,55.0,5.62,5.53,3.65
3,1,0,2,3,0.41,63.8,56.0,4.68,4.72,3.0
4,2,3,2,2,1.02,60.5,59.0,6.55,6.51,3.95


In [15]:
# Feature columns in the same order as the training feature frame
column_order = ['cut', 'color', 'clarity', 'city', 'carat', 'depth', 'table', 'x', 'y', 'z']

# Load the test set, label-encode its object-dtype columns, drop the row
# identifier and align the column order with the training features.
# NOTE(review): encoder() fits new encoders on the test data, so the codes
# match the training codes only if both sets contain the same categories.
diamond_test_df = (
    pd.read_csv("../data/test/diamonds_test.csv")
    .pipe(encoder)
    .drop(columns='id')
    .loc[:, column_order]
)
diamond_test_df.head()

Unnamed: 0,cut,color,clarity,city,carat,depth,table,x,y,z
0,4,2,2,0,0.79,62.7,60.0,5.82,5.89,3.67
1,2,6,4,10,1.2,61.0,57.0,6.81,6.89,4.18
2,3,4,2,3,1.57,62.2,61.0,7.38,7.32,4.57
3,4,2,2,3,0.9,63.8,54.0,6.09,6.13,3.9
4,4,2,4,0,0.5,62.9,58.0,5.05,5.09,3.19


## KNN
Find the nearest neighbors with a KNN model.

In [56]:
# Standardise the features (statistics are learned on the training set only)
# so every column contributes on a comparable scale to the distance.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(diamond_train_df_2)
X_test_scaled = scaler.transform(diamond_test_df)

# For each diamond in diamond_test_df, find the most similar diamonds in the
# training data.
k_neighbors = 1  # number of nearest neighbours retrieved per test diamond
neigh = NearestNeighbors(n_neighbors=k_neighbors)
neigh.fit(X_train_scaled)
distances, indices = neigh.kneighbors(X_test_scaled)

# `indices` contains row positions in the fitted training matrix, so both the
# features and the prices are looked up positionally with .iloc (label-based
# .loc would only agree while the training index is a default RangeIndex).
neighbor_positions = indices.ravel()

# Build the result in one chain: matched feature rows, a fresh unique index,
# and the matching prices as the last column. Creating a new frame instead of
# assigning a column onto a slice avoids pandas' SettingWithCopyWarning, so no
# blanket warning suppression is needed in this cell.
similar_diamonds_df = (
    diamond_train_df_2.iloc[neighbor_positions]
    .reset_index(drop=True)
    .assign(price=diamond_train_df['price'].iloc[neighbor_positions].to_numpy())
)

In [57]:
# Show the matched training rows together with their prices
similar_diamonds_df

Unnamed: 0,cut,color,clarity,city,carat,depth,table,x,y,z,price
0,4,2,2,0,0.90,62.6,60.0,6.10,6.14,3.83,3950
1,2,6,4,10,1.14,61.5,57.0,6.70,6.73,4.13,5392
2,3,4,2,3,1.52,62.0,60.0,7.36,7.28,4.54,7559
3,4,2,2,3,0.90,63.3,55.0,6.05,6.08,3.84,4004
4,4,2,4,1,0.51,63.3,57.0,5.06,5.09,3.21,1627
...,...,...,...,...,...,...,...,...,...,...,...
13480,2,1,2,0,0.57,61.9,56.0,5.32,5.35,3.30,1608
13481,2,5,5,8,0.73,62.1,54.0,5.79,5.80,3.60,2327
13482,2,2,4,12,0.70,61.5,55.0,5.73,5.75,3.53,3319
13483,4,2,3,10,0.71,59.0,58.0,5.84,5.89,3.46,2098


In [52]:
# Drop repeated rows: the same training diamond can be the nearest neighbour
# of more than one test diamond, so it can appear several times
df_new = remove_duplicates(similar_diamonds_df)

In [53]:
# (rows, columns) left after de-duplication
# NOTE(review): the saved output (24527 rows) is larger than what a
# k_neighbors = 1 search can return, and this cell's execution count is lower
# than the KNN cell's, so the output looks stale; re-run top to bottom to confirm.
df_new.shape

(24527, 11)

In [54]:
# Preview the first rows of the de-duplicated frame
df_new.head()

Unnamed: 0,cut,color,clarity,city,carat,depth,table,x,y,z,price
0,4,2,2,0,0.9,62.6,60.0,6.1,6.14,3.83,3950
1,4,2,2,1,0.81,63.1,59.0,5.85,5.79,3.67,2809
2,4,3,2,0,0.81,62.5,60.0,5.89,5.94,3.69,2806
3,2,6,4,10,1.14,61.5,57.0,6.7,6.73,4.13,5392
4,2,6,4,10,1.33,61.3,57.0,7.11,7.08,4.35,6118


In [55]:
# Save the de-duplicated nearest-neighbour rows without the index column
# NOTE(review): the file name says "3knn" while the KNN cell sets
# k_neighbors = 1; confirm which value this export is meant to reflect.
df_new.to_csv('diamond_train_df_Nearest_all_features_3knn.csv', index=False)

## UMAP and KNN
First, a UMAP model reduces the dimensionality of the features; then a KNN model finds the training diamonds most similar to each diamond in the `diamond_test_df` dataset.

In [71]:
# Standardise the features (statistics are learned on the training set only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(diamond_train_df_2)
X_test_scaled = scaler.transform(diamond_test_df)

# UMAP for dimensionality reduction. The algorithm is stochastic, so a fixed
# random_state is set to make the embedding reproducible across runs.
umap_model = umap.UMAP(n_neighbors=5, min_dist=0.1, metric='euclidean', random_state=42)

# Silence library warnings only while UMAP runs instead of disabling every
# warning for the rest of the session.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    # fit_transform returns the embedding learned for the training rows;
    # the test rows are then projected into that same space.
    X_train_umap = umap_model.fit_transform(X_train_scaled)
    X_test_umap = umap_model.transform(X_test_scaled)

In [75]:
# For each diamond in diamond_test_df, find the most similar diamonds in the
# training data, measuring distance in the UMAP embedding.
k_neighbors = 1  # number of nearest neighbours retrieved per test diamond
neigh = NearestNeighbors(n_neighbors=k_neighbors)
neigh.fit(X_train_umap)
distances, indices = neigh.kneighbors(X_test_umap)

# `indices` contains row positions in the fitted training matrix, so both the
# features and the prices are looked up positionally with .iloc (label-based
# .loc would only agree while the training index is a default RangeIndex).
neighbor_positions = indices.ravel()

# Build the result in one chain: matched feature rows, a fresh unique index,
# and the matching prices as the last column. Creating a new frame instead of
# assigning a column onto a slice avoids pandas' SettingWithCopyWarning.
similar_diamonds_df = (
    diamond_train_df_2.iloc[neighbor_positions]
    .reset_index(drop=True)
    .assign(price=diamond_train_df['price'].iloc[neighbor_positions].to_numpy())
)

In [78]:
# Save the UMAP-based matches without the index column.
# NOTE(review): this export is written before remove_duplicates() is applied,
# whereas the plain-KNN export saves the de-duplicated frame; confirm that the
# difference is intended.
similar_diamonds_df.to_csv('diamond_train_df_Nearest_all_features_1knn_umap.csv', index=False)

# Only the last expression of a cell is rendered, so the shape is printed
# explicitly and the preview is left as the final expression.
print(similar_diamonds_df.shape)
similar_diamonds_df.head()

In [69]:
# Drop repeated rows: the same training diamond can be the nearest neighbour
# of more than one test diamond, so it can appear several times
df_new = remove_duplicates(similar_diamonds_df)

In [74]:
# (rows, columns) left after de-duplication
# NOTE(review): the saved output (24817 rows) is larger than what a
# k_neighbors = 1 search can return, and this cell's execution count is lower
# than that of the cell building similar_diamonds_df, so the output looks
# stale; re-run top to bottom to confirm.
df_new.shape

(24817, 11)