# ENCODING

In [111]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder

import warnings
warnings.filterwarnings('ignore')

In [112]:
# Load the standardized diamonds data, using the first CSV column as the row index
df = pd.read_csv("../ficheros/diamons_estandarizados.csv", index_col= 0)
# Preview the first two rows
df.head(2)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,-0.746032,Ideal,E,SI2,-0.214286,-0.666667,326,-0.931818,-0.91954,-0.909091
1,-0.777778,Premium,E,SI1,-1.428571,1.333333,326,-0.965909,-1.0,-1.018182


### INFORMACIÓN SOBRE NUESTRAS VARIABLES
---

- **carat** is a measure of diamond weight. One carat is equivalent to 0.2 grams.

- **clarity** refers to how clear a diamond is. Diamonds often contain imperfections like cracks or mineral deposits. The fewer and less noticeable a diamond’s imperfections, the better its clarity. clarity contains 8 ordered levels, from “I1” (the worst) to “IF” (the best).

- **color** refers to the color of the diamond. Colorless diamonds are considered better than diamonds with a yellow tint. The dataset contains diamonds of 7 different colors, represented by different letters. “D” - “F” diamonds are considered colorless, while “G” - “J” diamonds have a very faint color. This grading is determined by the intensity of the color, and from least to most saturated are graded as faint, very light, light, fancy light, fancy, fancy intense, fancy vivid, fancy dark, or fancy deep. The deeper and more intense the color, the more expensive the diamond will be.

    *Helpful Hint:* Diamond prices generally decline as the colour grade moves alphabetically from D to J. For example, a diamond with a G colour grade is less expensive than a diamond with a D colour grade.

- **cut** refers to how a rough diamond is shaped into a finished diamond. Better cuts create more symmetrical and luminous diamonds. cut has 5 ordered levels: “Fair,” “Good,” “Very Good,” “Premium,” “Ideal.”

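- **x**, **y**, and **z** are the diamond’s length, width, and depth in millimeters. **depth** and **table** are proportions expressed as percentages: the total depth relative to the average of x and y, and the width of the top facet relative to the diamond’s widest point.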

Nuestras columnas tienen orden y queremos pasarle nosotras una lista con el orden según lo que hemos investigado, por lo que usaremos map o Ordinal Encoding.

In [113]:
df.cut.unique() # Per the information we found, cut grades from most to least expensive: "Ideal", "Premium", "Very Good", "Good", "Fair"

array(['Ideal', 'Premium', 'Good', 'Very Good', 'Fair'], dtype=object)

In [114]:
# Encode `cut` with the map method.
# List the grades from worst to best; each grade's position in the list
# becomes the integer it is mapped to (keys = categories, values = codes).
dict_cut = {
    nivel: rango
    for rango, nivel in enumerate(["Fair", "Good", "Very Good", "Premium", "Ideal"])
}

In [115]:
# Replace each cut label with its integer rank from dict_cut.
# Gotcha: values that are not keys of dict_cut become NaN, so running this cell a
# second time (when the column already holds integers) would wipe the column.
df["cut"] = df["cut"].map(dict_cut)

In [116]:
# Spot-check six random rows: `cut` should now hold integer codes
df.sample(6)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
14010,-0.619048,3,G,VS2,-2.142857,1.666667,605,-0.670455,-0.678161,-0.763636
17534,0.984127,4,F,SI1,-0.142857,-0.333333,7079,0.863636,0.844828,0.845455
21167,1.269841,3,F,SI2,-0.285714,1.0,9374,0.994318,0.977011,0.963636
19609,1.126984,4,J,VS1,-0.142857,-0.333333,8275,0.909091,0.942529,0.918182
5397,0.238095,4,D,VS2,0.071429,-0.666667,3821,0.272727,0.321839,0.318182
33158,-0.634921,1,G,IF,1.428571,-0.666667,956,-0.744318,-0.770115,-0.645455


In [117]:
# Do the same for the color variable
df.color.unique() # The order from cheapest to most expensive would be: J, I, H, G, F, E, D

array(['E', 'I', 'J', 'H', 'F', 'G', 'D'], dtype=object)

In [118]:
# This time the encoding is done with Ordinal Encoding.
# The encoder receives the grades ordered from lowest to highest importance:
# J, I, H, G, F, E, D (one single-letter grade per list element).
orden_color = list("JIHGFED")

In [119]:
# Helper that applies Ordinal Encoding to a single column
def ordinal_encoder(dataf, columna, orden_valores):
    """Encode one categorical column as integer ranks with OrdinalEncoder.

    Parameters
    ----------
    dataf : pandas.DataFrame
        Frame that holds the column. It is modified in place.
    columna : str
        Name of the column to encode.
    orden_valores : list
        Categories ordered from lowest to highest rank; the category at
        position ``i`` is encoded as the integer ``i``.

    Returns
    -------
    pandas.DataFrame
        The same ``dataf`` object, with ``columna`` replaced by its integer codes.

    Raises
    ------
    ValueError
        Raised by the encoder (default ``handle_unknown='error'``) when the
        column contains a value that is not listed in ``orden_valores``.
    """
    # A single ordered category list, because exactly one column is encoded
    ordinal = OrdinalEncoder(categories=[orden_valores], dtype=int)
    transformados = ordinal.fit_transform(dataf[[columna]])

    # Assign the flattened array positionally. Wrapping the result in a new
    # DataFrame (default 0..n-1 index) would make pandas align on index labels,
    # silently producing NaN or shifted codes when `dataf` has any other index.
    dataf[columna] = transformados.ravel()

    return dataf

In [120]:
# Encode `color` with the ranking in orden_color; the helper returns the same frame it modified
df = ordinal_encoder(df, "color", orden_color)
# Check that the first rows now carry integer color codes
df.head(2)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,-0.746032,4,5,SI2,-0.214286,-0.666667,326,-0.931818,-0.91954,-0.909091
1,-0.777778,3,5,SI1,-1.428571,1.333333,326,-0.965909,-1.0,-1.018182


In [121]:
# Spot-check six random rows: `cut` and `color` should both be integer codes
df.sample(6)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
11165,0.507937,3,2,VS2,-1.285714,0.333333,4958,0.551136,0.528736,0.445455
46839,-0.31746,2,5,VVS2,0.428571,-0.666667,2275,-0.289773,-0.270115,-0.227273
27707,-0.428571,2,6,SI1,0.785714,-0.333333,760,-0.460227,-0.431034,-0.372727
39118,-0.31746,2,3,VS2,0.142857,0.0,1351,-0.3125,-0.264368,-0.254545
37095,-0.380952,0,6,VS1,3.928571,1.333333,1166,-0.443182,-0.5,-0.2
6023,-0.603175,3,5,SI2,0.428571,0.333333,576,-0.676136,-0.706897,-0.636364


In [122]:
df.clarity.unique() # Per the information gathered, clarity grades ordered by price (lowest to highest): I1, SI2, SI1, VS2, VS1, VVS2, VVS1, IF

array(['SI2', 'SI1', 'VS1', 'VS2', 'VVS2', 'VVS1', 'I1', 'IF'],
      dtype=object)

In [123]:
# Use the map method again.
# Clarity grades listed from worst to best; each grade's position in the list
# is the integer code it will be mapped to.
dict_clarity = {
    grado: posicion
    for posicion, grado in enumerate(
        ["I1", "SI2", "SI1", "VS2", "VS1", "VVS2", "VVS1", "IF"]
    )
}

In [124]:
# Replace each clarity label with its integer rank from dict_clarity.
# Values that are not keys of dict_clarity become NaN, so this cell must not be re-run on an encoded column.
df["clarity"] = df["clarity"].map(dict_clarity)

In [125]:
# Spot-check six random rows: `cut`, `color` and `clarity` should all be integer codes
df.sample(6)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
373,0.174603,2,4,2,0.928571,0.666667,2809,0.147727,0.12069,0.218182
36519,-0.460317,4,3,4,-0.5,-0.666667,1121,-0.454545,-0.431034,-0.445455
24079,-0.666667,4,5,5,0.214286,-0.333333,646,-0.795455,-0.787356,-0.745455
39962,-0.15873,3,3,1,-0.714286,0.333333,1433,-0.056818,-0.08046,-0.1
10358,-0.634921,0,4,4,-0.071429,3.0,593,-0.761364,-0.729885,-0.718182
36338,0.0,3,1,0,0.357143,0.333333,1107,0.056818,-0.017241,0.072727


In [126]:
# Save the encoded standardized frame; the row index is written as the first CSV column by default
df.to_csv("../ficheros/diamons_estand_enconding.csv")

Hacemos lo mismo para el csv sin estandarizar

In [136]:
# Load the non-standardized diamonds data, then discard the loaded index in favour of a fresh 0..n-1 range
df2= pd.read_csv("../ficheros/diamons_01.csv", index_col = 0).reset_index(drop = True)
# Preview the first two rows
df2.head(2)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31


In [137]:
# Reuse the same rank dictionaries for cut and clarity.
# Values missing from a dictionary become NaN, so this cell must not be re-run on encoded columns.
df2["cut"] = df2["cut"].map(dict_cut)
df2["clarity"] = df2["clarity"].map(dict_clarity)

In [138]:
# Check the first rows: `cut` and `clarity` are encoded, `color` is still a letter grade
df2.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,4,E,1,61.5,55.0,326,3.95,3.98,2.43
1,0.21,3,E,2,59.8,61.0,326,3.89,3.84,2.31
2,0.23,1,E,4,56.9,65.0,327,4.05,4.07,2.31
3,0.29,3,I,3,62.4,58.0,334,4.2,4.23,2.63
4,0.31,1,J,1,63.3,58.0,335,4.34,4.35,2.75


In [139]:
# Encode `color` with the ranking in orden_color; the helper returns the same frame it modified
df2 = ordinal_encoder(df2, "color", orden_color)
# Check that the first rows now carry integer color codes
df2.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,4,5,1,61.5,55.0,326,3.95,3.98,2.43
1,0.21,3,5,2,59.8,61.0,326,3.89,3.84,2.31
2,0.23,1,5,4,56.9,65.0,327,4.05,4.07,2.31
3,0.29,3,1,3,62.4,58.0,334,4.2,4.23,2.63
4,0.31,1,0,1,63.3,58.0,335,4.34,4.35,2.75


In [140]:
# Spot-check six random rows of the fully encoded non-standardized frame
df2.sample(6)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
48953,0.62,4,3,5,61.8,57.0,2548,5.44,5.47,3.37
47158,0.7,2,4,2,61.3,58.0,2312,5.7,5.72,3.5
44267,0.68,2,5,2,58.2,60.0,1918,5.75,5.79,3.36
26950,0.31,4,6,3,59.0,57.0,734,4.44,4.48,2.63
15492,1.09,4,1,4,61.4,56.0,6225,6.61,6.66,4.07
18576,1.02,1,5,4,63.9,56.0,7602,6.33,6.38,4.06


In [142]:
# Save the encoded non-standardized frame; the row index is written as the first CSV column by default
df2.to_csv("../ficheros/diamons_enconding.csv")