In [24]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder # for Label Encoding (not referenced in the cells visible here)
from sklearn.preprocessing import OneHotEncoder  # for One-Hot Encoding (not referenced in the cells visible here)
import matplotlib.pyplot as plt
import seaborn as sns
# Default size (width, height) applied to every matplotlib figure created afterwards.
plt.rcParams["figure.figsize"] = (10,8)
import warnings
# Silences every warning for the rest of the session; note that this also hides
# deprecation notices coming from pandas and the other libraries.
warnings.filterwarnings('ignore')

In [25]:
# Load the standardized diamonds data (without the z column, per the file name),
# using the first CSV column as the index, and preview the first two rows.
df = pd.read_csv(
    filepath_or_buffer="../ficheros/diamons_estand_sinz.csv",
    index_col=0,
)
df.head(n=2)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y
1,-0.777778,Ideal,E,SI2,-1.428571,1.333333,326,-0.965909,-1.0
2,-0.746032,Premium,E,SI1,-3.5,2.666667,326,-0.875,-0.867816


### INFORMACIÓN SOBRE NUESTRAS VARIABLES
---

- **carat** is a measure of diamond weight. One carat is equivalent to 0.2 grams.

- **clarity** refers to how clear a diamond is. Diamonds often contain imperfections like cracks or mineral deposits. The fewer and less noticeable a diamond’s imperfections, the better its clarity. clarity contains 8 ordered levels, from “I1” (the worst) to “IF” (the best).

- **color** refers to the color of the diamond. Colorless diamonds are considered better than diamonds with a yellow tint. The dataset contains diamonds of 7 different colors, represented by different letters. “D” - “F” diamonds are considered colorless, while “G” - “J” diamonds have a very faint color. For fancy colored diamonds, the grading is instead determined by the intensity of the color: from least to most saturated, they are graded as faint, very light, light, fancy light, fancy, fancy intense, fancy vivid, fancy dark, or fancy deep. In that case, the deeper and more intense the color, the more expensive the diamond will be.

    *Helpful Hint:* Diamond prices generally decline as the color grade moves down the alphabet (from D towards J). For example, a diamond with a G colour grade is less expensive than a diamond with a D colour grade.

- **cut** refers to how a rough diamond is shaped into a finished diamond. Better cuts create more symmetrical and luminous diamonds. cut has 5 ordered levels: “Fair,” “Good,” “Very Good,” “Premium,” “Ideal.”

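- **x**, **y**, and **z** are the diamond’s length, width, and depth in millimeters; **depth** and **table** are proportions of the diamond expressed as percentages (total depth relative to the mean of x and y, and the width of the top facet relative to the widest point).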

Vamos a escoger el método map para realizar el encoding porque queremos seleccionar nosotras el orden de las etiquetas según nuestra investigación

In [26]:
# According to the information we gathered, the order from most to least
# expensive is: "Ideal", "Premium", "Very Good", "Good", "Fair".
df["cut"].unique()

array(['Ideal', 'Premium', 'Good', 'Very Good', 'Fair'], dtype=object)

In [27]:
# Ordinal encoding for `cut`, to be applied with Series.map:
# labels are listed from lowest (0) to highest (4) rank.
dict_cut = {
    label: rank
    for rank, label in enumerate(["Fair", "Good", "Very Good", "Premium", "Ideal"])
}

In [28]:
# Add the ordinal code of each cut label as a new column, `cut_map`.
df = df.assign(cut_map=df["cut"].map(dict_cut))

In [29]:
# Show six random rows to eyeball the new cut_map column.
df.sample(n=6)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,cut_map
22630,-0.507937,Very Good,I,VS1,-0.285714,-0.333333,10682,-0.517045,-0.505747,2
4702,-0.666667,Ideal,H,I1,-1.214286,1.666667,3678,-0.761364,-0.752874,4
35637,-0.634921,Very Good,H,SI2,0.285714,-1.0,475,-0.715909,-0.706897,2
19181,0.952381,Good,G,SI2,-1.928571,1.0,7933,0.869318,0.913793,1
13198,0.936508,Very Good,H,SI1,0.285714,0.333333,5455,0.767045,0.810345,2
39730,-0.31746,Ideal,E,VS1,0.5,0.333333,1091,-0.3125,-0.275862,4


In [30]:
# Same procedure for the color variable.
# The order from cheapest to most expensive would be: J, I, H, G, F, E, D.
df["color"].unique()

array(['E', 'I', 'J', 'H', 'F', 'G', 'D'], dtype=object)

In [31]:
# Ordinal encoding for `color`, to be applied with Series.map:
# letters are listed from lowest (J -> 0) to highest (D -> 6) rank.
dict_color = dict(zip("JIHGFED", range(7)))

In [32]:
# Add the ordinal code of each color letter as a new column, `color_map`.
df = df.assign(color_map=df["color"].map(dict_color))

In [33]:
# Show six random rows to eyeball the new color_map column.
df.sample(n=6)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,cut_map,color_map
8882,0.603175,Good,G,VVS2,0.0,-0.666667,4485,0.556818,0.586207,1,3
43976,-0.746032,Ideal,I,VS1,0.5,0.333333,518,-0.9375,-0.931034,4,1
47199,0.015873,Premium,G,VVS2,-0.5,0.0,1838,0.068182,0.097701,3,3
46742,0.492063,Ideal,F,VS1,0.571429,0.0,1799,0.471591,0.413793,4,4
18909,0.809524,Very Good,J,SI1,-0.428571,0.0,7759,0.744318,0.729885,2,0
39943,-0.619048,Very Good,E,IF,1.142857,-0.666667,492,-0.755682,-0.741379,2,5


In [34]:
# According to the information gathered, the price order by clarity
# (from lowest to highest) is: I1, SI2, SI1, VS2, VS1, VVS2, VVS1, IF.
df["clarity"].unique()

array(['SI2', 'SI1', 'VS1', 'VS2', 'VVS2', 'VVS1', 'I1', 'IF'],
      dtype=object)

In [35]:
# Ordinal encoding for `clarity`, to be applied with Series.map:
# grades are listed from lowest (I1 -> 0) to highest (IF -> 7) rank.
dict_clarity = {
    grade: rank
    for rank, grade in enumerate(
        ["I1", "SI2", "SI1", "VS2", "VS1", "VVS2", "VVS1", "IF"]
    )
}

In [36]:
# Add the ordinal code of each clarity grade as a new column, `clarity_map`.
df = df.assign(clarity_map=df["clarity"].map(dict_clarity))

In [37]:
# Show six random rows to eyeball the three encoded columns.
df.sample(n=6)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,cut_map,color_map,clarity_map
44075,0.0,Premium,E,VS2,-1.642857,0.333333,1552,0.113636,0.086207,3,5,3
34720,-0.587302,Premium,D,SI1,-0.642857,0.0,874,-0.630682,-0.643678,3,6,2
31610,-0.539683,Very Good,G,VS2,-0.857143,0.333333,369,-0.556818,-0.545977,2,3,3
47301,0.031746,Very Good,J,VS1,0.214286,0.166667,394,0.073864,0.091954,2,0,4
2330,0.619048,Very Good,G,VS2,3.214286,-0.333333,3170,0.5,0.465517,2,3,3
4580,0.507937,Premium,G,SI2,0.5,0.333333,3650,0.471591,0.436782,3,3,1


In [42]:
# Remove the original text columns now that their encoded versions exist.
# `columns=` already selects the axis, so no separate `axis` argument is needed.
df = df.drop(columns=["cut", "color", "clarity"])

In [43]:
# Persist the standardized, encoded frame (the index is written as the first column).
df.to_csv(path_or_buf="../ficheros/diamons_estand_enconding.csv")

Hacemos lo mismo para el csv sin estandarizar

In [38]:
# Load the non-standardized diamonds data, using the first CSV column as the
# index, and preview the first two rows.
df2 = pd.read_csv(
    filepath_or_buffer="../ficheros/diamons_sinz.csv",
    index_col=0,
)
df2.head(n=2)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y
1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98
2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84


In [39]:
# Apply the ordinal mappings built earlier in the notebook (dict_cut, dict_color,
# dict_clarity) to the non-standardized frame, creating one `<column>_map`
# column per categorical variable.
for column, mapping in (
    ("cut", dict_cut),
    ("color", dict_color),
    ("clarity", dict_clarity),
):
    # Series.map silently turns any label that is missing from the dict into NaN.
    # The labels of this second file were never inspected, so fail loudly if one
    # of them is not covered instead of writing NaN codes to disk later.
    unmapped = set(df2[column].dropna().unique()) - set(mapping)
    assert not unmapped, f"Unmapped labels in '{column}': {sorted(unmapped)}"
    df2[f"{column}_map"] = df2[column].map(mapping)

In [41]:
# Show six random rows to eyeball the encoded columns of the non-standardized frame.
df2.sample(n=6)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,cut_map,color_map,clarity_map
49872,0.53,Ideal,F,VVS2,62.1,56.0,2175,5.15,5.22,4,4,5
39002,0.45,Ideal,G,VS1,59.0,60.0,1055,4.98,5.05,4,3,4
2245,0.7,Good,E,VS1,61.2,58.0,3148,5.66,5.72,1,5,4
1269,0.9,Premium,I,SI2,60.6,60.0,2948,6.28,6.23,3,1,1
53172,0.71,Premium,E,SI1,60.8,60.0,2629,5.75,5.69,3,5,2
718,0.8,Ideal,G,SI2,61.6,56.0,2856,5.97,6.01,4,3,1


In [44]:
# Remove the original text columns now that their encoded versions exist.
# `columns=` already selects the axis, so no separate `axis` argument is needed.
df2 = df2.drop(columns=["cut", "color", "clarity"])

In [45]:
# Persist the encoded, NON-standardized frame under its own file name.
# Writing it to "diamons_estand_enconding.csv" would overwrite the standardized
# export that is saved from `df` earlier in this notebook.
df2.to_csv("../ficheros/diamons_enconding.csv")