## Diamonds
### Analyze diamonds by their cut, color, clarity, price, and other attributes

In [3]:
import numpy as np
import pandas as pd
import sklearn

import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

In [None]:
df_diamonds = pd.read_csv('datasets/diamonds.csv')
df_diamonds.info()

In [None]:
df_diamonds.drop(columns={"Unnamed: 0"}, inplace=True)

In [None]:
df_diamonds.describe(include='all')

In [None]:
df_diamonds.head(10)

## Attributes

* ***price*** price in US dollars (\$326--\$18,823)

* ***carat*** weight of the diamond (0.2--5.01)

* ***cut*** quality of the cut (Fair, Good, Very Good, Premium, Ideal)

* ***color*** diamond colour, from J (worst) to D (best)

* ***clarity*** a measurement of how clear the diamond is (I1 (worst), SI2, SI1, VS2, VS1, VVS2, VVS1, IF (best))

* ***x*** length in mm (0--10.74)

* ***y*** width in mm (0--58.9)

* ***z*** depth in mm (0--31.8)

* ***depth*** total depth percentage = z / mean(x, y) = 2 * z / (x + y) (43--79)

* ***table*** width of top of diamond relative to widest point (43--95)

### Exibe as 10 primeiras linhas do dataset

In [None]:
df_diamonds.head(10)

### Os ultimos 10 

In [None]:
df_diamonds.tail(10)

### Existe valor nulo

In [None]:
df_diamonds.isnull().sum()

### Type class cut | color | clarity

In [None]:
print('CUT: ', df_diamonds['cut'].unique())
print('COLOR: ', df_diamonds['color'].unique())
print('CLARITY: ', df_diamonds['clarity'].unique())

### Exploratory Analysis

In [None]:
print('Attr: Carat')
df_attr = df_diamonds["carat"]
print(f'Max : {df_attr.max()}')
print(f'Min : {df_attr.min()}')
print(f'Variance: {df_attr.var()}')
print(f'Std : {df_attr.std()}')
print(f'mean : {df_attr.mean()}')
print(f'median : {df_attr.median()}')
print(f'mode: {df_attr.mode()}')
print(f'1 quartile 25%: {df_attr.quantile(q=0.25)}')
print(f'2 quartile 50%: {df_attr.quantile(q=0.50)}')
print(f'3 quartile 75%: {df_attr.quantile(q=0.75)}\n')
print(f'IQR: {df_attr.quantile(q=0.75)-df_attr.quantile(q=0.25)}\n')
print('Histogram = Positive skewed | prox normal')
df_attr.hist(figsize=(5,3), bins=50)

In [None]:
print('Attr: Depth')
df_attr = df_diamonds["depth"]
print(f'Max : {df_attr.max()}')
print(f'Min : {df_attr.min()}')
print(f'Variance: {df_attr.var()}')
print(f'Std : {df_attr.std()}')
print(f'mean : {df_attr.mean()}')
print(f'median : {df_attr.median()}')
print(f'mode: {df_attr.mode()}')
print(f'1 quartile 25%: {df_attr.quantile(q=0.25)}')
print(f'2 quartile 50%: {df_attr.quantile(q=0.50)}')
print(f'3 quartile 75%: {df_attr.quantile(q=0.75)}\n')
print('Histogram = Positive skewed')

df_attr.hist(figsize=(5,3), bins=50)

In [None]:
print('Attr: Table')
df_attr = df_diamonds["table"]
print(f'Max : {df_attr.max()}')
print(f'Min : {df_attr.min()}')
print(f'Variance: {df_attr.var()}')
print(f'Std : {df_attr.std()}')
print(f'mean : {df_attr.mean()}')
print(f'median : {df_attr.median()}')
print(f'mode: {df_attr.mode()}')
print(f'1 quartile 25%: {df_attr.quantile(q=0.25)}')
print(f'2 quartile 50%: {df_attr.quantile(q=0.50)}')
print(f'3 quartile 75%: {df_attr.quantile(q=0.75)}\n')
print('Histogram = Positive skewed')
df_attr.hist(figsize=(5,3), bins=50)

In [None]:
print('Attr: Price')
df_attr = df_diamonds["price"]
print(f'Max : {df_attr.max()}')
print(f'Min : {df_attr.min()}')
print(f'Variance: {df_attr.var()}')
print(f'Std : {df_attr.std()}')
print(f'mean : {df_attr.mean()}')
print(f'median : {df_attr.median()}')
print(f'mode: {df_attr.mode()}')
print(f'1 quartile 25%: {df_attr.quantile(q=0.25)}')
print(f'2 quartile 50%: {df_attr.quantile(q=0.50)}')
print(f'3 quartile 75%: {df_attr.quantile(q=0.75)}\n')
print('Histogram = Negatively skewed')
df_attr.hist(figsize=(5,3), bins=100)

In [None]:
print('Attr: X')
df_attr = df_diamonds["x"]
print(f'Max : {df_attr.max()}')
print(f'Min : {df_attr.min()}')
print(f'Variance: {df_attr.var()}')
print(f'Std : {df_attr.std()}')
print(f'mean : {df_attr.mean()}')
print(f'median : {df_attr.median()}')
print(f'mode: {df_attr.mode()}')
print(f'1 quartile 25%: {df_attr.quantile(q=0.25)}')
print(f'2 quartile 50%: {df_attr.quantile(q=0.50)}')
print(f'3 quartile 75%: {df_attr.quantile(q=0.75)}\n')
print('Histogram = Positive skewed')
df_attr.hist(figsize=(5,3))

In [None]:
print('Attr: Y')
df_attr = df_diamonds["y"]
print(f'Max : {df_attr.max()}')
print(f'Min : {df_attr.min()}')
print(f'Variance: {df_attr.var()}')
print(f'Std : {df_attr.std()}')
print(f'mean : {df_attr.mean()}')
print(f'median : {df_attr.median()}')
print(f'mode: {df_attr.mode()}')
print(f'1 quartile 25%: {df_attr.quantile(q=0.25)}')
print(f'2 quartile 50%: {df_attr.quantile(q=0.50)}')
print(f'3 quartile 75%: {df_attr.quantile(q=0.75)}\n')
print('Histogram = Positive skewed')

df_attr.hist(figsize=(5,3), bins=50)

In [None]:
print('Attr: Z')
df_attr = df_diamonds["z"]
print(f'Max : {df_attr.max()}')
print(f'Min : {df_attr.min()}')
print(f'Variance: {df_attr.var()}')
print(f'Std : {df_attr.std()}')
print(f'mean : {df_attr.mean()}')
print(f'median : {df_attr.median()}')
print(f'mode: {df_attr.mode()}')
print(f'1 quartile 25%: {df_attr.quantile(q=0.25)}')
print(f'2 quartile 50%: {df_attr.quantile(q=0.50)}')
print(f'3 quartile 75%: {df_attr.quantile(q=0.75)}\n')
print('Histogram = Positive skewed')

df_attr.hist(figsize=(5,3), bins=100)

### Boxplot

In [None]:
sns.boxplot(x=df_diamonds["carat"])

In [None]:
sns.boxplot(x=df_diamonds["depth"])

In [None]:
sns.boxplot(x=df_diamonds["table"])

In [None]:
sns.boxplot(x=df_diamonds["x"])

In [None]:
sns.boxplot(x=df_diamonds["y"])

In [None]:
sns.boxplot(x=df_diamonds["z"])

### Heatmap

In [None]:
plt.figure(figsize=(7,6))
correlation = df_diamonds.corr()
sns.heatmap(correlation, annot=True)
plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt


usecols = ["carat","depth","table","x","y","z","cut"]
df_diamonds = pd.read_csv('datasets/diamonds.csv',usecols=usecols)
ax = sns.boxplot(data=df_diamonds, orient="h", palette="Set2")

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
usecols = ["carat","depth","table","x","y","z","price"]
df_diamonds = pd.read_csv('datasets/diamonds.csv',usecols=usecols)
df_diamonds.drop(columns={"Unnamed: 0"}, inplace=True)
df_diamonds = scaler.fit_transform(df_diamonds)

In [None]:
ax = sns.boxplot(data=df_diamonds, orient="h", palette="Set2")

## Início Minhas Análises

### Distribuições

In [None]:
order = {
    "cut": ["Ideal","Premium", "Very Good", "Good", "Fair"],
    "clarity": ["IF", "VVS1", "VVS2", "VS1", "VS2", "SI1", "SI2", "I1"],
    "color": ["D","E","F","G","H","I","J"]
}

In [None]:
usecols = ['carat', 'depth', 'table', 'x', 'y', 'z']
nrows, ncols = 2, 3
plt.subplots(figsize=(20,10))
for i, col in enumerate(usecols, 1):
    plt.subplot(nrows, ncols, i)
    sns.histplot(data=df_diamonds[col])

In [None]:
usecols = ['clarity', 'cut', 'color']
nrows, ncols = 1, 3
plt.subplots(figsize=(20,5))
for i, col in enumerate(usecols, 1):
    plt.subplot(nrows, ncols, i)
    sns.countplot(x=col, data=df_diamonds, palette='magma', order=order[col])

In [None]:
plt.subplots(figsize=(10,5))
sns.kdeplot(x=df_diamonds['carat'], data=df_diamonds, hue='color', hue_order=order['color'], 
            palette='bright', multiple="layer")

In [None]:
plt.figure(figsize=(7,6))
correlation = df_diamonds.corr()
sns.heatmap(correlation, annot=True, vmin=-1, vmax=1, cmap="RdBu")
plt.show()

In [None]:
usecols = ['carat', 'depth', 'table']
nrows, ncols = 1, 3
plt.subplots(figsize=(15,4))
for i, col in enumerate(usecols, 1):
    plt.subplot(nrows, ncols, i)
    sns.scatterplot(x=col, y="price", data=df_diamonds, hue="clarity", hue_order=order['clarity'])

In [None]:
usecols = ['carat', 'depth', 'table']
nrows, ncols = 1, 3
plt.subplots(figsize=(15,4))
for i, col in enumerate(usecols, 1):
    plt.subplot(nrows, ncols, i)
    sns.scatterplot(x=col, y="price", data=df_diamonds, hue="cut", hue_order=order['cut'])

In [None]:
usecols = ['carat', 'depth', 'table']
nrows, ncols = 1, 3
plt.subplots(figsize=(15,4))
for i, col in enumerate(usecols, 1):
    plt.subplot(nrows, ncols, i)
    sns.scatterplot(x=col, y="price", data=df_diamonds, hue="color", hue_order=order['color'])

In [None]:
usecols = ['price', 'depth', 'table']
nrows, ncols = 1, 3
plt.subplots(figsize=(15,4))
for i, col in enumerate(usecols, 1):
    plt.subplot(nrows, ncols, i)
    sns.scatterplot(x=col, y="carat", data=df_diamonds, hue="cut", hue_order=order['cut'])

In [None]:
target_feature="price"
nrows, ncols = 2, 3
plt.figure(figsize=(30,9))
for i, row in enumerate(["price","carat"]):
    for j, col in enumerate(["cut","clarity","color"], 1):
        plt.subplot(nrows, ncols, (i*ncols) + j)
        sns.boxplot(data=df_diamonds, x=col, y=row, order=order[col])

#### Carat X Price

In [None]:
sns.scatterplot(x="carat", y="price", hue="clarity", data=df_diamonds[(df_diamonds['clarity']=='SI1') | (df_diamonds['clarity']=='IF') | (df_diamonds['clarity']=='I1')])

In [None]:
sns.scatterplot(x="carat", y="price", hue="cut", data=df_diamonds[df_diamonds["clarity"]=="I1"])

In [None]:
sns.scatterplot(x="carat", y="price", hue="cut", data=df_diamonds[df_diamonds["clarity"]=="IF"])

In [None]:
sns.scatterplot(x="carat", y="price", hue="cut", data=df_diamonds[df_diamonds["clarity"]=="SI2"])

#### Depth X Price

#### Table X Price

In [None]:
sns.scatterplot(x="table", y="depth", hue="cut", data=df_diamonds[df_diamonds['cut']=="Ideal"])

In [None]:
sns.scatterplot(x="x", y="y", hue="cut", data=df_diamonds[df_diamonds['cut']=="Ideal"])

In [None]:
sns.jointplot(data=df_diamonds, x="depth", y="table", hue="cut")

In [None]:
df_diamonds['price_per_carat'] = df_diamonds['price'] / df_diamonds['carat']

In [None]:
diamond=df_diamonds
cut_clarity=pd.pivot_table(diamond, values='price', columns='cut', index='clarity', aggfunc='median')
cut_color=pd.pivot_table(diamond, values='price', columns='cut', index='color', aggfunc='median')
clarity_color=pd.pivot_table(diamond, values='price', columns='clarity', index='color', aggfunc='median')

print("Combination of Color,Cut,Clarity with median price ")
plt.figure(figsize=(15,3))
plt.subplot(131)
sns.heatmap(cut_clarity,cmap='Blues')

plt.subplot(132)
sns.heatmap(cut_color,cmap='Greys')

plt.subplot(133)
sns.heatmap(clarity_color,cmap='Blues')
plt.show()

In [None]:
diamond=df_diamonds
cut_clarity=pd.pivot_table(diamond, values='price_per_carat', columns='cut', index='clarity', aggfunc='median')
cut_color=pd.pivot_table(diamond, values='price_per_carat', columns='cut', index='color', aggfunc='median')
clarity_color=pd.pivot_table(diamond, values='price_per_carat', columns='clarity', index='color', aggfunc='median')

print("Combination of Color,Cut,Clarity with median price per carat ")
plt.figure(figsize=(15,3))
plt.subplot(131)
sns.heatmap(cut_clarity,cmap='Blues')

plt.subplot(132)
sns.heatmap(cut_color,cmap='Blues')

plt.subplot(133)
sns.heatmap(clarity_color,cmap='Blues')
plt.show()

In [None]:
diamond=df_diamonds
cut_clarity=pd.pivot_table(diamond, values='carat', columns='cut', index='clarity', aggfunc='median')
cut_color=pd.pivot_table(diamond, values='carat', columns='cut', index='color', aggfunc='median')
clarity_color=pd.pivot_table(diamond, values='carat', columns='clarity', index='color', aggfunc='median')

print("Combination of Color,Cut,Clarity with median carat ")
plt.figure(figsize=(15,3))
plt.subplot(131)
sns.heatmap(cut_clarity,cmap='Blues')

plt.subplot(132)
sns.heatmap(cut_color,cmap='Greys')

plt.subplot(133)
sns.heatmap(clarity_color,cmap='Blues')
plt.show()

In [None]:
plt.figure(figsize=(10,5))
df_diamonds.corr()['price'].sort_values()[:-1].plot.barh()
plt.title('Order of dependence of price on Numerical Features')
plt.xlabel('Correaltion coefficient with Price')
plt.ylabel('Feature')
plt.show()

In [None]:
target_feature="price"
nrows, ncols = 2, 3
plt.figure(figsize=(30,9))
for i, row in enumerate(["depth","table"]):
    for j, col in enumerate(["cut","clarity","color"], 1):
        plt.subplot(nrows, ncols, (i*ncols) + j)
        sns.boxplot(data=df_diamonds, x=col, y=row, order=order[col])

# Regressão

In [276]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neural_network import MLPRegressor

In [277]:
df_diamonds = pd.read_csv('datasets/diamonds.csv')
df_diamonds.drop(columns={"Unnamed: 0"}, inplace=True)
df_diamonds["price"] = df_diamonds["price"].astype(float)

In [278]:
df_diamonds['cut']=df_diamonds['cut'].map({'Ideal':5,'Good':4,'Very Good':3,'Premium':2, 'Fair':1})
df_diamonds['color']=df_diamonds['color'].map({'D':7,'E':6,'F':5,'G':4,'H':3,'I':2,'J':1})
df_diamonds['clarity']=df_diamonds['clarity'].map({'IF':8,'VVS1':7,'VVS2':6,'VS1':5,'VS2':4,'SI1':3,'SI2':2, 'I1':1})

In [279]:
ordem = ['carat','y','clarity','color','z','x','depth','table','cut']
sixbest = ['carat','y','clarity','color','z','x']
best = ['carat','y','z','x','depth','table']
X=df_diamonds.drop('price', axis=1)[ordem]
y=df_diamonds['price']

In [280]:
#from sklearn.feature_selection import SelectKBest
#from sklearn.feature_selection import f_regression, mutual_info_regression
#X_new = SelectKBest(mutual_info_regression, k=4).fit_transform(X, y)
#X_new

In [281]:
#from sklearn.feature_selection import SequentialFeatureSelector

#tic_fwd = time()
#sfs_forward = SequentialFeatureSelector(lasso, n_features_to_select=2,
#                                        direction='forward').fit(X, y)
#toc_fwd = time()

#tic_bwd = time()
#sfs_backward = SequentialFeatureSelector(lasso, n_features_to_select=2,
#                                         direction='backward').fit(X, y)

In [282]:
X_train, X_test, y_train, y_test = train_test_split(X , y , test_size=0.2, random_state=1)

In [283]:
sc=MinMaxScaler()
X_train_tx=sc.fit_transform(X_train)
X_test_tx=sc.transform(X_test)

In [284]:
dataset_1=(X_train, X_test, y_train, y_test, 'dataset_1')

In [285]:
# Blank lists for all the details
model_name=[]
model_=[]
cv_score_test=[]
cv_score_train=[]
mse_=[]
mae_=[]
rmse_=[]
r2_=[]

In [286]:
def run_model(model, dataset, modelname):
    model.fit(dataset[0], dataset[2])
    accuracies=cross_val_score(estimator=model, X=dataset[0], y=dataset[2], cv=5, verbose=1, n_jobs=-1)
    y_pred=model.predict(dataset[1])
    print('')
    score_1=model.score(dataset[1], dataset[3])
    print(f'#### {modelname} ####')
    print("score :%.4f" %score_1)
    print(accuracies)
    
    
    mse=mean_squared_error(dataset[3], y_pred)
    mae=mean_absolute_error(dataset[3], y_pred)
    rmse=mean_squared_error(dataset[3], y_pred)**0.5
    r2=r2_score(dataset[3], y_pred)
    
    print('')
    print('MSE    : %0.2f ' % mse)
    print('MAE    : %0.2f ' % mae)
    print('RMSE   : %0.2f ' % rmse)
    print('R2     : %0.2f ' % r2)
    
    ## appending to the lists
    
    model_name.append(modelname)
    model_.append(model)
    cv_score_test.append(score_1)
    cv_score_train.append(np.mean(accuracies))
    mse_.append(mse)
    mae_.append(mae)
    rmse_.append(rmse)
    r2_.append(r2)

In [287]:
model_dict={'DecisionTreeRegressor': DecisionTreeRegressor(),
            'AdaBoostRegressor': AdaBoostRegressor(),
            'GradientBoostingRegressor': GradientBoostingRegressor(),
            'RandomForestRegressor': RandomForestRegressor(),
            'MLPRegressor': MLPRegressor(hidden_layer_sizes=(200, 100,), batch_size=100, learning_rate_init=0.004, learning_rate="adaptive", max_iter=700, verbose=True)
           }

In [288]:
for models in model_dict:
    run_model(model_dict[models], dataset_1, models)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.1s finished



#### DecisionTreeRegressor ####
score :0.9656
[0.96576758 0.96354146 0.96431765 0.9657518  0.96432813]

MSE    : 534071.80 
MAE    : 357.85 
RMSE   : 730.80 
R2     : 0.97 


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    6.9s finished



#### AdaBoostRegressor ####
score :0.9249
[0.91365272 0.91010708 0.91847942 0.91409324 0.92529687]

MSE    : 1167701.83 
MAE    : 817.96 
RMSE   : 1080.60 
R2     : 0.92 


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   12.1s finished



#### GradientBoostingRegressor ####
score :0.9758
[0.97367585 0.97543334 0.97790542 0.97435931 0.97626566]

MSE    : 375283.09 
MAE    : 339.81 
RMSE   : 612.60 
R2     : 0.98 


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   42.4s finished



#### RandomForestRegressor ####
score :0.9819
[0.98025077 0.98055135 0.98174137 0.9801239  0.98133041]

MSE    : 281459.08 
MAE    : 263.75 
RMSE   : 530.53 
R2     : 0.98 
Iteration 1, loss = 5031594.61747994
Iteration 2, loss = 712314.87604171
Iteration 3, loss = 552270.54898871
Iteration 4, loss = 509956.75860622
Iteration 5, loss = 487578.92103582
Iteration 6, loss = 471817.69907545
Iteration 7, loss = 459025.32169058
Iteration 8, loss = 439829.61176063
Iteration 9, loss = 426776.94286953
Iteration 10, loss = 416685.38744837
Iteration 11, loss = 408202.52413667
Iteration 12, loss = 400383.18401379
Iteration 13, loss = 398719.26129049
Iteration 14, loss = 387132.94101772
Iteration 15, loss = 388164.74765768
Iteration 16, loss = 379906.28438871
Iteration 17, loss = 379236.24688890
Iteration 18, loss = 370989.24144630
Iteration 19, loss = 366357.86084962
Iteration 20, loss = 365592.94155483
Iteration 21, loss = 363899.50819421
Iteration 22, loss = 358754.05875893
Iteration 23, loss =

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed: 11.8min finished



#### MLPRegressor ####
score :0.9785
[0.97548155 0.96823109 0.97638633 0.9743529  0.9773855 ]

MSE    : 333670.32 
MAE    : 325.80 
RMSE   : 577.64 
R2     : 0.98 


In [289]:
accuracy_data=pd.DataFrame(zip(model_name, cv_score_test, cv_score_train, mse_, mae_, rmse_, r2_), columns=['Model', 'CV Test score', 'CV Train score (mean)', '%%SVGean Squared error', 'Mean Absolute error', 'Root Mean Squared error', 'R2 Score'])

In [290]:
accuracy_data

Unnamed: 0,Model,CV Test score,CV Train score (mean),%%SVGean Squared error,Mean Absolute error,Root Mean Squared error,R2 Score
0,DecisionTreeRegressor,0.965629,0.964741,534071.8,357.846264,730.802165,0.965629
1,AdaBoostRegressor,0.924851,0.916326,1167702.0,817.958892,1080.602533,0.924851
2,GradientBoostingRegressor,0.975848,0.975528,375283.1,339.812399,612.603534,0.975848
3,RandomForestRegressor,0.981886,0.9808,281459.1,263.747287,530.527168,0.981886
4,MLPRegressor,0.978526,0.974367,333670.3,325.803041,577.642036,0.978526


In [298]:
model_[0].get_feature_names

AttributeError: 'DecisionTreeRegressor' object has no attribute 'get_feature_names'