## Diamonds_Price

In [106]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression , Ridge, SGDRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import Pipeline

In [108]:
diamonds_train = pd.read_csv("data/train.csv")

In [109]:
diamonds_train.head(5)

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.5,Premium,F,VS2,61.5,58.0,7.32,7.34,4.51,9.588
1,1,2.01,Very Good,E,SI2,60.6,59.0,8.11,8.25,4.96,9.748
2,2,0.5,Ideal,E,SI1,61.6,57.0,5.13,5.09,3.15,7.255
3,3,0.25,Very Good,F,VVS2,61.6,57.0,4.05,4.08,2.5,6.45
4,4,0.52,Ideal,G,VS2,62.0,55.0,5.16,5.19,3.21,7.721


In [110]:
diamonds_train.shape

(40455, 11)

In [111]:
diamonds_train.isna().sum()

id         0
carat      0
cut        0
color      0
clarity    0
depth      0
table      0
x          0
y          0
z          0
price      0
dtype: int64

In [112]:
diamonds_train.corr()

Unnamed: 0,id,carat,depth,table,x,y,z,price
id,1.0,-0.006196,0.001098,-0.008829,-0.007112,-0.00642,-0.007766,-0.006926
carat,-0.006196,1.0,0.025089,0.18259,0.974516,0.950563,0.94793,0.920878
depth,0.001098,0.025089,1.0,-0.300152,-0.028389,-0.032561,0.091421,-0.000924
table,-0.008829,0.18259,-0.300152,1.0,0.196031,0.184322,0.150142,0.158128
x,-0.007112,0.974516,-0.028389,0.196031,1.0,0.973617,0.965597,0.957653
y,-0.00642,0.950563,-0.032561,0.184322,0.973617,1.0,0.945958,0.9347
z,-0.007766,0.94793,0.091421,0.150142,0.965597,0.945958,1.0,0.930157
price,-0.006926,0.920878,-0.000924,0.158128,0.957653,0.9347,0.930157,1.0


In [113]:
diamonds_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40455 entries, 0 to 40454
Data columns (total 11 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   id       40455 non-null  int64  
 1   carat    40455 non-null  float64
 2   cut      40455 non-null  object 
 3   color    40455 non-null  object 
 4   clarity  40455 non-null  object 
 5   depth    40455 non-null  float64
 6   table    40455 non-null  float64
 7   x        40455 non-null  float64
 8   y        40455 non-null  float64
 9   z        40455 non-null  float64
 10  price    40455 non-null  float64
dtypes: float64(7), int64(1), object(3)
memory usage: 3.4+ MB


Después de comprabar que el dataset estuviera completo y limpio. Lo siguiente que hago es hacer una busqueda exhaustiva sobre el precio del Diamante. 

Las variables o features más influyentes sobre el precio del diamante son las 4C: `CARAT`,`CUT`,`COLOR`,`CLARITY`. 

Por otro lado, la variable que vamos a predecir es el precio. Entonces los modelos que desarollaré son modelos de regrsión. Y la métrica importante que debo sacar es el `mse`(mean square error).

*Nota: Al tener varibales predictoras que tienen una fuerte correlación con la variable precio. Desarrollaré modelos líneales que la variación por pequeña que sea en una de ellas influirá en la variable precio.
En caso de que hubiera sido lo contrario, me hubiera ido por los modelos TreeForest, Randomforest...ect.

In [114]:
diamonds_train_1 = pd.get_dummies(diamonds_train, drop_first=True)

In [115]:
X = diamonds_train_1.drop("price", axis=1)
y = diamonds_train_1["price"]

In [116]:
# Modelo: LinearRegression

X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=666)

pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("linear", LinearRegression())
])

# Standarization & model Training
pipe.fit(X_train, y_train)

# Predicting
y_pred_train = pipe.predict(X_train)
y_pred_test = pipe.predict(X_test)

# Metrics
mse_train = mean_squared_error(y_pred_train,y_train)
mse_test = mean_squared_error(y_pred_test,y_test)

mse_train, mse_test

(0.030241507966882346, 0.04264021461302893)

In [117]:
#r2 LinearRegression
r2_lr_train = r2_score(y_pred_train,y_train)
r2_lr_test = r2_score(y_pred_test,y_test)
r2_lr_train,r2_lr_test

(0.9697133122558786, 0.9577956030181125)

In [118]:
# Modelo: Ridge

X2_train, X2_test, y2_train, y2_test = train_test_split(X,y, random_state=666)

pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("linear", Ridge())
])

# Standarization & model Training
pipe.fit(X2_train, y2_train)

# Predicting
y2_pred_train = pipe.predict(X2_train)
y2_pred_test = pipe.predict(X2_test)

# Metrics
mse2_train = mean_squared_error(y2_pred_train,y2_train)
mse2_test = mean_squared_error(y2_pred_test,y2_test)

mse2_train, mse2_test

(0.0302416484186317, 0.042612593874342714)

In [119]:
#r2 Ridge
r2_rd_train = r2_score(y2_pred_train,y2_train)
r2_rd_test = r2_score(y2_pred_test,y2_test)
r2_rd_train,r2_rd_test

(0.9697084137366769, 0.9578155756209211)

In [120]:
# Modelo: Lasso()

X3_train, X3_test, y3_train, y3_test = train_test_split(X,y, random_state=666)

pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("linear", Lasso())
])

# Standarization & model Training
pipe.fit(X3_train, y3_train)

# Predicting
y3_pred_train = pipe.predict(X3_train)
y3_pred_test = pipe.predict(X3_test)

# Metrics
mse3_train = mean_squared_error(y3_pred_train,y3_train)
mse3_test = mean_squared_error(y3_pred_test,y3_test)

mse3_train, mse3_test

(1.028749770817519, 1.0291314727093757)

In [121]:
#r2 Lasso
r2_ls_train = r2_score(y3_pred_train,y3_train)
r2_ls_test = r2_score(y3_pred_test,y3_test)
r2_ls_train,r2_ls_test

(-3.260238161153704e+29, -8.153619556836261e+28)

In [122]:
# Modelo: SGDRegressor()

X4_train, X4_test, y4_train, y4_test = train_test_split(X,y, random_state=666)

pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("linear", SGDRegressor())
])

# Standarization & model Training
pipe.fit(X4_train, y4_train)

# Predicting
y4_pred_train = pipe.predict(X4_train)
y4_pred_test = pipe.predict(X4_test)

# Metrics
mse4_train = mean_squared_error(y4_pred_train,y4_train)
mse4_test = mean_squared_error(y4_pred_test,y4_test)

mse4_train, mse4_test

(0.03142861911724781, 0.04248486639508702)

In [123]:
#r2 SGDRegressor
r2_sgd_train = r2_score(y4_pred_train,y4_train)
r2_sgd_test = r2_score(y4_pred_test,y4_test)
r2_sgd_train,r2_sgd_test

(0.9664693669443936, 0.9551969409474979)

Conclusión: El mejor modelo de los 4 que he calculado es `LinearRegression` y muy seguido está `Ridge`

## LinearRegression

In [129]:
# Modelo LinearRegression():
scaler = StandardScaler()
X = scaler.fit_transform(X)

modelo = LinearRegression()

modelo.fit(X,y)

y_pred_train = modelo.predict(X)
y_pred_train

array([9.39481495, 9.76373357, 7.24910373, ..., 9.37423225, 9.95501225,
       7.15090856])

In [126]:
#Importo los datos test y lo paso a dummies:

diamonds_test = pd.read_csv("data/test.csv")
X_test = pd.get_dummies(diamonds_test, drop_first=True)

In [127]:
#Pregigo la y de test.
X_test = scaler.fit_transform(X_test)
y_pred_test = modelo.predict(X_test)
y_pred_test

In [130]:
#Calculo el mean square error de train puesto que de test no tengo la y.
mse_train = mean_squared_error(y_pred_train,y)
mse_train

0.033182417346858686

In [151]:
diamonds_test["price"] = y_pred_test

In [153]:
diamonds_test.columns

Index(['id', 'carat', 'cut', 'color', 'clarity', 'depth', 'table', 'x', 'y',
       'z', 'price'],
      dtype='object')

In [155]:
diamonds_price = diamonds_test.drop(columns=['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'x', 'y','z'], axis=1)

In [162]:
diamonds_price.to_csv("diamonds_price.csv",header= True, index = False)