# Big Data 

Predicting diamond prices with the seaborn `diamonds` dataset

In [2]:
import seaborn as sns

# Data Pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Data preprocessing
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.model_selection import train_test_split

# models
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

# metrics
from sklearn.metrics import mean_absolute_error,mean_absolute_percentage_error,root_mean_squared_error,r2_score,mean_squared_error


In [3]:
# Load seaborn's example "diamonds" dataset.
df = sns.load_dataset("diamonds")

# Preview the first five rows (the default for head()).
df.head(n=5)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [4]:
# Target: the diamond price.
y = df.loc[:, 'price']

# Predictors: every remaining column.
x = df.drop(columns=['price'])

In [5]:
# Column groups: numeric columns are standardized, categorical ones are one-hot encoded.
numeric_feature = ['carat', 'depth', 'table', 'x', 'y', 'z']
categoric_feature = ['cut', 'color', 'clarity']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split identical across runs.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Per-column preprocessing. Columns not listed in either group are dropped
# (ColumnTransformer's default `remainder='drop'`).
processor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_feature),
        ('cat', OneHotEncoder(), categoric_feature),
    ])

# Preprocessing and model live in one pipeline, so the scaler and encoder are
# fitted on the training rows only when `pipeline.fit` is called.
pipeline = Pipeline(
    steps=[
        ('processor', processor),
        # Seed the forest as well as the split: an unseeded RandomForestRegressor
        # draws different bootstrap samples on every run, so the reported metrics
        # would change each time the cell is executed.
        ('model', RandomForestRegressor(random_state=42)),
    ])

pipeline.fit(x_train, y_train)
y_pred = pipeline.predict(x_test)

# Report every metric on the held-out test set, in a fixed order.
for metric_label, metric_fn in [
    ('MSE: ', mean_squared_error),
    ('r2_score: ', r2_score),
    ('MAE: ', mean_absolute_error),
    ('MAPE: ', mean_absolute_percentage_error),
    ('RMSE: ', root_mean_squared_error),
]:
    print(metric_label, metric_fn(y_test, y_pred))


MSE:  302912.0102705799
r2_score:  0.9809451209350672
MAE:  270.55287626132207
MAPE:  0.0648913261405697
RMSE:  550.3744273406786


# LinearRegression
- MSE:  1288705.4778516763
- r2_score:  0.9189331350419386
- MAE:  737.1513665933285
- MAPE:  0.3952933516494362
- RMSE:  1135.2116445190634

# DecisionTreeRegressor
- MSE:  544063.468970152
- r2_score:  0.9657753299527031
- MAE:  356.902808676307
- MAPE:  0.085626470477801
- RMSE:  737.6065814308818

# RandomForestRegressor
- MSE:  304919.0996561165
- r2_score:  0.9808188636583097
- MAE:  270.92070295478214
- MAPE:  0.06516486847829657
- RMSE:  552.1948022719125