In [29]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
df=sns.load_dataset('diamonds')
df

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...,...
53935,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50
53936,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61
53937,0.70,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56
53938,0.86,Premium,H,SI2,61.0,58.0,2757,6.15,6.12,3.74


In [30]:
df.isnull().sum().sort_values(ascending=False)

carat      0
cut        0
color      0
clarity    0
depth      0
table      0
price      0
x          0
y          0
z          0
dtype: int64

In [31]:
df.select_dtypes(include=['object','category']).columns

Index(['cut', 'color', 'clarity'], dtype='object')

In [None]:
df.select_dtypes(include=[int,float]).columns 

Index(['carat', 'depth', 'table', 'price', 'x', 'y', 'z'], dtype='object')

In [33]:
from sklearn.preprocessing import LabelEncoder,OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split,GridSearchCV

In [34]:
categoric_cols=['cut', 'color', 'clarity']
numeric_cols=['carat', 'depth', 'table', 'x', 'y','z']

preprocessor=ColumnTransformer(
    transformers=[
        ('num',StandardScaler(),numeric_cols),
        ('cat',OneHotEncoder(),categoric_cols)]
    
)


In [35]:
import warnings
warnings.filterwarnings('ignore')
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression,HuberRegressor,LinearRegression
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor
from lightgbm import LGBMRegressor



In [None]:
X = df.drop('price', axis=1)
y = df['price']
pipeline=Pipeline([
    ('preprocessor',preprocessor),
    ('model',GradientBoostingRegressor())
])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

param_grid={
    'model__n_estimators': [200],
    'model__learning_rate': [0.05],
    'model__max_depth': [5],
    'model__subsample': [0.8],
    'model__min_samples_split': [2],
    'model__min_samples_leaf': [1]
}

grid_search = GridSearchCV(pipeline,
                            param_grid, 
                            cv=5, 
                            scoring='neg_mean_squared_error')

grid_search.fit(X, y)

print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)
rmse = np.sqrt(-grid_search.best_score_)
print('RMSE:', rmse)


Best Parameters: {'model__learning_rate': 0.05, 'model__max_depth': 5, 'model__min_samples_leaf': 1, 'model__min_samples_split': 2, 'model__n_estimators': 200, 'model__subsample': 0.8}
Best Score: -3238579.9144031424
RMSE: 1799.6054885455153


In [42]:
from sklearn.metrics import mean_absolute_error,root_mean_squared_error,mean_absolute_percentage_error,mean_squared_error

In [46]:
y_pred = grid_search.best_estimator_.predict(X)
print('MSE', mean_squared_error(y, y_pred))
print('MAE', mean_absolute_error(y, y_pred))

MSE 283608.7106103539
MAE 296.0478350531038
