In [76]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')

In [77]:
# Load the 'diamonds' dataset through seaborn into `df`.
try:
    df = sns.load_dataset('diamonds')
except Exception:
    # Report the failure and stop here. Swallowing the error would leave `df`
    # undefined (or stale from an earlier kernel session) for every cell below.
    # `Exception` rather than a bare `except` so Ctrl-C is not intercepted.
    print('Dataset Not Found')
    raise
else:
    # Only reached when the load succeeded.
    print('\nData Loaded Successfully\n')


Data Loaded Successfully



In [78]:
# Show the loaded DataFrame as this cell's output.
df

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...,...
53935,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50
53936,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61
53937,0.70,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56
53938,0.86,Premium,H,SI2,61.0,58.0,2757,6.15,6.12,3.74


In [79]:
# Count missing values per column, then list the counts in descending order.
missing_counts = df.isna().sum()
missing_counts.sort_values(ascending=False)

carat      0
cut        0
color      0
clarity    0
depth      0
table      0
price      0
x          0
y          0
z          0
dtype: int64

In [80]:
# Box plot of the `price` column.
fig_1 = px.box(data_frame=df, x='price')
fig_1.show()

In [81]:
# Violin plot of the `x` column.
fig_2 = px.violin(data_frame=df, x='x')
fig_2.show()

In [82]:
# Violin plot of the `y` column (assigned to `fig_2`).
fig_2 = px.violin(data_frame=df, x='y')
fig_2.show()

In [83]:
# Area chart with the `y` column on the x-axis; no y-axis column is passed.
# NOTE(review): confirm this is the intended chart type for this column.
fig_2 = px.area(data_frame=df, x='y')
fig_2.show()

In [91]:
from sklearn.compose import make_column_selector as selector,ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder,RobustScaler,OneHotEncoder
from sklearn.pipeline import Pipeline

# Categorical branch: one-hot encode, with handle_unknown='ignore' so that
# categories not seen during fit do not raise at transform time.
cat = Pipeline(steps=[
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Numeric branch: scale with RobustScaler.
num = Pipeline(steps=[
    ('scaler', RobustScaler()),
])

# Route each column to a branch according to its dtype.
categorical_columns = selector(dtype_include=['object', 'category'])
numeric_columns = selector(dtype_include=['number'])

preprocessor = ColumnTransformer(transformers=[
    ('cats', cat, categorical_columns),
    ('nums', num, numeric_columns),
])


In [92]:
# Features are every column except `price`; `price` itself is the target.
X = df.drop(columns='price')
y = df['price']

In [93]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the preprocessor on the training rows only, then apply it to both splits.
# NOTE(review): none of the cells visible here read these two matrices again.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

In [94]:
from catboost import CatBoostRegressor
from sklearn.pipeline import Pipeline

# CatBoost regressor with its starting hyper-parameters.
catboost_regressor = CatBoostRegressor(
    loss_function='RMSE',
    iterations=100,
    learning_rate=0.1,
    depth=6,
    verbose=0,
)

# Preprocessing followed by the estimator. The final step holds a regressor
# but is named 'classifier'; the name is kept because the param_grid keys
# below are prefixed with it.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', catboost_regressor),
])

# Candidate hyper-parameter values, keyed as <step name>__<parameter>.
param_grid = dict(
    classifier__iterations=[100, 200],
    classifier__learning_rate=[0.01, 0.1],
    classifier__depth=[4, 6, 8],
    classifier__l2_leaf_reg=[1, 3, 5],
)


In [None]:
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV,_search_successive_halving
from sklearn.metrics import root_mean_squared_error

# Randomized hyper-parameter search over `param_grid` for the `model` pipeline:
# 10 sampled candidates, 5-fold cross-validation, scored by negated MSE.
cv = RandomizedSearchCV(
    model,
    param_grid,
    n_iter=10,
    cv=5,
    scoring='neg_mean_squared_error',
    random_state=42,      # fixes which candidates are sampled
    n_jobs=3,
    error_score='raise',  # a failing fit raises instead of being scored
)

In [97]:
# Run the search on the training split, then predict the held-out rows.
# `fit` returns the search object itself, so the calls can be chained.
y_pred = cv.fit(X_train, y_train).predict(X_test)


In [98]:
from sklearn.metrics import mean_absolute_percentage_error,mean_squared_error,mean_absolute_error

# Report each regression metric for the test-split predictions, one per line.
metric_report = (
    ('Root_Mean_Squared_Error', root_mean_squared_error),
    ('Mean_Squared_Error', mean_squared_error),
    ('Mean_Absolute_Percentage_Error', mean_absolute_percentage_error),
    ('Mean_Absolute_Error', mean_absolute_error),
)
for metric_label, metric_fn in metric_report:
    print(metric_label, metric_fn(y_test, y_pred))






Root_Mean_Squared_Error 541.8492942987211
Mean_Squared_Error 293600.6577320221
Mean_Absolute_Percentage_Error 0.08296803155817788
Mean_Absolute_Error 288.41493572779893
