In [112]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import Pipeline

from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor, AdaBoostRegressor
from sklearn.neural_network import MLPRegressor  # For MLPRegressor
from sklearn.decomposition import PCA

from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [113]:
df = pd.read_csv("Diamond Price Prediction.csv")
df.head()

Unnamed: 0,Carat(Weight of Daimond),Cut(Quality),Color,Clarity,Depth,Table,Price(in US dollars),X(length),Y(width),Z(Depth)
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [114]:
# Convert the column to float64
df["Price(in US dollars)"] = df["Price(in US dollars)"].astype(float)

# Verify the data type
print(df["Price(in US dollars)"].dtypes)


float64


In [115]:
df.isnull().sum()

Carat(Weight of Daimond)    0
Cut(Quality)                0
Color                       0
Clarity                     0
Depth                       0
Table                       0
Price(in US dollars)        0
X(length)                   0
Y(width)                    0
Z(Depth)                    0
dtype: int64

In [116]:
df.shape

(53940, 10)

In [117]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Carat(Weight of Daimond)  53940 non-null  float64
 1   Cut(Quality)              53940 non-null  object 
 2   Color                     53940 non-null  object 
 3   Clarity                   53940 non-null  object 
 4   Depth                     53940 non-null  float64
 5   Table                     53940 non-null  float64
 6   Price(in US dollars)      53940 non-null  float64
 7   X(length)                 53940 non-null  float64
 8   Y(width)                  53940 non-null  float64
 9   Z(Depth)                  53940 non-null  float64
dtypes: float64(7), object(3)
memory usage: 4.1+ MB


In [118]:
df["Price(in US dollars)"].max()

18823.0

In [119]:
print(df["Price(in US dollars)"].min())

326.0


In [120]:
X = df.drop(columns=["Price(in US dollars)"])
y = df["Price(in US dollars)"]

In [121]:
X

Unnamed: 0,Carat(Weight of Daimond),Cut(Quality),Color,Clarity,Depth,Table,X(length),Y(width),Z(Depth)
0,0.23,Ideal,E,SI2,61.5,55.0,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,4.20,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...
53935,0.72,Ideal,D,SI1,60.8,57.0,5.75,5.76,3.50
53936,0.72,Good,D,SI1,63.1,55.0,5.69,5.75,3.61
53937,0.70,Very Good,D,SI1,62.8,60.0,5.66,5.68,3.56
53938,0.86,Premium,H,SI2,61.0,58.0,6.15,6.12,3.74


In [124]:
X["Table"].min()

43.0

In [125]:
X["Table"].max()

95.0

In [26]:
y

0         326.0
1         326.0
2         327.0
3         334.0
4         335.0
          ...  
53935    2757.0
53936    2757.0
53937    2757.0
53938    2757.0
53939    2757.0
Name: Price(in US dollars), Length: 53940, dtype: float64

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 27 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Carat(Weight of Daimond)  53940 non-null  float64
 1   Depth                     53940 non-null  float64
 2   Table                     53940 non-null  float64
 3   Price(in US dollars)      53940 non-null  float64
 4   X(length)                 53940 non-null  float64
 5   Y(width)                  53940 non-null  float64
 6   Z(Depth)                  53940 non-null  float64
 7   Cut(Quality)_Fair         53940 non-null  float64
 8   Cut(Quality)_Good         53940 non-null  float64
 9   Cut(Quality)_Ideal        53940 non-null  float64
 10  Cut(Quality)_Premium      53940 non-null  float64
 11  Cut(Quality)_Very Good    53940 non-null  float64
 12  Color_D                   53940 non-null  float64
 13  Color_E                   53940 non-null  float64
 14  Color_

In [27]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['Carat(Weight of Daimond)', 'Depth', 'Table', 'X(length)', 'Y(width)', 'Z(Depth)']),
        ('cat', OneHotEncoder(drop='first'), ['Cut(Quality)','Color','Clarity'])
    ], 
    remainder='passthrough'
)

In [28]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('lr', LinearRegression())
])

In [29]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y, cv=kfold, scoring='r2')

In [30]:
scores.mean(), scores.std()

(0.919590687273826, 0.0034279946080342204)

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=.25)

In [32]:
X_train.shape , X_test.shape

((40455, 9), (13485, 9))

In [33]:
pipeline.fit(X_train,y_train)

In [35]:
y_pred_lr = pipeline.predict(X_test)

In [37]:
print("r2", r2_score(y_test, y_pred_lr))
print('mae',mean_absolute_error(y_test, y_pred_lr))
print('mse', mean_squared_error(y_test, y_pred_lr))

r2 0.9195675153810781
mae 735.7197347338503
mse 1263569.5480387926


In [40]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = y_pred
    
    output.append(mean_absolute_error(y_test,y_pred))
    
    return output
    

In [41]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor()
}

In [42]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))



In [44]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [45]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
6,extra trees,0.980179,278.198108
5,random forest,0.976392,293.676137
9,mlp,0.963813,385.491573
4,decision tree,0.956209,386.427466
7,gradient boosting,0.952968,437.195596
3,LASSO,0.919353,732.429655
2,ridge,0.91959,736.711816
0,linear_reg,0.919591,737.151367
8,adaboost,0.853401,906.273348
1,svr,0.538092,1316.230755


### Target Encoding On Clarity Column

In [None]:
!pip install category_encoders

In [47]:
import category_encoders as ce
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['Carat(Weight of Daimond)', 'Depth', 'Table', 'X(length)', 'Y(width)', 'Z(Depth)']),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['Cut(Quality)','Color']),
        ('target_enc', ce.TargetEncoder(), ['Clarity'])
    ], 
    remainder='passthrough'
)

In [48]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('etr', ExtraTreesRegressor())
])

In [49]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y, cv=kfold, scoring='r2')

In [50]:
scores.mean(),scores.std()

(0.979379511638712, 0.0019203097918895238)

In [52]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    output.append(mean_absolute_error(y_test,y_pred))
    
    return output
    

In [51]:
model_dict = {
    'linear_reg':LinearRegression(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor()
}

In [53]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))



In [54]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [55]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
2,random forest,0.981131,275.415651
3,extra trees,0.979588,276.844164
1,decision tree,0.965025,360.123007
4,gradient boosting,0.965542,398.077863
6,mlp,0.912864,694.506566
0,linear_reg,0.900906,781.80968
5,adaboost,0.897229,882.821684


### Hyperparameter Tuning

In [56]:
from sklearn.model_selection import GridSearchCV

In [66]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['Carat(Weight of Daimond)', 'Depth', 'Table', 'X(length)', 'Y(width)', 'Z(Depth)']),
        ('cat1', OneHotEncoder(drop='first', sparse_output=False), ['Cut(Quality)', 'Color']),
        ('target_enc', ce.TargetEncoder(), ['Clarity'])
    ],
    remainder='passthrough'
)

In [67]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor())  # Instantiated
])

In [68]:
param_grid = {
    'regressor__n_estimators': [50, 100, 200, 300],
    'regressor__max_depth': [None, 10, 20, 30],
    'regressor__max_samples': [0.1, 0.25, 0.5, 1.0],
    'regressor__max_features': ['auto', 'sqrt']
}

In [69]:
# K-Fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
search = GridSearchCV(pipeline, param_grid, cv=kfold, scoring='r2', n_jobs=-1, verbose=4)


In [70]:
# Fit the model
search.fit(X, y)

Fitting 10 folds for each of 128 candidates, totalling 1280 fits
[CV 2/10] END regressor__max_depth=None, regressor__max_features=auto, regressor__max_samples=0.1, regressor__n_estimators=50;, score=nan total time=   0.1s
[CV 1/10] END regressor__max_depth=None, regressor__max_features=auto, regressor__max_samples=0.1, regressor__n_estimators=50;, score=nan total time=   0.1s
[CV 3/10] END regressor__max_depth=None, regressor__max_features=auto, regressor__max_samples=0.1, regressor__n_estimators=50;, score=nan total time=   0.1s
[CV 5/10] END regressor__max_depth=None, regressor__max_features=auto, regressor__max_samples=0.1, regressor__n_estimators=50;, score=nan total time=   0.1s
[CV 7/10] END regressor__max_depth=None, regressor__max_features=auto, regressor__max_samples=0.1, regressor__n_estimators=50;, score=nan total time=   0.1s
[CV 4/10] END regressor__max_depth=None, regressor__max_features=auto, regressor__max_samples=0.1, regressor__n_estimators=50;, score=nan total time= 



[CV 2/10] END regressor__max_depth=None, regressor__max_features=sqrt, regressor__max_samples=0.25, regressor__n_estimators=300;, score=0.972 total time=   6.9s
[CV 3/10] END regressor__max_depth=None, regressor__max_features=sqrt, regressor__max_samples=0.25, regressor__n_estimators=300;, score=0.974 total time=   7.0s
[CV 4/10] END regressor__max_depth=None, regressor__max_features=sqrt, regressor__max_samples=0.25, regressor__n_estimators=300;, score=0.971 total time=   7.0s
[CV 5/10] END regressor__max_depth=None, regressor__max_features=sqrt, regressor__max_samples=0.25, regressor__n_estimators=300;, score=0.968 total time=   7.2s
[CV 6/10] END regressor__max_depth=None, regressor__max_features=sqrt, regressor__max_samples=0.25, regressor__n_estimators=300;, score=0.969 total time=   7.0s
[CV 7/10] END regressor__max_depth=None, regressor__max_features=sqrt, regressor__max_samples=0.25, regressor__n_estimators=300;, score=0.970 total time=   7.0s
[CV 8/10] END regressor__max_depth

640 fits failed out of a total of 1280.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
243 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/rajchandravanshi/Desktop/Projects/venv/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/rajchandravanshi/Desktop/Projects/venv/lib/python3.10/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/rajchandravanshi/Desktop/Projects/venv/lib/python3.10/site-packages/sklearn/pipeline.py", line 473, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "/Users/rajcha

In [71]:
print("Best Parameters:", search.best_params_)
print("Best Score:", search.best_score_)

Best Parameters: {'regressor__max_depth': None, 'regressor__max_features': 'sqrt', 'regressor__max_samples': 1.0, 'regressor__n_estimators': 300}
Best Score: 0.976663678246949


### Exporting the model

In [126]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['Carat(Weight of Daimond)', 'Depth', 'Table', 'X(length)', 'Y(width)', 'Z(Depth)']),
        ('cat1', OneHotEncoder(drop='first', sparse_output=False), ['Cut(Quality)', 'Color']),
        ('target_enc', ce.TargetEncoder(), ['Clarity'])
    ],
    remainder='passthrough'
)

In [127]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=300, random_state=42))  # Instantiated
])

In [128]:
pipeline.fit(X,y)

In [129]:
y_pred_rf = pipeline.predict(X_test)

In [130]:
r2_score(y_test, y_pred_rf)

0.9974719860513086

In [108]:
# K-Fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y, cv=kfold, scoring='r2')


In [109]:
scores.mean(), scores.std()

(0.981145247781353, 0.0009273769335119094)

In [131]:
import pickle

with open('pipeline2.pkl', 'wb') as file:
    pickle.dump(pipeline, file)

In [111]:
with open('df.pkl', 'wb') as file:
    pickle.dump(X, file)

In [133]:
pipeline_etr = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', ExtraTreesRegressor(n_estimators=500, random_state=42))  # Instantiated
])

In [None]:
pipeline_etr.fit(X,y)

In [134]:
# K-Fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
search = cross_val_score(pipeline_etr, X, y, cv=kfold, scoring='r2')

In [135]:
search.mean() , search.std()

(0.9796341777287811, 0.0018275140706172574)

In [136]:
from sklearn.ensemble import StackingRegressor

In [137]:
base_reg = [
    ('etr', ExtraTreesRegressor(n_estimators=300, random_state=42)),
    ('gb', GradientBoostingRegressor(random_state=42, n_estimators=100, learning_rate=0.3)),
    ('mlp', MLPRegressor())
]
meta_reg = RandomForestRegressor(n_estimators=500, random_state=42)

In [138]:
pipeline_final = Pipeline([
    ('preprocessor', preprocessor),
    ('stacking_rg',StackingRegressor(
    estimators= base_reg,
    final_estimator= meta_reg,
    cv=5  # Cross-validation for the meta-classifier
))
])

In [139]:
pipeline_final.fit(X_train, y_train)



In [140]:
y_pred_rf = pipeline_final.predict(X_test)

In [141]:
r2_score(y_test, y_pred_rf)

0.9793378908957393

In [None]:
# Define K-Fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

# Evaluate pipeline using cross-validation
scores = cross_val_score(pipeline_final, X, y, cv=kfold, scoring='r2', n_jobs=-1)

# Output the results
print("Cross-Validation R² Scores:", scores)
print("Mean R² Score:", scores.mean())
print("Standard Deviation of R² Scores:", scores.std())