In [9]:
import pandas as pd 
import numpy as np
import pickle
import json

In [10]:
# Read the listings table from disk and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="full_data.csv")
df.head(n=5)

Unnamed: 0,type,total_price,carat,price_per_carat,color,shape,length,width,height,clarity,cut,color_intensity,origin,treatment,cut_quality
0,Blue Sapphire,300,0.96,313.0,Blue,Cushion,8.17,5.26,2.49,Very Slightly Included,Mixed Brilliant,Intense,Nigeria,No Enhancement,Fair
1,Blue Sapphire,300,0.75,400.0,Blue,Oval,5.74,4.47,3.73,Very Slightly Included,Emerald Cut,Vivid,Ceylon (Sri Lanka),Heated,Fair
2,Blue Sapphire,300,0.75,400.0,Blue,Emerald Cut,6.0,4.33,2.87,Slightly Included,Mixed Brilliant,Medium Intense,Ceylon (Sri Lanka),No Enhancement,Good
3,Blue Sapphire,300,0.6,500.0,Blue,Asscher - Octagon,4.97,4.9,2.78,Very Slightly Included,Asscher,Vivid,Ceylon (Sri Lanka),Heated,Fair
4,Blue Sapphire,300,0.6,500.0,Blue,Asscher - Octagon,4.78,4.77,2.97,Very Slightly Included,Asscher,Intense,Ceylon (Sri Lanka),Heated,Fair


In [11]:
# Hold out 30% of the rows for evaluation (fixed seed so the split is reproducible)
from sklearn.model_selection import train_test_split

# Frame without the physical dimensions (not referenced by the cells visible here)
data = df.drop(columns=['length', 'width', 'height'])

# Predictors: carat plus the categorical descriptors; price_per_carat is carried
# along at this point as well.
feature_columns = [
    'carat', 'price_per_carat', 'color', 'shape', 'clarity', 'cut',
    'color_intensity', 'origin', 'treatment', 'cut_quality', 'type',
]
X = df[feature_columns]

# Regression target
y = df['total_price']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    shuffle=True,
)

In [12]:
from sklearn.preprocessing import OrdinalEncoder

# The split frames are derived from a column selection of `df`. Take explicit
# copies so the column assignments below operate on independent frames and do
# not raise pandas' SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# cut_quality is encoded ordinally using the listed category order
# (Poor=0 ... Excellent=4). The encoder is fitted on the training split only.
oe = OrdinalEncoder(categories=[['Poor', 'Fair', 'Good','Very Good','Excellent']])
X_train['cut_quality_encoded'] = oe.fit_transform(X_train[['cut_quality']])
X_test['cut_quality_encoded'] = oe.transform(X_test[['cut_quality']])

# Mean encoding: for every categorical column, the mean price_per_carat of each
# category, computed from the training split only.
type_mean = X_train.groupby('type')['price_per_carat'].mean()
shape_means = X_train.groupby('shape')['price_per_carat'].mean()
origin_means = X_train.groupby('origin')['price_per_carat'].mean()
color_means = X_train.groupby('color')['price_per_carat'].mean()
color_intensity_means = X_train.groupby('color_intensity')['price_per_carat'].mean()
clarity_means = X_train.groupby('clarity')['price_per_carat'].mean()
treatment_means = X_train.groupby('treatment')['price_per_carat'].mean()
cut_means = X_train.groupby('cut')['price_per_carat'].mean()

# Fallback value for categories that occur in the test split but not in training
global_mean = X_train['price_per_carat'].mean()

# (source column, per-category means). The order of this list determines the
# order of the encoded columns in both frames, so keep it stable: the fitted
# model sees the features in this order.
target_encodings = [
    ('shape', shape_means),
    ('origin', origin_means),
    ('color', color_means),
    ('color_intensity', color_intensity_means),
    ('clarity', clarity_means),
    ('cut', cut_means),
    ('treatment', treatment_means),
    ('type', type_mean),
]

for column, category_means in target_encodings:
    encoded_name = f'{column}_encoded'
    # Training rows always map to a known category mean
    X_train[encoded_name] = X_train[column].map(category_means)
    # Test rows with an unseen category fall back to the global training mean
    X_test[encoded_name] = X_test[column].map(category_means).fillna(global_mean)

# Drop the raw columns (and price_per_carat, which was only needed to build the
# encodings) so that only numeric features remain.
raw_columns = ['price_per_carat', 'color', 'shape', 'color_intensity', 'origin', 'cut', 'treatment', 'clarity','cut_quality','type']
X_train = X_train.drop(columns=raw_columns)
X_test = X_test.drop(columns=raw_columns)
print(X_train.dtypes)

carat                      float64
cut_quality_encoded        float64
shape_encoded              float64
origin_encoded             float64
color_encoded              float64
color_intensity_encoded    float64
clarity_encoded            float64
cut_encoded                float64
treatment_encoded          float64
type_encoded               float64
dtype: object


In [13]:
# Category -> ordinal code, following the fitted encoder's category order
cut_quality_categories = oe.categories_[0]
cut_quality_mapping = dict(zip(cut_quality_categories, range(len(cut_quality_categories))))
print(cut_quality_mapping)

# Plain-dict versions of the per-category mean encodings (category -> mean)
(shape_mapping, origin_mapping, color_mapping, color_intensity_mapping,
 clarity_mapping, treatment_mapping, cut_mapping, type_mapping) = (
    means.to_dict()
    for means in (shape_means, origin_means, color_means, color_intensity_means,
                  clarity_means, treatment_means, cut_means, type_mean)
)

# Show one of the mappings as a sample
print("Shape Mapping:", shape_mapping)

# Fallback used for categories without a mapping entry
global_mean_value = global_mean
print("Global Mean:", global_mean_value)

# Bundle every mapping under a descriptive key
all_mappings = dict(
    cut_quality_mapping=cut_quality_mapping,
    shape_mapping=shape_mapping,
    origin_mapping=origin_mapping,
    color_mapping=color_mapping,
    color_intensity_mapping=color_intensity_mapping,
    clarity_mapping=clarity_mapping,
    treatment_mapping=treatment_mapping,
    cut_mapping=cut_mapping,
    type_mapping=type_mapping,
    global_mean=global_mean_value,
)

# Persist the bundle as JSON
with open("encoded_features.json", "w") as json_file:
    json.dump(all_mappings, json_file, indent=4)


{'Poor': 0, 'Fair': 1, 'Good': 2, 'Very Good': 3, 'Excellent': 4}
Shape Mapping: {'Asscher - Octagon': 1926.2276000000002, 'Cushion': 2776.0080550458715, 'Emerald Cut': 2085.432578125, 'Fancy': 1275.3796296296296, 'Heart': 2952.6014835164833, 'Marquise': 2341.0526315789475, 'Oval': 2482.8079529529527, 'Pear': 2304.0555650684933, 'Princess': 1348.6153846153845, 'Radiant': 1483.020996978852, 'Round': 2118.0156686290998, 'Trillion': 1054.6752136752136}
Global Mean: 2335.5366704130374


In [14]:
from xgboost import XGBRegressor
import xgboost
# Import only the metrics that are used instead of `import *`, so it is clear
# where each name comes from and nothing else in the namespace gets shadowed.
from sklearn.metrics import mean_squared_error, r2_score

# Gradient-boosted regressor fitted directly on total_price
model = XGBRegressor(
    n_estimators=20,
    random_state=42,
    learning_rate=0.4
    )

model.fit(X_train, y_train)
y_predict = model.predict(X_test)
y_predict_train = model.predict(X_train)

# R^2 (coefficient of determination) on the held-out and the training split.
# Comparing the two shows how much the model overfits.
score = r2_score(y_test, y_predict)
score_train = r2_score(y_train, y_predict_train)

# Test-set MSE, computed once and reused for the RMSE
test_mse = mean_squared_error(y_test, y_predict)

print('-------Gradient Booster-------')
print('r2_score is', score)
print('r2_score for training is', score_train)
print('mean_squared_error is==', test_mse)
print('root_mean_squared_error is==', np.sqrt(test_mse))

-------Gradient Booster-------
r2_score is 0.8973613064869609
r2_score for training is 0.9667103724790951
mean_squared_error is== 16973274.7744309
root_mean_squared_error is== 4119.863441235753


In [15]:
# Serialize the fitted model to disk with pickle
with open("xgb_price_predict_model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

print("Model saved successfully!")


Model saved successfully!


In [16]:
from xgboost import XGBRegressor
import xgboost
# Import only the metrics that are used instead of `import *`
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Square-root transform the target variable: the model below is fitted on
# sqrt(total_price), not on total_price itself.
y_train_sqrt = np.sqrt(y_train)

model = XGBRegressor(
    n_estimators=20,
    random_state=42,
    learning_rate=0.4
)

model.fit(X_train, y_train_sqrt)

# Predict on the sqrt scale, then invert the transformation. A negative root
# prediction has no valid price behind it, and squaring it would produce a
# spurious positive price, so clip at 0 before squaring.
y_predict_sqrt = model.predict(X_test)
y_predict = np.square(np.clip(y_predict_sqrt, 0, None))
y_predict_train = np.square(np.clip(model.predict(X_train), 0, None))

# Evaluate the model on the original price scale
score = r2_score(y_test, y_predict)
score_train = r2_score(y_train, y_predict_train)
test_mse = mean_squared_error(y_test, y_predict)  # computed once, reused for RMSE
print('-------Gradient Booster-------')
print('r2_score is', score)
print('r2_score for training is', score_train)
print('mean_squared_error is==', test_mse)
print('root_mean_squared_error is==', np.sqrt(test_mse))

# Save the trained model to a pickle file.
# NOTE: this writes to the same path as the model saved earlier in the notebook
# and replaces it. The pickled model outputs sqrt(total_price); whoever loads
# this file must square (and clip at 0) its predictions to get prices.
with open("xgb_price_predict_model.pkl", "wb") as f:
    pickle.dump(model, f)

print("Model saved successfully!")

-------Gradient Booster-------
r2_score is 0.9069142515687254
r2_score for training is 0.966839810970065
mean_squared_error is== 15393512.247959943
root_mean_squared_error is== 3923.4566708401335
Model saved successfully!
