In [26]:
import pandas as pd 
import numpy as np
import pickle
import json

In [27]:
# Read the cleaned gemstone listing table (path is relative to the working
# directory) and preview its first five rows.
# NOTE(review): the preview shows an "Unnamed: 0" column, presumably a saved
# index from an earlier export -- confirm before relying on it.
df = pd.read_csv(filepath_or_buffer="full_data_cleaned.csv")
df.head(n=5)

Unnamed: 0,type,total_price,carat,price_per_carat,color,shape,length,width,height,clarity,cut,color_intensity,origin,treatment,cut_quality
0,Blue Sapphire,100,0.5,200.0,Blue,Oval,5.77,4.53,2.23,Very Slightly Included,Mixed Brilliant,Very Light,Ceylon (Sri Lanka),No Enhancement,Fair
1,Pink Sapphire,100,0.2,500.0,Purplish Pink,Round,3.67,3.68,1.97,Very Slightly Included,Mixed Brilliant,Medium,Ceylon (Sri Lanka),No Enhancement,Fair
2,Pink Sapphire,100,0.25,400.0,Pinkish Purple,Round,3.39,3.34,2.79,Very Slightly Included,Mixed Brilliant,Intense,Ceylon (Sri Lanka),No Enhancement,Fair
3,Pink Sapphire,100,0.25,400.0,Pink,Oval,4.21,3.23,2.35,Very Slightly Included,Mixed Brilliant,Light,Ceylon (Sri Lanka),No Enhancement,Good
4,Pink Sapphire,100,0.25,400.0,Pink,Oval,3.99,3.19,2.15,Slightly Included,Mixed Brilliant,Medium Light,Ceylon (Sri Lanka),No Enhancement,Good


In [28]:
# Build the feature frame and target series, then hold out 30% of the rows
# for testing with a fixed seed so the split is reproducible.
from sklearn.model_selection import train_test_split

# Copy of the table without the physical-dimension columns. It is not used to
# build X below (the features are selected from `df` directly).
data = df.drop(columns=['length', 'width', 'height'])

# `price_per_carat` is selected here as well; it is only used to derive the
# per-category means in the encoding step.
feature_columns = [
    'carat', 'price_per_carat', 'color', 'shape', 'clarity', 'cut',
    'color_intensity', 'origin', 'treatment', 'cut_quality', 'type',
]
X = df[feature_columns]
y = df['total_price']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    shuffle=True,
)

In [29]:
# Encode every categorical feature as a number.
#
# NOTE: this cell rebinds X_train / X_test to frames WITHOUT the raw columns,
# so it can only run once per split; re-run the split cell before re-running
# this one, otherwise the column lookups below raise KeyError.
from sklearn.preprocessing import OrdinalEncoder

# Work on independent copies so the column assignments below never write into
# (or warn about writing into) a slice of the frame the split was taken from.
X_train = X_train.copy()
X_test = X_test.copy()

# Ordinal-encode cut quality with the explicit order Poor < Fair < Good <
# Very Good < Excellent. The encoder is fitted on the training rows only and
# then reused, unchanged, for the test rows.
oe = OrdinalEncoder(categories=[['Poor', 'Fair', 'Good', 'Very Good', 'Excellent']])
X_train['cut_quality_encoded'] = oe.fit_transform(X_train[['cut_quality']])
X_test['cut_quality_encoded'] = oe.transform(X_test[['cut_quality']])

# Target encoding: mean price_per_carat per category, computed from the
# training rows only. Each series keeps its own name because the mapping
# export cell reads them individually.
type_mean = X_train.groupby('type')['price_per_carat'].mean()
shape_means = X_train.groupby('shape')['price_per_carat'].mean()
origin_means = X_train.groupby('origin')['price_per_carat'].mean()
color_means = X_train.groupby('color')['price_per_carat'].mean()
color_intensity_means = X_train.groupby('color_intensity')['price_per_carat'].mean()
clarity_means = X_train.groupby('clarity')['price_per_carat'].mean()
treatment_means = X_train.groupby('treatment')['price_per_carat'].mean()
cut_means = X_train.groupby('cut')['price_per_carat'].mean()

# (source column, per-category means). The order of this table is the order in
# which the *_encoded columns are appended, i.e. the feature order the model
# is trained on -- do not reorder it.
target_encodings = (
    ('shape', shape_means),
    ('origin', origin_means),
    ('color', color_means),
    ('color_intensity', color_intensity_means),
    ('clarity', clarity_means),
    ('cut', cut_means),
    ('treatment', treatment_means),
    ('type', type_mean),
)

# Fallback for categories that appear in the test rows but not in training.
global_mean = X_train['price_per_carat'].mean()

for column, category_means in target_encodings:
    encoded_name = f'{column}_encoded'
    X_train[encoded_name] = X_train[column].map(category_means)
    # Unseen categories map to NaN; replace those with the training-wide mean.
    X_test[encoded_name] = X_test[column].map(category_means).fillna(global_mean)

# Drop the raw categorical columns and price_per_carat so that only `carat`
# and the encoded columns remain as model inputs.
raw_columns = [
    'price_per_carat', 'color', 'shape', 'color_intensity', 'origin',
    'cut', 'treatment', 'clarity', 'cut_quality', 'type',
]
X_train = X_train.drop(columns=raw_columns)
X_test = X_test.drop(columns=raw_columns)
print(X_train.dtypes)

carat                      float64
cut_quality_encoded        float64
shape_encoded              float64
origin_encoded             float64
color_encoded              float64
color_intensity_encoded    float64
clarity_encoded            float64
cut_encoded                float64
treatment_encoded          float64
type_encoded               float64
dtype: object


In [30]:
# Cut-quality lookup: category label -> integer position in the fitted
# encoder's category list.
cut_quality_levels = oe.categories_[0]
cut_quality_mapping = dict(zip(cut_quality_levels, range(len(cut_quality_levels))))
print(cut_quality_mapping)

# Convert each per-category mean series into a plain {category: mean} dict.
(
    shape_mapping,
    origin_mapping,
    color_mapping,
    color_intensity_mapping,
    clarity_mapping,
    treatment_mapping,
    cut_mapping,
    type_mapping,
) = [
    means.to_dict()
    for means in (
        shape_means,
        origin_means,
        color_means,
        color_intensity_means,
        clarity_means,
        treatment_means,
        cut_means,
        type_mean,
    )
]

# Show two of the mappings as examples.
print("Shape Mapping:", shape_mapping)
print("Origin Mapping:", origin_mapping)

# Fallback value used for categories that were not seen during training.
global_mean_value = global_mean
print("Global Mean:", global_mean_value)

# Bundle every lookup (plus the fallback) under a descriptive key.
all_mappings = dict(
    cut_quality_mapping=cut_quality_mapping,
    shape_mapping=shape_mapping,
    origin_mapping=origin_mapping,
    color_mapping=color_mapping,
    color_intensity_mapping=color_intensity_mapping,
    clarity_mapping=clarity_mapping,
    treatment_mapping=treatment_mapping,
    cut_mapping=cut_mapping,
    type_mapping=type_mapping,
    global_mean=global_mean_value,
)

# Write the bundle to disk as indented JSON.
with open("encoded_features.json", "w") as mappings_file:
    json.dump(all_mappings, mappings_file, indent=4)


{'Poor': 0, 'Fair': 1, 'Good': 2, 'Very Good': 3, 'Excellent': 4}
Shape Mapping: {'Asscher - Octagon': 1889.0093200000001, 'Cushion': 2549.411798124655, 'Emerald Cut': 1765.0123803526449, 'Fancy': 1022.3779069767442, 'Heart': 2969.8839344262296, 'Marquise': 2188.5925925925926, 'Oval': 2103.4730839633903, 'Pear': 1972.5375070821528, 'Princess': 1213.6666666666667, 'Radiant': 1335.191882951654, 'Round': 1738.3217583892617, 'Trillion': 820.7027027027027}
Origin Mapping: {'Africa': 579.0555555555555, 'Australia': 886.0, 'Burma (Myanmar)': 5404.807127071823, 'Cambodia': 1771.0, 'Ceylon (Sri Lanka)': 1559.0118137254901, 'Colombia': 1757.0, 'East Africa': 1749.857142857143, 'Ethiopia': 3907.0, 'Madagascar': 2891.460524691358, 'Montana': 1210.521327014218, 'Mozambique': 5780.212653778559, 'Nigeria': 1206.6, 'Tajikistan': 3700.0, 'Tanzania': 1630.3772727272728, 'Thailand': 1337.5103896103897, 'Thailand (Siam)': 3904.2280701754385, 'Unknown': 8000.0, 'West Africa': 3300.0}
Global Mean: 2013.7229

In [31]:
# Fit an XGBoost regressor on log1p(total_price) and report fit statistics.
from xgboost import XGBRegressor
import xgboost
# Import only the metrics that are used instead of `import *`, so the notebook
# namespace is not silently filled with every sklearn.metrics name.
from sklearn.metrics import mean_squared_error, r2_score

model = XGBRegressor(
    n_estimators=20,
    random_state=42,
    learning_rate=0.4
    )

# The model is trained on log1p of the price, so its predictions and every
# metric below are in log space, not in price units (np.expm1 inverts it).
y_train_log = np.log1p(y_train)
y_test_log = np.log1p(y_test)
model.fit(X_train, y_train_log)
y_predict = model.predict(X_test)
y_predict_train = model.predict(X_train)

# R^2 (coefficient of determination) on held-out and training rows. This is a
# regression fit statistic, not a classification accuracy.
score = r2_score(y_test_log, y_predict)
score_train = r2_score(y_train_log, y_predict_train)

# Compute the test MSE once and derive the RMSE from it.
test_mse_log = mean_squared_error(y_test_log, y_predict)

print('-------Gradient Booster-------')
print('r2_score is', score)
print('r2_score for training is', score_train)
print('mean_squared_error is==', test_mse_log)
print('root_mean_squared_error is==', np.sqrt(test_mse_log))

-------Gradient Booster-------
r2_score is 0.9626681166535379
r2_score for training is 0.975218895488836
mean_squared_error is== 0.0938804507385865
root_mean_squared_error is== 0.3063991689587074


In [32]:
# Persist the fitted regressor to disk with pickle, then confirm on stdout.
# NOTE(review): a pickle is presumably only loadable with compatible library
# versions -- confirm the consumer's environment matches.
model_path = "xgb_price_predict_model.pkl"
with open(model_path, mode="wb") as model_file:
    pickle.dump(model, model_file)

print("Model saved successfully!")


Model saved successfully!
