In [56]:
import pandas as pd 
import numpy as np 

In [13]:
df = pd.read_csv("full_data.csv")
df.head()

Unnamed: 0,type,total_price,carat,price_per_carat,color,shape,length,width,height,clarity,cut,color_intensity,origin,treatment,cut_quality
0,Blue Sapphire,902,0.82,1100.0,Blue,Princess,7.0,7.0,4.21,Very Slightly Included,Princess Cut,Vivid,Thailand,Heated,Good
1,Blue Sapphire,1008,0.84,1200.0,Greyish Blue,Pear,7.33,4.84,3.57,Very Slightly Included,Mixed Brilliant,Medium,Montana,No Enhancement,Excellent
2,Blue Sapphire,1290,1.29,1000.0,Bluish Grey,Cushion,6.86,5.74,3.99,Slightly Included,Mixed Brilliant,Medium Light,Montana,Heated,Good
3,Blue Sapphire,3743,2.13,1757.0,Blue,Princess,7.17,7.13,4.1,Eye Clean,Step Cut,Vivid,Thailand,Heated,Good
4,Blue Sapphire,2314,1.78,1300.0,Greyish Blue,Cushion,8.37,6.48,3.68,Very Slightly Included,Mixed Brilliant,Intense,Montana,No Enhancement,Good


In [96]:
# Split data into training and testing data
from sklearn.model_selection import train_test_split

data = df.drop(['length', 'width', 'height'], axis='columns')
X = df[['carat','price_per_carat', 'color', 'shape', 'clarity', 'cut', 'color_intensity', 'origin', 'treatment','cut_quality','type']]
y = df['total_price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, shuffle=True)

In [58]:
from sklearn.preprocessing import OrdinalEncoder
oe = OrdinalEncoder(categories=[['Poor', 'Fair', 'Good','Very Good','Excellent']])
X_train['cut_quality_encoded'] = oe.fit_transform(X_train[['cut_quality']])
X_test['cut_quality_encoded'] = oe.transform(X_test[['cut_quality']])

# Handle target
type_mean = X_train.groupby('type')['price_per_carat'].mean()
shape_means = X_train.groupby('shape')['price_per_carat'].mean()
origin_means = X_train.groupby('origin')['price_per_carat'].mean()
color_means = X_train.groupby('color')['price_per_carat'].mean()
color_intensity_means = X_train.groupby('color_intensity')['price_per_carat'].mean()
clarity_means = X_train.groupby('clarity')['price_per_carat'].mean()
treatment_means = X_train.groupby('treatment')['price_per_carat'].mean()
cut_means = X_train.groupby('cut')['price_per_carat'].mean()

# Map target encoding to the training data
X_train['shape_encoded'] = X_train['shape'].map(shape_means)
X_train['origin_encoded'] = X_train['origin'].map(origin_means)
X_train['color_encoded'] = X_train['color'].map(color_means)
X_train['color_intensity_encoded'] = X_train['color_intensity'].map(color_intensity_means)
X_train['clarity_encoded'] = X_train['clarity'].map(clarity_means)
X_train['cut_encoded'] = X_train['cut'].map(cut_means)
X_train['treatment_encoded'] = X_train['treatment'].map(treatment_means)
X_train['type_encoded'] = X_train['type'].map(type_mean)

# Map target encoding to the test data
# Handle unseen categories by filling with global mean
global_mean = X_train['price_per_carat'].mean()
X_test['shape_encoded'] = X_test['shape'].map(shape_means).fillna(global_mean)
X_test['origin_encoded'] = X_test['origin'].map(origin_means).fillna(global_mean)
X_test['color_encoded'] = X_test['color'].map(color_means).fillna(global_mean)
X_test['color_intensity_encoded'] = X_test['color_intensity'].map(color_intensity_means).fillna(global_mean)
X_test['clarity_encoded'] = X_test['clarity'].map(clarity_means).fillna(global_mean)
X_test['cut_encoded'] = X_test['cut'].map(cut_means).fillna(global_mean)
X_test['treatment_encoded'] = X_test['treatment'].map(treatment_means).fillna(global_mean)
X_test['type_encoded'] = X_test['type'].map(type_mean).fillna(global_mean)

X_train = X_train.drop(columns=['price_per_carat', 'color', 'shape', 'color_intensity', 'origin', 'cut', 'treatment', 'clarity','cut_quality','type'])
X_test = X_test.drop(columns=['price_per_carat', 'color', 'shape', 'color_intensity', 'origin', 'cut', 'treatment', 'clarity','cut_quality','type'])
print(X_train.dtypes)

carat                      float64
cut_quality_encoded        float64
shape_encoded              float64
origin_encoded             float64
color_encoded              float64
color_intensity_encoded    float64
clarity_encoded            float64
cut_encoded                float64
treatment_encoded          float64
type_encoded               float64
dtype: object


In [None]:
from xgboost import XGBRegressor
import xgboost
from sklearn.metrics import *
model = XGBRegressor(
    n_estimators=20,
    random_state=42,
    learning_rate=0.4
    )

model.fit(X_train, y_train)
y_predict = model.predict(X_test)
y_predict_train = model.predict(X_train)

# Predicting the accuracy score for Gradient Booster
score=r2_score(y_test, y_predict)
score_train = r2_score(y_train, y_predict_train)
print('-------Gradient Booster-------')
print('r2_score is', score)
print('r2_score for training is', score_train)
print('mean_squared_error is==', mean_squared_error(y_test,y_predict))
print('root_mean_squared_error is==',np.sqrt(mean_squared_error(y_test, y_predict)))

-------Gradient Booster-------
r2_score is 0.8336389660835266
r2_score for training is 0.9866280555725098
mean_squared_error is== 178383520.0
root_mean_squared_error is== 13356.029350072573


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import TargetEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer

# Define the column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', TargetEncoder())
    ])

# Define the pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression())
])

In [127]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import TargetEncoder
from xgboost import XGBRegressor

# Categorical columns to target encode
categorical_columns = ['shape', 'origin', 'color', 'color_intensity', 'clarity', 'cut', 'treatment', 'type', 'cut_quality']

# Fit the TargetEncoder directly
target_encoder = TargetEncoder(target_type='continuous')
target_encoder.fit(X_train[categorical_columns], y=X_train['price_per_carat'])

# Now create the preprocessing steps with the fitted encoder
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', target_encoder, categorical_columns)
    ])

# Create a pipeline with the preprocessor and model
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', XGBRegressor())
])

# Fit the pipeline:
# Here, we use 'total_price' as the target for the model training
pipeline.fit(X_train, y_train)

# After fitting, you can transform your data and predict:
X_train_transformed = pipeline.named_steps['preprocessor'].transform(X_train)
X_test_transformed = pipeline.named_steps['preprocessor'].transform(X_test)

"""# Predict on transformed data
y_pred = pipeline.predict(X_test)"""

'# Predict on transformed data\ny_pred = pipeline.predict(X_test)'

In [134]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import TargetEncoder
from xgboost import XGBRegressor
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestRegressor
# Categorical columns to target encode
categorical_columns = ['shape', 'origin', 'color', 'color_intensity', 'clarity', 'cut', 'treatment', 'type', 'cut_quality']

# Columns to keep (all columns except 'price_per_carat')
all_columns = X_train.columns.tolist()
columns_to_keep = [col for col in all_columns if col != 'price_per_carat']

# Fit the TargetEncoder directly
target_encoder = TargetEncoder(target_type='continuous')
target_encoder.fit(X_train[categorical_columns], y=X_train['price_per_carat'])

# Custom transformer to drop 'price_per_carat'
class ColumnDropper(BaseEstimator, TransformerMixin):
    def __init__(self, columns_to_drop):
        self.columns_to_drop = columns_to_drop

    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return X.drop(columns=self.columns_to_drop)

# Now create the preprocessing steps with the fitted encoder
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', target_encoder, categorical_columns)
    ],
    remainder='passthrough'  # Pass through all other columns not specified in transformers
)

# Create a pipeline with the preprocessor and model
pipeline = Pipeline([
    ('drop_columns', ColumnDropper(columns_to_drop=['price_per_carat'])),  # Corrected here
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor())
])

# Fit the pipeline:
# Here, we use 'total_price' as the target for the model training
pipeline.fit(X_train, y_train)

# After fitting, you can transform your data and predict:
X_train_transformed = pipeline.named_steps['preprocessor'].transform(X_train)
X_test_transformed = pipeline.named_steps['preprocessor'].transform(X_test)


In [135]:
# Fit the pipeline:
pipeline.fit(X_train, y_train)

# Predict on the test set
y_pred = pipeline.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
#rmse = mean_squared_error(y_test, y_pred, squared=False)  # Root Mean Squared Error
r2 = r2_score(y_test, y_pred)

# Print evaluation metrics
print(f"Mean Squared Error: {mse}")
#print(f"Root Mean Squared Error: {rmse}")
print(f"R-squared Score: {r2}")

# If you want to see the feature importance from XGBoost (only after fitting)
feature_importance = pipeline.named_steps['regressor'].feature_importances_
feature_names = pipeline.named_steps['preprocessor'].get_feature_names_out()

# Combine feature names with their importance
feature_importance_dict = dict(zip(feature_names, feature_importance))
sorted_features = sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True)

print("\nFeature Importances:")
for feature, importance in sorted_features[:10]:  # Print top 10 features
    print(f"{feature}: {importance}")

Mean Squared Error: 225357139.49700257
R-squared Score: 0.7898311942857734

Feature Importances:
remainder__carat: 0.6768466730554533
cat__cut_quality: 0.08375362835299761
cat__origin: 0.05999553680432098
cat__shape: 0.05577224384957885
cat__color: 0.04515137948735051
cat__treatment: 0.022432265729702867
cat__type: 0.020827851591269538
cat__clarity: 0.016079614011922906
cat__color_intensity: 0.01036731246828619
cat__cut: 0.0087734946491173
