# Model Training:

In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor

from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [2]:
df= pd.read_csv('data/gemstone_cleaned.csv')

In [3]:
df

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,0.70,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.50,2772
3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,1.70,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453
...,...,...,...,...,...,...,...,...,...,...
193540,0.31,Ideal,D,VVS2,61.1,56.0,4.35,4.39,2.67,1130
193541,0.70,Premium,G,VVS2,60.3,58.0,5.75,5.77,3.47,2874
193542,0.73,Very Good,F,SI1,63.1,57.0,5.72,5.75,3.62,3036
193543,0.34,Very Good,D,SI1,62.9,55.0,4.45,4.49,2.81,681


In [4]:
X = df[df.columns[:-1]]
y = df[df.columns[-1]]

In [5]:
X.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55
1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05
2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5
3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71
4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77


In [6]:
y

0         13619
1         13387
2          2772
3           666
4         14453
          ...  
193540     1130
193541     2874
193542     3036
193543      681
193544     2258
Name: price, Length: 193545, dtype: int64

### Automating the feature engineering process:

In [7]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin # To create custom transformer

In [8]:
cat_cols = X.columns[X.dtypes == 'O']
num_cols = X.columns[X.dtypes != 'O']

In [9]:
cat_cols, num_cols

(Index(['cut', 'color', 'clarity'], dtype='object'),
 Index(['carat', 'depth', 'table', 'x', 'y', 'z'], dtype='object'))

In [10]:
cut_categories = ['Fair', 'Good', 'Very Good','Premium','Ideal']
color_categories = ['D', 'E', 'F', 'G', 'H', 'I', 'J']
clarity_categories = ['I1','SI2','SI1','VS2','VS1','VVS2','VVS1','IF']

In [11]:
class VolumeCalculator(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    
    def fit(self, X, y= None):
        return self

    def transform(self, X):
        X = pd.DataFrame(X, columns= num_cols)
        X['volume'] = X['x'] * X['y'] * X['z']
        X.drop(columns= ['x', 'y', 'z'], inplace= True)
        return X
        
    def get_feature_names_out(self, input_features=None):
        if input_features is not None:
            input_features = list(input_features)
            if 'x' in input_features and 'y' in input_features and 'z' in input_features:
                input_features.remove('x')
                input_features.remove('y')
                input_features.remove('z')
            return input_features + ['volume']
        return ['volume']

### Creating Pipelines:

In [12]:
num_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy= 'median')),
        ('vol_calc', VolumeCalculator()),
        ('scaler', StandardScaler())
   ]
)


cat_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy= 'most_frequent')),
        ('encoder', OrdinalEncoder(categories= [cut_categories, color_categories, clarity_categories])),
        ('scaler', StandardScaler())
    ]
)

In [13]:
preprocessor = ColumnTransformer(
    [
        ('num_pipeline', num_pipeline, num_cols),
        ('cat_pipeline', cat_pipeline, cat_cols)
    ]
)

#### Train Test Split:

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.30, random_state= 30)

In [16]:
X_train

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
190014,1.21,Very Good,H,VS1,61.1,56.0,6.81,6.88,4.18
132542,0.54,Ideal,E,SI1,62.0,56.0,5.25,5.23,3.25
11330,0.41,Ideal,D,SI2,62.0,57.0,4.77,4.71,2.94
139419,1.71,Premium,D,SI2,61.4,59.0,7.67,7.72,4.73
48497,0.30,Premium,H,VVS1,62.1,58.0,4.27,4.33,2.67
...,...,...,...,...,...,...,...,...,...
66455,0.36,Ideal,H,VVS1,61.0,56.0,4.59,4.61,2.81
46220,0.29,Good,E,VVS1,60.2,61.0,4.24,4.28,2.56
98804,0.33,Premium,G,VS2,61.0,58.0,4.42,4.45,2.70
48045,0.40,Very Good,G,SI1,62.9,57.0,4.67,4.70,2.94


In [17]:
X_train = pd.DataFrame(preprocessor.fit_transform(X_train), columns= preprocessor.get_feature_names_out())
X_test = pd.DataFrame(preprocessor.transform(X_test), columns= preprocessor.get_feature_names_out())

In [18]:
X_train

Unnamed: 0,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,num_pipeline__volume,cat_pipeline__cut,cat_pipeline__color,cat_pipeline__clarity
0,0.908355,-0.664845,-0.642234,0.898573,-1.140172,0.912657,0.681928
1,-0.543977,0.168016,-0.642234,-0.530867,0.873183,-0.936170,-0.648750
2,-0.825773,0.168016,-0.120942,-0.841737,0.873183,-1.552445,-1.314089
3,1.992185,-0.387225,0.921643,2.027961,-0.133495,-1.552445,-1.314089
4,-1.064215,0.260556,0.400351,-1.065472,-0.133495,0.912657,2.012607
...,...,...,...,...,...,...,...
135476,-0.934156,-0.757385,-0.642234,-0.930136,0.873183,0.912657,2.012607
135477,-1.085892,-1.497705,1.964228,-1.104478,-2.146850,-0.936170,2.012607
135478,-0.999185,-0.757385,0.400351,-1.015320,-0.133495,0.296381,0.016589
135479,-0.847449,1.000876,-0.120942,-0.862145,-1.140172,0.296381,-0.648750


In [19]:
# Linear Regression
# Decision Tree Regressor
# SVM regressor
# KNN Regressor
# Random Forest Regressor
# Gboost
#XGB
#Adaboost

#PM
# r2 score, mse, mae,

In [20]:
models = {
    'Linear Regression' : LinearRegression(),
    'Decision Tree Regressor' : DecisionTreeRegressor(),
    'Knn Regressor' : KNeighborsRegressor(),
    'Random Forest Regressor': RandomForestRegressor(),
    'Adaboost' : AdaBoostRegressor(),
    'Gradient Boost' : GradientBoostingRegressor(),
    'XGBoost' : XGBRegressor(),
}

### Model Score:

In [21]:
def get_score(y_test, y_pred):
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    return mse, mae, rmse, r2

### Model training:

In [22]:
def evaluate_model(models, X_train, X_test, y_train, y_test):
    
    accuracy_score = {}
    
    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        mse, mae, rmse, r2 = get_score(y_test, y_pred)

        accuracy_score[name] = {'MSE' : mse, 'MAE' : mae, 'RMSE' : rmse, 'R2 Score' : r2}

    return accuracy_score



### Print Evaluation Score:

In [23]:
def print_score(accuracy_score):
    for name, metrics in accuracy_score.items():
        print('Model:', name)
        print('Model Training Performance')
        for metric, values in metrics.items():
            print(f'{metric}: {values}')
        print('='*35, '\n')

### Print Best Model:

In [24]:
def best_model(accuracy_score):
    max_r2 = -float('inf')
    
    for model, scores in accuracy_score.items():
        r2_score = scores['R2 Score']

        if r2_score > max_r2:
            max_r2 = r2_score
            best_model = model
    
    print(best_model,':', max_r2)
    
    

In [25]:
accuracy_score = evaluate_model(models, X_train, X_test, y_train, y_test)

In [26]:
print_score(accuracy_score)

Model: Linear Regression
Model Training Performance
MSE: 1202469.237949344
MAE: 807.0959250787977
RMSE: 1096.5715835955918
R2 Score: 0.9269065578284975

Model: Decision Tree Regressor
Model Training Performance
MSE: 683527.9559910903
MAE: 424.2041712593001
RMSE: 826.7574952736082
R2 Score: 0.9584509860651048

Model: Knn Regressor
Model Training Performance
MSE: 471331.5657584734
MAE: 368.2246589969688
RMSE: 686.5359173113038
R2 Score: 0.9713495817954372

Model: Random Forest Regressor
Model Training Performance
MSE: 380515.36008352286
MAE: 316.92958673844294
RMSE: 616.8592708904705
R2 Score: 0.9768699467812871

Model: Adaboost
Model Training Performance
MSE: 2136411.7188194003
MAE: 989.09261052789
RMSE: 1461.6469200252845
R2 Score: 0.8701358159562129

Model: Gradient Boost
Model Training Performance
MSE: 378689.8500891336
MAE: 331.5612206521301
RMSE: 615.3778108521086
R2 Score: 0.9769809124550835

Model: XGBoost
Model Training Performance
MSE: 343442.9456777319
MAE: 300.7118678435901
R

In [27]:
best_model(accuracy_score)

XGBoost : 0.9791234135627747


In [28]:
# def evaluate_model(models, X_train, X_test, y_train, y_test):
#     accuracy_scores = {}

#     for name, model in models.items():
#         model.fit(X_train, y_train)
#         y_pred = model.predict(X_test)
#         mse, mae, rmse, r2 = get_score(y_test, y_pred)
#         accuracy_scores[name] = {'MSE' : mse, 'MAE' : mae, 'RMSE' : rmse, 'R2 Score' : r2}
    
#     for name, scores in accuracy_scores.items():
#         print('Model:', name)
#         print('Model Training Performance')
#         for metric, value in scores.items():
#             print(f'{metric}: {value}')
#         print('=' * 35, '\n')

In [29]:
# accuracy_score = {}

# for name, model in models.items():
#     model_obj = model
#     model_obj.fit(X_train, y_train)
#     y_pred= model_obj.predict(X_test)
#     # Accuracy Score:
#     mse, mae, rmse, r2 = get_score(y_test, y_pred)

#     accuracy_score[name] = {'MSE' : mse, 'MAE' : mae, 'RMSE' : rmse, 'R2 Score' : r2}

    

# for name, score in accuracy_score.items():
#     print('Model:', name)
#     print('Model Training Performance')
#     for metric, value in score.items():
#         print(f'{metric}: {value}')
#     print('='*35)
#     print('\n')