In [1]:
import pandas as pd
import numpy as np
import warnings 
warnings.filterwarnings("ignore")

In [2]:
# Read the gemstone dataset from the relative data folder and preview the first rows.
df = pd.read_csv(filepath_or_buffer="data/gemstone.csv")
df.head(n=5)

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [3]:
# Remove the "id" column (in the preview it simply mirrors the row index).
# Rebinding instead of inplace=True, together with errors="ignore", keeps this cell
# idempotent: re-running it no longer raises KeyError once "id" is already gone.
df = df.drop(columns="id", errors="ignore")

In [4]:
# Target: the price, kept as a one-column DataFrame.
y = pd.DataFrame(df["price"])
# Predictors: every remaining column.
x = df.drop(columns=["price"])

In [5]:
# Display the predictor frame (pandas truncates the rendering of long frames).
x

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55
1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05
2,0.70,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.50
3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71
4,1.70,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77
...,...,...,...,...,...,...,...,...,...
193568,0.31,Ideal,D,VVS2,61.1,56.0,4.35,4.39,2.67
193569,0.70,Premium,G,VVS2,60.3,58.0,5.75,5.77,3.47
193570,0.73,Very Good,F,SI1,63.1,57.0,5.72,5.75,3.62
193571,0.34,Very Good,D,SI1,62.9,55.0,4.45,4.49,2.81


In [6]:
# Display the one-column target frame (pandas truncates the rendering of long frames).
y

Unnamed: 0,price
0,13619
1,13387
2,2772
3,666
4,14453
...,...
193568,1130
193569,2874
193570,3036
193571,681


In [7]:
# Split the predictor names by dtype. select_dtypes asks "is this numeric?" rather than
# "is this not object?", so text columns stored with a string or category dtype are never
# mistaken for numeric features (which would later break the median imputer).
numerical_columns = x.select_dtypes(include="number").columns
categorical_columns = x.select_dtypes(exclude="number").columns
print(numerical_columns)
print(categorical_columns)

Index(['carat', 'depth', 'table', 'x', 'y', 'z'], dtype='object')
Index(['cut', 'color', 'clarity'], dtype='object')


In [8]:
# Category orderings passed to the OrdinalEncoder: a value's position in its list
# becomes its encoded integer.
cut_map = [
    "Fair",
    "Good",
    "Very Good",
    "Premium",
    "Ideal",
]
clarity_map = [
    "I1",
    "SI2",
    "SI1",
    "VS2",
    "VS1",
    "VVS2",
    "VVS1",
    "IF",
]
color_map = list("DEFGHIJ")

In [9]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [10]:
# Categorical features: fill gaps with the most frequent value, replace each category by
# its position in the hand-written orderings (given in cut, color, clarity order), then
# standardise the resulting integers.
cat_pipeline = Pipeline(
    [
        ("Imputer", SimpleImputer(strategy="most_frequent")),
        (
            "OrdinalEncoder",
            OrdinalEncoder(categories=[cut_map, color_map, clarity_map]),
        ),
        ("Scaler", StandardScaler()),
    ]
)

# Numeric features: fill gaps with the column median, then standardise.
num_pipeline = Pipeline(
    [
        ("Imputer", SimpleImputer(strategy="median")),
        ("Scaler", StandardScaler()),
    ]
)

In [11]:
# Route each column group through its own pipeline; the transformer names below become
# the prefixes of the output feature names (e.g. "num_pipeline__carat").
preprocessor = ColumnTransformer(
    transformers=[
        ("num_pipeline", num_pipeline, numerical_columns),
        ("cat_pipeline", cat_pipeline, categorical_columns),
    ]
)

In [12]:
from sklearn.model_selection import train_test_split
# Hold out 33% of the rows for evaluation; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.33,
    random_state=42,
)

In [13]:
# Fit the preprocessing on the training rows only, then apply the already-fitted
# transform to the held-out rows.
x_train_processed = pd.DataFrame(preprocessor.fit_transform(x_train))
x_test_processed = pd.DataFrame(preprocessor.transform(x_test))

# Look the output names up once (only valid after fitting) and label both frames.
feature_names = preprocessor.get_feature_names_out()
x_train_processed.columns = feature_names
x_test_processed.columns = feature_names

# Both processed frames carry a fresh 0..n-1 index, so give BOTH targets the same index.
# Resetting only y_train left y_test on its shuffled original index, which would misalign
# any later pandas operation that combines y_test with x_test_processed or predictions.
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

In [14]:
# Display the transformed training features (pandas truncates the rendering of long frames).
x_train_processed

Unnamed: 0,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,num_pipeline__z,cat_pipeline__cut,cat_pipeline__color,cat_pipeline__clarity
0,2.626061,-2.888129,0.400868,2.233112,2.216066,1.856561,-0.130933,1.525655,-1.314696
1,-0.845291,0.164716,0.922458,-0.915966,-0.908068,-0.890852,-0.130933,-0.937159,-0.648656
2,-0.845291,-1.500472,1.965640,-0.843987,-0.899013,-0.963153,-0.130933,-0.321455,-0.648656
3,-0.694363,-0.667878,-0.642314,-0.637048,-0.636405,-0.673951,0.874463,-0.937159,-1.314696
4,1.548002,-0.482857,1.444049,1.477333,1.455407,1.393839,-0.130933,1.525655,0.683424
...,...,...,...,...,...,...,...,...,...
129688,-0.629679,-1.500472,1.965640,-0.547074,-0.518684,-0.645031,-1.136330,-0.937159,-0.648656
129689,2.410449,0.442247,2.487231,1.918204,1.871959,1.928861,-1.136330,-0.321455,-0.648656
129690,0.922727,0.904800,0.400868,0.991476,0.921135,1.046797,-0.130933,0.294248,0.017384
129691,-1.039342,-0.667878,-0.642314,-1.212879,-1.197843,-1.252354,-1.136330,0.294248,2.015504


In [15]:
# Display the training target after its index was reset to 0..n-1.
y_train

Unnamed: 0,price
0,16075
1,904
2,1016
3,1173
4,11655
...,...
129688,1410
129689,15064
129690,7209
129691,816


In [16]:
from sklearn.linear_model import LinearRegression,Ridge,Lasso,ElasticNet
from sklearn.ensemble import GradientBoostingRegressor,AdaBoostRegressor
from xgboost import XGBRegressor

In [17]:
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score

In [18]:
# Linear baselines, all constructed with library-default hyperparameters and keyed by
# the label that the evaluation report prints.
models1 = dict(
    [
        ("LinearRegression", LinearRegression()),
        ("RidgeRegression", Ridge()),
        ("LassoRegression", Lasso()),
        ("ElasticNetRegression", ElasticNet()),
    ]
)

In [19]:
def model_performance(Models_dict,x_training,y_training,x_testing,y_testing):
    """Fit every model on the training data and report its test-set regression metrics.

    Parameters
    ----------
    Models_dict : dict
        Maps a display name to an estimator exposing ``fit`` and ``predict``.
        Each estimator is fitted in place, so the caller's objects are fitted afterwards.
    x_training, y_training
        Features and target used to fit each model.
    x_testing, y_testing
        Held-out features and target used to score each model.

    Returns
    -------
    dict
        ``{name: {"MAE": ..., "MSE": ..., "r2_score": ...}}`` in the iteration order of
        ``Models_dict``, so callers can rank the models programmatically instead of
        reading the printed report. (Earlier versions returned ``None``.)
    """
    results = {}
    for name, model in Models_dict.items():
        model.fit(x_training, y_training)
        predictions = model.predict(x_testing)
        metrics = {
            "MAE": mean_absolute_error(y_testing, predictions),
            "MSE": mean_squared_error(y_testing, predictions),
            "r2_score": r2_score(y_testing, predictions),
        }
        # Same report layout as before: the model name, one line per metric, a separator.
        print(name)
        for label, value in metrics.items():
            print(f"{label}:", value)
        print("================================")
        results[name] = metrics
    return results

In [20]:
# Evaluate the linear baselines on the held-out split.
model_performance(
    Models_dict=models1,
    x_training=x_train_processed,
    y_training=y_train,
    x_testing=x_test_processed,
    y_testing=y_test,
)

LinearRegression
MAE: 674.7352796098304
MSE: 1028753.6398275062
r2_score: 0.9363893549824441
RidgeRegression
MAE: 674.7687088427462
MSE: 1028762.3061423723
r2_score: 0.9363888191205456
LassoRegression
MAE: 675.8986621286323
MSE: 1028878.7702079996
r2_score: 0.9363816178295377
ElasticNetRegression
MAE: 1061.3169023914195
MSE: 2353384.972340876
r2_score: 0.854483784776376


In [21]:
# Boosted ensembles. Their training can involve random choices, so each one gets the same
# fixed seed as the train/test split; without it, re-running the notebook may not
# reproduce the reported metrics.
models2 = {
    "GradientBoostingRegression": GradientBoostingRegressor(random_state=42),
    "AdaBoostRegression": AdaBoostRegressor(random_state=42),
    "XgBoostRegression": XGBRegressor(random_state=42),
}

In [22]:
# Evaluate the boosted ensembles on the same held-out split.
model_performance(
    Models_dict=models2,
    x_training=x_train_processed,
    y_training=y_train,
    x_testing=x_test_processed,
    y_testing=y_test,
)

GradientBoostingRegression
MAE: 330.8752393132752
MSE: 381584.57647629274
r2_score: 0.976405584292778
AdaBoostRegression
MAE: 924.3730733130429
MSE: 1779485.8179873496
r2_score: 0.8899695356599197
XgBoostRegression
MAE: 298.09049175790346
MSE: 349081.67751095246
r2_score: 0.9784153272361639


<font color="#F94C10" size="5">We use XGBoost because it has the highest R² score.</font>