In [2]:
import pandas as pd

In [3]:
df=pd.read_csv(r'D:\Notes_DS\ML\Practice\notebooks\data\gemstone.csv')

In [4]:
df.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [6]:
df=df.drop(labels=['id'],axis=1)

In [24]:
X=df.drop(labels=['price'],axis=1)
y=df[['price']]

In [25]:
y

Unnamed: 0,price
0,13619
1,13387
2,2772
3,666
4,14453
...,...
193568,1130
193569,2874
193570,3036
193571,681


In [26]:
categorical_cols=X.select_dtypes(include='object').columns
numerical_cols=X.select_dtypes(exclude='object').columns

In [30]:
cut_categories=['Fair','Good','Very Good','Premium','Ideal']
color_categories=['D','E','F','G','H','I','J']
clarity_categories=['I1','SI2','SI1','VS2','VS1','VVS2','VVS1','IF']

In [31]:
%pip install scikit-learn


Note: you may need to restart the kernel to use updated packages.


In [32]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder

In [33]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [34]:
num_pipeline=Pipeline(
    steps=[
        ('imputer',SimpleImputer(strategy='median')),
        ('Scaler',StandardScaler())
    ]
)

cat_pipeline=Pipeline(
    steps=[
        ('imputer',SimpleImputer(strategy="most_frequent")),
        ('ordinalencoder',OrdinalEncoder(categories=[cut_categories,color_categories,clarity_categories])),
        ('Sclaler',StandardScaler())]
    
)

preprocess=ColumnTransformer([

('num_pipeline',num_pipeline,numerical_cols),
('cat_pipeline',cat_pipeline,categorical_cols)
])


In [42]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=1)


In [43]:
X_train=pd.DataFrame(preprocess.fit_transform(X_train),columns=preprocess.get_feature_names_out())


In [48]:
X_test=pd.DataFrame(preprocess.transform(X_test),columns=preprocess.get_feature_names_out())

In [49]:
X_train.head()

Unnamed: 0,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,num_pipeline__z,cat_pipeline__cut,cat_pipeline__color,cat_pipeline__clarity
0,2.758681,0.631711,-0.119267,2.172812,2.118887,2.220999,-0.135503,1.525503,-0.647427
1,0.903141,0.169421,-0.640178,0.98427,1.040573,1.026856,-1.143448,0.293897,-1.313938
2,1.550422,-0.570243,1.443466,1.515512,1.448339,1.390924,-0.135503,0.293897,-0.647427
3,0.5795,-1.032533,-1.161089,0.768171,0.814036,0.677351,0.872443,-1.553512,-0.647427
4,-0.973975,-0.015495,-0.119267,-1.095679,-1.097932,-1.084738,0.872443,1.525503,-0.647427


In [52]:
from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [53]:
regression=LinearRegression()
regression.fit(X_train,y_train)

In [54]:
regression.coef_

array([[ 6428.87814082,   -91.88110272,   -66.15819502, -1508.99672807,
         -275.0081191 ,  -487.32468042,    71.84138685,  -463.98810432,
          652.71737887]])

In [55]:
regression.intercept_

array([3973.59298455])

In [58]:
import numpy as np

def evaluate_model(true,predicted):
    mae=mean_absolute_error(true,predicted)
    mse=mean_squared_error(true,predicted)
    rmse=np.sqrt(mean_squared_error(true,predicted))
    r2_square=r2_score(true,predicted)

    return mae,mse,rmse,r2_square

In [60]:
models={
    'LinearRegression':LinearRegression(),
    'Lasso':Lasso(),
    'Ridge':Ridge(),
    'ElasticNet':ElasticNet()
}

model_list=[]
r2_list=[]


for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(X_train,y_train)

    y_pred=model.predict(X_test)

    mae,mse,rmse,r2_square=evaluate_model(y_test,y_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print('Model Training performance')
    print('Rmse',rmse)
    print('MAE',mae)
    print("R2 score",r2_square*100)
    r2_list.append(r2_square)

    print('='*400)



LinearRegression
Model Training performance
Rmse 1012.422516848521
MAE 675.5049239742397
R2 score 93.658526138559
Lasso
Model Training performance
Rmse 1010.5986257428409
MAE 676.2646246505899
R2 score 93.68135403757478
Ridge
Model Training performance
Rmse 1012.4440136156345
MAE 675.5401609134967
R2 score 93.65825683867334
ElasticNet
Model Training performance
Rmse 1532.9196626545422
MAE 1065.4307895443046
R2 score 85.46197519767333


In [61]:
model_list

['LinearRegression', 'Lasso', 'Ridge', 'ElasticNet']