In [94]:
import pandas as pd

In [95]:
df = pd.read_csv("data/gemstone.csv")

In [96]:
df.drop(columns='id',inplace = True)

In [97]:
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [98]:
X = df[['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'x', 'y', 'z',]]
y = df['price']

In [99]:
categorical_columns = [column for column in X.columns if X[column].dtype == 'object' ]
numerical_columns = [column for column in X.columns if X[column].dtype != 'object' ]
print(numerical_columns)

['carat', 'depth', 'table', 'x', 'y', 'z']


In [100]:
# Define the custom ranking for each ordinal variable
cut_categories = ['Fair', 'Good', 'Very Good','Premium','Ideal']
color_categories = ['D', 'E', 'F', 'G', 'H', 'I', 'J']
clarity_categories = ['I1','SI2','SI1','VS2','VS1','VVS2','VVS1','IF']

In [101]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [102]:
num_pipeline = Pipeline(
    steps = [
        ('imputer',SimpleImputer(strategy='median')),
        ('scaler',StandardScaler())
    ]
)

cat_pipeline = Pipeline(
    steps = [
        ('imputer',SimpleImputer(strategy='most_frequent')),
        ('encoding',OrdinalEncoder(categories=[cut_categories,color_categories,clarity_categories])),
        ('scaler',StandardScaler())
    ]
)

preprocessor = ColumnTransformer([
    ('num_pipeline',num_pipeline,numerical_columns),
    ('cat_pipeline',cat_pipeline,categorical_columns)
]
)

In [103]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25,random_state=42)

In [104]:
X_train.shape,X_test.shape

((145179, 9), (48394, 9))

In [105]:
X_train=pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())
X_test=pd.DataFrame(preprocessor.transform(X_test),columns=preprocessor.get_feature_names_out())

In [106]:
X_train

Unnamed: 0,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,num_pipeline__z,cat_pipeline__cut,cat_pipeline__color,cat_pipeline__clarity
0,0.472019,0.257973,-0.640440,0.606460,0.624162,0.644271,0.873371,1.527634,-1.315033
1,0.623099,0.812463,-1.161268,0.732515,0.787278,0.803549,0.873371,0.911526,0.017494
2,2.651895,-1.867571,1.442873,2.236173,2.318751,2.005371,-0.132110,1.527634,0.017494
3,-1.017203,1.089708,-0.640440,-1.248351,-1.233542,-1.151223,-1.137590,0.295418,-0.648770
4,-0.110720,1.459368,3.526185,0.039212,-0.010176,0.137478,-2.143070,-0.936798,-1.315033
...,...,...,...,...,...,...,...,...,...
145174,-0.628711,-1.497911,1.963701,-0.546044,-0.517646,-0.644430,-1.137590,-0.936798,-0.648770
145175,2.414483,0.442803,2.484529,1.921035,1.874714,1.932972,-1.137590,-0.320690,-0.648770
145176,0.925260,0.904878,0.401217,0.993629,0.923207,1.049705,-0.132110,0.295418,0.017494
145177,-1.038786,-0.666177,-0.640440,-1.212335,-1.197294,-1.252581,-1.137590,0.295418,2.016286


## MODEL TRANING

In [107]:
from sklearn.linear_model import LinearRegression
LR1 = LinearRegression()
LR1.fit(X_train,y_train)

In [108]:
y_test_pred = LR1.predict(X_test)

In [109]:
from sklearn.metrics import r2_score
score1 = r2_score(y_test,y_test_pred)
score1

0.9361253831282494

In [110]:
## Adjusted R-Squared

Adj_score1 = 1 - ((1-(score1**2))*(len(df)-1)/(len(df)-9-1))
Adj_score1

0.8763249827502325

In [111]:
from sklearn.linear_model import Ridge,Lasso,ElasticNet
RR1 = Ridge()
LR1 = Lasso()
ENR = ElasticNet()

In [112]:
RR1.fit(X_train,y_train)

In [113]:
y_test_pred = RR1.predict(X_test)

In [114]:
from sklearn.metrics import r2_score
score2 = r2_score(y_test,y_test_pred)
score2

0.9361250093640314

In [115]:
Adj_score2 = 1 - ((1-(score2**2))*(len(df)-1)/(len(df)-9-1))
Adj_score2

0.8763242829374912

In [116]:
LR1.fit(X_train,y_train)

In [117]:
y_test_pred = LR1.predict(X_test)

In [118]:
from sklearn.metrics import r2_score
score3 = r2_score(y_test,y_test_pred)
score3

0.9361106358005803

In [119]:
Adj_score3 = 1 - ((1-(score3**2))*(len(df)-1)/(len(df)-9-1))
Adj_score3

0.876297370988397

In [120]:
ENR.fit(X_train,y_train)

In [121]:
y_test_pred = ENR.predict(X_test)

In [122]:
from sklearn.metrics import r2_score
score4 = r2_score(y_test,y_test_pred)
score4

0.8541191791514899

In [123]:
from sklearn.decomposition import PCA
pca = PCA(n_components=5)

In [124]:
X_train = pca.fit_transform(X_train)

In [125]:
X_test = pca.transform(X_test)

In [126]:
from sklearn.linear_model import LinearRegression
LR2 = LinearRegression()
LR2.fit(X_train,y_train)

In [127]:
y_test_pred = LR2.predict(X_test)

In [128]:
from sklearn.metrics import r2_score
score6 = r2_score(y_test,y_test_pred)
score6

0.8714900093387105