In [37]:
import pandas as pd

In [38]:
df=pd.read_csv(r"https://raw.githubusercontent.com/sunnysavita10/Gemstone-Price-Prediction-End-to-End-Pipeline/main/artifacts/raw.csv")

In [39]:
df=df.drop(['id'],axis=1)

In [40]:
df.head(2)

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387


In [41]:
X=df   # 'X' it will be featured data

In [42]:
X.head(2)

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387


In [43]:
Y= df[["price"]] ## 'Y' will be target feature

In [44]:
Y.head(2)

Unnamed: 0,price
0,13619
1,13387


In [45]:
num_col=df.select_dtypes(exclude="object").columns
cat_col=df.select_dtypes(include="object").columns

In [46]:
num_col

Index(['carat', 'depth', 'table', 'x', 'y', 'z', 'price'], dtype='object')

In [47]:
cat_col

Index(['cut', 'color', 'clarity'], dtype='object')

In [48]:
cut_cat=['Fair','Good','Very Good','Premium','Ideal']
clarity_cat=['IF','VVS1','VVS2','VS1','VS2','SI1','SI2','I1']
color_cat=['D','E','F','G','H','I','J']

In [49]:
df.dtypes

carat      float64
cut         object
color       object
clarity     object
depth      float64
table      float64
x          float64
y          float64
z          float64
price        int64
dtype: object

PIPELINE:
1. IMPUTER
2. ENCODING
3. SCALING



In [50]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import  Pipeline
from sklearn.compose  import  ColumnTransformer

In [51]:
num_pip=Pipeline(
    steps=[
    ('imuter',SimpleImputer(strategy='median')),
    ('scaling',StandardScaler()),
    ]
)


cat_pip=Pipeline(
    steps=[
    ('imputer',SimpleImputer(strategy="most_frequent")),
    ('encoding',OrdinalEncoder(categories=[cut_cat,color_cat,clarity_cat])),

    ]
)


preprocessor=ColumnTransformer(
        [
            ('numpipeline',num_pip,num_col),
            ('categoricalpipeline',cat_pip,cat_col)
        ]


)

In [52]:
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.30,random_state=30)

In [53]:
X_train

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
168192,0.34,Ideal,I,VVS2,60.9,57.0,4.56,4.53,2.76,765
35202,0.90,Good,E,SI1,63.8,57.0,6.07,6.03,3.87,4763
41091,1.02,Premium,G,VS1,62.7,58.0,6.35,6.39,4.00,6139
31239,0.32,Premium,G,VS2,62.1,59.0,4.37,4.35,2.71,720
45722,0.35,Ideal,J,VVS2,61.1,56.0,4.53,4.57,2.78,774
...,...,...,...,...,...,...,...,...,...,...
66455,0.31,Ideal,E,SI1,61.8,56.0,4.31,4.35,2.68,544
46220,1.25,Ideal,G,SI2,62.0,56.0,6.88,6.95,4.28,5694
98804,1.00,Good,G,SI1,63.5,56.0,6.29,6.37,4.02,4563
48045,1.10,Ideal,G,VS1,59.9,60.0,6.68,6.77,4.01,6998


In [54]:
X_train=pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())


In [55]:
X_test

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
70432,0.53,Premium,E,VS2,60.8,56.0,5.24,5.21,3.19,1607
64839,0.71,Very Good,H,SI1,62.9,57.0,5.67,5.69,3.56,2211
185316,0.30,Ideal,H,IF,62.1,57.0,4.27,4.29,2.66,764
84658,1.24,Premium,G,VS2,61.6,61.0,6.88,6.82,4.21,7655
31953,0.36,Premium,E,VS1,60.4,58.0,4.60,4.63,2.80,735
...,...,...,...,...,...,...,...,...,...,...
192694,1.26,Very Good,H,VS2,63.1,56.0,6.83,6.87,4.33,5785
133935,0.33,Ideal,F,VVS1,62.1,55.0,4.45,4.48,2.77,914
44763,0.70,Premium,J,VS2,58.2,60.0,5.83,5.80,3.38,2010
29856,0.41,Ideal,G,SI1,61.6,57.0,4.77,4.75,2.93,923


In [58]:
X_test=pd.DataFrame(preprocessor.fit_transform(X_test),columns=preprocessor.get_feature_names_out())                                        ## using tranform  to overcome data leakage problem

In [59]:
X_train.drop(labels="numpipeline__price",axis=1)    ##DROP THE TARGET FEATURE
X_test.drop(labels="numpipeline__price",axis=1) 

Unnamed: 0,numpipeline__carat,numpipeline__depth,numpipeline__table,numpipeline__x,numpipeline__y,numpipeline__z,categoricalpipeline__cut,categoricalpipeline__color,categoricalpipeline__clarity
0,-0.560473,-0.946668,-0.632686,-0.425330,-0.459668,-0.498885,3.0,1.0,4.0
1,-0.171673,0.990913,-0.111947,-0.037825,-0.024281,0.040318,2.0,4.0,5.0
2,-1.057274,0.252787,-0.111947,-1.299471,-1.294159,-1.271258,4.0,4.0,0.0
3,0.973129,-0.208542,1.971009,1.052598,1.000693,0.987567,3.0,3.0,4.0
4,-0.927674,-1.315731,0.408792,-1.002083,-0.985760,-1.067235,3.0,1.0,3.0
...,...,...,...,...,...,...,...,...,...
58067,1.016329,1.175444,-0.632686,1.007539,1.046046,1.162444,2.0,4.0,4.0
58068,-0.992474,0.252787,-1.153426,-1.137259,-1.121819,-1.110954,4.0,2.0,1.0
58069,-0.193273,-3.345577,1.450270,0.106363,0.075496,-0.221997,3.0,6.0,4.0
58070,-0.819674,-0.208542,-0.111947,-0.848883,-0.876913,-0.877785,4.0,3.0,5.0
