In [1]:
import pandas as pd

In [2]:
# Load the raw gemstone dataset from the project-relative data directory.
df = pd.read_csv(filepath_or_buffer="data/gemstone.csv")

In [3]:
# Remove the 'id' column from the working frame.
# errors='ignore' keeps this cell idempotent: because `df` is rebound in place,
# re-running the cell would otherwise raise KeyError once 'id' is already gone.
df = df.drop(columns=['id'], errors='ignore')

In [4]:
# Split the frame into the target (dependent variable) and the
# predictors (independent variables: every column except 'price').
y = df['price']
x = df.drop(columns=['price'])

In [5]:
# Segregate categorical and numerical columns by dtype.
# Object-dtype (string) columns are the categorical features; every other
# column is numerical. The two index objects are consumed by the
# ColumnTransformer below to route columns to the matching pipeline.
categorical = x.select_dtypes(include='object').columns
numerical = x.select_dtypes(exclude='object').columns

In [6]:
# Custom rank order for each ordinal feature. The position of a label in its
# list is the integer code that OrdinalEncoder assigns to it.
cut_categories = [
    "Fair",
    "Good",
    "Very Good",
    "Premium",
    "Ideal",
]
color_categories = [
    "D", "E", "F", "G", "H", "I", "J",
]
clarity_categories = [
    "I1",
    "SI2", "SI1",
    "VS2", "VS1",
    "VVS2", "VVS1",
    "IF",
]

In [7]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler,OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [8]:
# Numerical branch: fill missing values with the column median, then
# standardize each column.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent label,
# map labels to integer ranks using the custom orderings, then standardize
# the resulting codes.
# NOTE(review): the category lists are passed in the order cut, color, clarity;
# this must match the column order of `categorical` -- confirm against the data.
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    (
        'ordinalencoder',
        OrdinalEncoder(
            categories=[cut_categories, color_categories, clarity_categories]
        ),
    ),
    ('scaler', StandardScaler()),
])

# Route each column group to its own branch.
preprocessor = ColumnTransformer(
    transformers=[
        ('num_pipeline', num_pipeline, numerical),
        ('cat_pipeline', cat_pipeline, categorical),
    ]
)

In [9]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.30,
    random_state=30,
)