In [43]:
import pandas as pd

In [44]:
# Load the raw gemstone table from the repository-relative CSV file.
GEMSTONE_CSV = 'data/gemstone.csv'
df = pd.read_csv(GEMSTONE_CSV)
# Preview the first five rows to sanity-check the parsed columns.
df.head(n=5)

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [45]:
# Remove the `id`, `x`, `y` and `z` columns in a single call rather than four
# chained reassignments (one copy of the frame instead of four). The columns
# that remain are carat, cut, color, clarity, depth, table and price.
# NOTE: this cell rebinds `df`, so re-running it without re-loading the CSV
# raises KeyError because the dropped columns are already gone.
df = df.drop(columns=['id', 'x', 'y', 'z'])
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price
0,1.52,Premium,F,VS2,62.2,58.0,13619
1,2.03,Very Good,J,SI2,62.0,58.0,13387
2,0.7,Ideal,G,VS1,61.2,57.0,2772
3,0.32,Ideal,G,VS1,61.6,56.0,666
4,1.7,Premium,G,VS2,62.6,59.0,14453


In [46]:
# Positional split: every column except the last one is a feature, and the
# last column is the target. `y` is sliced (not indexed), so it stays a
# one-column DataFrame rather than a Series.
# NOTE(review): this relies on the target being the final column of `df`.
n_feature_cols = df.shape[1] - 1
X = df.iloc[:, :n_feature_cols]
y = df.iloc[:, n_feature_cols:]

In [47]:
# Partition the column names by dtype: object-typed columns are treated as
# categorical, everything else as numeric.
is_object_dtype = df.dtypes == 'object'
cat_features = df.columns[is_object_dtype]
# 'price' is the target, so it is excluded from the numeric feature list.
num_features = df.columns[~is_object_dtype].drop('price')
cat_features, num_features

(Index(['cut', 'color', 'clarity'], dtype='object'),
 Index(['carat', 'depth', 'table'], dtype='object'))

In [48]:
# For Scaling the data
from sklearn.preprocessing import StandardScaler

# For Categorical to Numerical transformation
from sklearn.preprocessing import OrdinalEncoder

# For Creating Pipelines
from sklearn.pipeline import Pipeline

# Combining Pipelines
from sklearn.compose import ColumnTransformer

# Handling Missing values
from sklearn.impute import SimpleImputer

In [49]:
# Numeric columns: fill missing values with the column median, then
# standardise.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
num_pipeline = Pipeline(steps=numeric_steps)

# Categorical columns: fill missing values with the most frequent value,
# map categories to integer codes, then standardise those codes.
# NOTE(review): OrdinalEncoder() is given no explicit `categories`, so the
# code order is derived from the data rather than from any domain ranking of
# cut / color / clarity -- confirm this is intended.
categorical_steps = [
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('encode', OrdinalEncoder()),
    ('scaler', StandardScaler()),
]
cat_pipeline = Pipeline(steps=categorical_steps)

# Route each group of columns through its own pipeline. The transformer names
# below become the prefixes of the output feature names
# (e.g. `num_pipeline__carat`), so they must not be changed casually.
preprocessor = ColumnTransformer(
    transformers=[
        ('num_pipeline', num_pipeline, num_features),
        ('cat_pipeline', cat_pipeline, cat_features),
    ]
)

In [50]:
# Splitting the data into train (70%) and test (30%) sets; the fixed
# random_state keeps the partition reproducible across runs.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=404)
# The test target was originally bound to the misspelled name `t_test`, which
# left `y_test` undefined. Keep the old name as an alias so any cell that
# still refers to it continues to work.
t_test = y_test

In [51]:
# Fit the preprocessor on the training features only, then reuse the fitted
# transformer on the test features so no test statistics leak into the fit.
train_array = preprocessor.fit_transform(X_train)
# Feature names are only available once the transformer has been fitted.
feature_names = preprocessor.get_feature_names_out()
test_array = preprocessor.transform(X_test)

# Wrap the transformed arrays back into labelled DataFrames. The frames are
# built without an explicit index, so the original row index is not carried
# over. NOTE: this rebinds X_train / X_test, so the cell is not re-runnable
# without re-running the train/test split first.
X_train = pd.DataFrame(train_array, columns=feature_names)
X_test = pd.DataFrame(test_array, columns=feature_names)
X_train

Unnamed: 0,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,cat_pipeline__cut,cat_pipeline__color,cat_pipeline__clarity
0,-0.521078,0.716163,-0.118151,1.585949,0.299510,-0.499301
1,-0.542737,0.808347,0.402756,-0.624614,1.533159,-1.112031
2,-1.062565,-2.325914,-0.118151,-0.624614,-0.317314,0.113429
3,0.475258,-2.786835,1.444571,1.585949,0.916335,0.726159
4,-1.040906,0.623979,-0.639059,-0.624614,-1.550962,-0.499301
...,...,...,...,...,...,...
135496,0.237004,-0.390047,0.402756,1.585949,0.299510,0.726159
135497,-1.062565,-0.390047,0.402756,0.480667,-1.550962,-1.112031
135498,1.579892,-2.049362,0.402756,1.585949,-0.934138,-1.112031
135499,-0.434440,0.255242,-0.639059,1.585949,-0.934138,0.726159
