Model Training


In [50]:
import pandas as pd

In [67]:
df= pd.read_csv("data/gemstone.csv")
df.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [52]:
df= df.drop(labels=["id"], axis=1)

In [75]:
X= df.drop(labels=["price"], axis=1)
y= df[["price"]]

In [76]:
#Define which column should be ordinal encoded and which should be scaled
categorical_cols= X.select_dtypes(include="object").columns
numerical_cols= X.select_dtypes(exclude="object").columns

In [77]:
cut_categories= ["Fair", "Good", "Very Good", "Premium", "Idle"]
color_categories= ["D", "E", "F", "G", "H", "I", "J"]
clarity_categories= ["I1", "SI1", "SI2", "VS2", "VS1", "VVS2", "VVS1", "IF"]


In [78]:
from sklearn.impute import SimpleImputer #Handling Missing values
from sklearn.preprocessing import StandardScaler  #Handling Feature Scaling
from sklearn.preprocessing import OrdinalEncoder  #Ordinal Encoding

#Pipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer



In [79]:
#Numerical Pipeline
num_pipeline=Pipeline(
    steps=
    [
    ("imputer",SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())   
    ]
)

#Categorical Pipeline

cat_pipeline= Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("ordinal_encoder", OrdinalEncoder(categories=["cut_categories", "color_categories", "clarity_categories"])),
        ("scaling", StandardScaler())
    ]
)

preprocessor= ColumnTransformer(
    [
    ("num_pipeline", num_pipeline, numerical_cols), 
    ("cat_pipeline", cat_pipeline, categorical_cols)
    ]
)

In [80]:
#Train Test Split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=0.3, random_state=42)


In [81]:
X_train.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z
11504,11504,0.41,Ideal,E,VVS2,60.6,56.0,4.85,4.8,2.93
95284,95284,1.23,Very Good,H,VS1,59.9,59.0,6.91,7.01,4.19
184777,184777,1.7,Premium,H,VS2,62.0,58.0,7.61,7.66,4.74
5419,5419,0.33,Ideal,F,VVS1,61.2,56.0,4.47,4.44,2.73
45466,45466,0.33,Very Good,I,SI1,62.1,58.0,4.41,4.45,2.75


In [82]:
X_test.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z
14868,14868,0.5,Ideal,D,SI1,62.1,57.0,5.05,5.08,3.14
165613,165613,2.0,Very Good,G,SI2,59.5,57.0,8.08,8.15,4.89
96727,96727,0.27,Premium,E,VVS1,60.5,59.0,4.19,4.16,2.52
145593,145593,0.32,Premium,I,VVS1,61.2,59.0,4.43,4.44,2.71
118689,118689,1.19,Ideal,H,SI1,62.5,56.0,6.77,6.81,4.23


In [83]:
y_train.head()

Unnamed: 0,price
11504,1181
95284,7418
184777,12755
5419,1020
45466,445


In [84]:
y_test.head()

Unnamed: 0,price
14868,1355
165613,14691
96727,844
145593,707
118689,5797


In [85]:
preprocessor.fit_transform(X_train)

IndexError: too many indices for array: array is 0-dimensional, but 1 were indexed