# Model Training

In [3]:
import pandas as pd


In [4]:
# Load the raw gemstone table from disk and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='./data/gemstone.csv')
df.head(n=5)

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [5]:
# Remove the `id` column and preview the first two remaining rows.
df = df.drop(columns=['id'])
df.head(n=2)

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387


## Independent and dependent features


In [6]:

# Independent features: every column of `df` except `price`.
X = df.drop(columns=['price'])
X

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55
1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05
2,0.70,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.50
3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71
4,1.70,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77
...,...,...,...,...,...,...,...,...,...
193568,0.31,Ideal,D,VVS2,61.1,56.0,4.35,4.39,2.67
193569,0.70,Premium,G,VVS2,60.3,58.0,5.75,5.77,3.47
193570,0.73,Very Good,F,SI1,63.1,57.0,5.72,5.75,3.62
193571,0.34,Very Good,D,SI1,62.9,55.0,4.45,4.49,2.81


In [7]:
# Dependent variable, selected with a list so it stays a one-column DataFrame.
Y = df.loc[:, ['price']]
Y

Unnamed: 0,price
0,13619
1,13387
2,2772
3,666
4,14453
...,...
193568,1130
193569,2874
193570,3036
193571,681


In [8]:
 # defining which columns should be ordinal-encoded and which should be scaled

# Partition the feature names by dtype: object columns vs. everything else.
numerical_columns = X.select_dtypes(exclude=['object']).columns
categorical_columns = X.select_dtypes(include=['object']).columns

print(f"numerical_columns:\n {numerical_columns}\n")
print(f'categorical_columns:\n {categorical_columns}')

numerical_columns:
 Index(['carat', 'depth', 'table', 'x', 'y', 'z'], dtype='object')

categorical_columns:
 Index(['cut', 'color', 'clarity'], dtype='object')


In [9]:
# Display the Index of non-object feature names.
numerical_columns

Index(['carat', 'depth', 'table', 'x', 'y', 'z'], dtype='object')

In [10]:
# Positional access on the Index returns a single column name.
numerical_columns[0]


'carat'

In [50]:
# Check the container type that `.columns` returned.
type(categorical_columns)

pandas.core.indexes.base.Index

In [51]:
# Custom ranking for each ordinal feature. These lists are handed to
# OrdinalEncoder as `categories`, so a label's position in its list is the
# integer it is encoded to.
cut_categories = [
    'Fair',
    'Good',
    'Very Good',
    'Premium',
    'Ideal',
]
color_categories = list('DEFGHIJ')
clarity_categories = [
    'I1', 'SI2', 'SI1', 'VS2',
    'VS1', 'VVS2', 'VVS1', 'IF',
]


In [52]:
import numpy as np
from sklearn.impute import SimpleImputer # handling missing values
from sklearn.preprocessing import StandardScaler # Handling Feature Scaling
from sklearn.preprocessing import OrdinalEncoder # ordinal encoding


In [53]:
# pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [54]:
# Sanity check of SimpleImputer on toy data: learn per-column means from one
# array, then use them to fill the NaNs of a second array.
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')

# NaNs are skipped when the means are learned:
# (7 + 4 + 10) / 3 = 7, (2 + 5) / 2 = 3.5, (3 + 6 + 9) / 3 = 6.
imp_mean.fit([[7, 2, 3], [4, np.nan, 6], [10, 5, 9]])

# One missing value per column, so each learned mean is used exactly once.
a = [[np.nan, 2, 3], [4, np.nan, 6], [10, 9, np.nan]]
print(imp_mean.transform(a))

[[ 7.   2.   3. ]
 [ 4.   3.5  6. ]
 [10.   9.   6. ]]


In [55]:
# Numeric features: fill missing values with the column median, then standardize.
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])
numerical_pipeline

In [56]:
# Categorical features: fill missing values with the most frequent label, map
# labels to integers using the custom rankings, then standardize the codes.
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    (
        'ordinalencoder',
        # One category list per input column; the list order here has to line
        # up with the column order this pipeline receives.
        OrdinalEncoder(categories=[
            cut_categories,
            color_categories,
            clarity_categories,
        ]),
    ),
    ('scaler', StandardScaler()),
])
categorical_pipeline

In [63]:
# Route each column group through its own pipeline. The column selectors are
# passed as plain lists of names built from the pandas Index objects.
preprocessor = ColumnTransformer(
    transformers=[
        ('num_pipeline', numerical_pipeline, list(numerical_columns)),
        ('cat_pipeline', categorical_pipeline, list(categorical_columns)),
    ]
)
preprocessor

In [64]:
# Hold out 30% of the rows for testing; the fixed random_state makes the
# shuffle repeatable across runs.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.30,
    random_state=30,
)

In [65]:
# Fit the preprocessor on the training rows only, apply it to both splits, and
# wrap the results back into DataFrames labelled with the generated feature names.
X_train = pd.DataFrame(
    data=preprocessor.fit_transform(X_train),
    columns=preprocessor.get_feature_names_out(),
)
X_test = pd.DataFrame(
    data=preprocessor.transform(X_test),
    columns=preprocessor.get_feature_names_out(),
)
# NOTE(review): this cell overwrites X_train / X_test with the transformed
# frames, whose columns are renamed. Running it a second time without redoing
# the split presumably triggers
# "ValueError: A given column is not a column of the dataframe" -- confirm.



In [66]:
# Preview the first two rows of the transformed training features.
X_train.head(2)

Unnamed: 0,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,num_pipeline__z,cat_pipeline__cut,cat_pipeline__color,cat_pipeline__clarity
0,-0.975439,-0.849607,-0.121531,-1.042757,-1.08097,-1.12315,0.874076,1.528722,1.352731
1,0.235195,1.833637,-0.121531,0.318447,0.279859,0.485354,-2.144558,-0.935071,-0.646786


In [67]:
# Preview the first two rows of the transformed test features.
X_test.head(2)

Unnamed: 0,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,num_pipeline__z,cat_pipeline__cut,cat_pipeline__color,cat_pipeline__clarity
0,-0.564688,-0.942132,-0.642862,-0.429765,-0.464061,-0.500036,-0.132136,-0.935071,0.01972
1,-0.175556,1.000906,-0.121531,-0.042137,-0.028595,0.036132,-1.138347,0.912774,-0.646786
