## Model Training

In [152]:
import pandas as pd
# Read the gemstone dataset from the local data directory and preview the first rows.
df = pd.read_csv(filepath_or_buffer="./data/gemstone.csv")
df.head(n=5)


Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [153]:
# Remove the 'id' column; it is not used as a model feature.
# errors='ignore' keeps this cell idempotent: re-running it after 'id' has
# already been dropped is a no-op instead of raising KeyError.
df = df.drop(columns=['id'], errors='ignore')

In [154]:
## Independent and dependent features

# Features: every column except the target.
x = df.drop(columns=['price'])
# Target: kept as a single-column DataFrame (2-D), not a Series.
Y = df.loc[:, ['price']]

In [155]:
# Define which columns should be ordinal-encoded and which should be scaled

# Object-dtype columns are treated as categorical; everything else as numerical.
categorical_cols = x.select_dtypes(include=['object']).columns
numerical_cols = x.select_dtypes(exclude=['object']).columns

In [156]:
# Define the custom ranking for each ordinal variable.

# Rankings for the ordinal columns. List position is what matters: these
# lists are later handed to OrdinalEncoder, which encodes each value by its
# index in the list.
cut_categories = [
    'Fair',
    'Good',
    'Very Good',
    'Premium',
    'Ideal',
]
color_categories = list("DEFGHIJ")
clarity_categories = [
    'I1',
    'SI2',
    'SI1',
    'VS2',
    'VS1',
    'VVS2',
    'VVS1',
    'IF',
]

# Next, it is important to understand how I automate this entire process.

In [157]:
# Now we will use a machine-learning technique to fill in the missing values:

from sklearn.impute import SimpleImputer


## SimpleImputer is a univariate imputer that completes missing values with simple strategies (mean, median, or most frequent).
## Whenever we need to handle missing values, SimpleImputer is the direct choice.

In [158]:
# After handling missing values we do feature scaling.
# The library class we use for feature scaling is:

from sklearn.preprocessing import StandardScaler

In [159]:
# For the categorical features we do ordinal encoding.
# The mapping we would otherwise do manually can be automated, following the ranking lists defined above.
# The library class we use for that is:

from sklearn.preprocessing import OrdinalEncoder

In [160]:
# Now one very important thing:
# 1. First, missing values are handled.
# 2. Then feature scaling is performed (numerical columns).
# 3. Categorical columns are ordinal-encoded (and then scaled as well).

# These steps need to happen in a fixed order,
# so they are combined into pipelines.
# Each step's output feeds the next step:
# numerical: SimpleImputer -> StandardScaler; categorical: SimpleImputer -> OrdinalEncoder -> StandardScaler.
# For this we will implement pipelines (a pipeline chains multiple steps together).
# For pipelines we need to import two more classes:

## Pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [161]:
# Now go ahead and create pipeline:

## Numerical pipeline: fill missing values with the column median, then standardise.
num_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

# Bind every ordinal column to its ranking by NAME. OrdinalEncoder takes its
# `categories` positionally (one list per input column, in column order), so
# building the list from categorical_cols guarantees each ranking is applied
# to the right column regardless of the column order in the DataFrame.
# A categorical column without a ranking fails loudly here with a KeyError.
ordinal_categories = {
    'cut': cut_categories,
    'color': color_categories,
    'clarity': clarity_categories,
}

## Categorical pipeline: fill missing values with the most frequent value,
## map each category to its rank, then standardise the resulting integers.
cat_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('ordinalencoder', OrdinalEncoder(
            categories=[ordinal_categories[col] for col in categorical_cols]
        )),
        ('scaling', StandardScaler()),
    ]
)

# Route numerical and categorical columns through their respective pipelines.
preprocessor = ColumnTransformer([
    ('num_pipeline', num_pipeline, numerical_cols),
    ('cat_pipeline', cat_pipeline, categorical_cols),
])

# From here on, every preprocessing step is applied through this preprocessor.

In [162]:
## Train test split.

from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed makes the split reproducible.
x_train, x_test, Y_train, Y_test = train_test_split(
    x,
    Y,
    test_size=0.3,
    random_state=42,
)

In [163]:
# Fit the preprocessor on the training split only, then reuse the fitted transform on the test split.

# Learn imputation/encoding/scaling parameters from the training split only,
# then apply the already-fitted preprocessor to the test split.
train_array = preprocessor.fit_transform(x_train)
test_array = preprocessor.transform(x_test)

# Both frames share the same generated column names.
feature_names = preprocessor.get_feature_names_out()
x_train = pd.DataFrame(train_array, columns=feature_names)
x_test = pd.DataFrame(test_array, columns=feature_names)

# Missing values are handled and features are scaled automatically by this pipeline.

In [164]:
# x_train / x_test have already been transformed by the preprocessor.
# Re-running fit_transform on them fails (their columns are now the generated
# feature names, not the raw column names), and fitting on the test split
# would leak test statistics into preprocessing. Instead of transforming
# again, verify that both frames carry the expected feature columns.
expected_features = list(preprocessor.get_feature_names_out())
assert list(x_train.columns) == expected_features, "x_train is not in preprocessed form"
assert list(x_test.columns) == expected_features, "x_test is not in preprocessed form"

In [165]:
# Display the preprocessed training features (column names are prefixed with
# the ColumnTransformer step they came from).
x_train

Unnamed: 0,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,num_pipeline__z,cat_pipeline__cut,cat_pipeline__color,cat_pipeline__clarity
0,-0.823144,-1.129988,-0.641897,-0.780451,-0.835103,-0.876024,0.874100,-0.936747,1.350746
1,0.945023,-1.777823,0.921902,1.073226,1.166389,0.946633,-1.137644,0.910853,0.684455
2,1.958484,0.165682,0.400636,1.703116,1.755063,1.742237,-0.131772,0.910853,0.018164
3,-0.995648,-0.574701,-0.641897,-1.122391,-1.161138,-1.165334,0.874100,-0.320880,2.017037
4,-0.995648,0.258230,0.400636,-1.176382,-1.152082,-1.136403,-1.137644,1.526720,-0.648127
...,...,...,...,...,...,...,...,...,...
135496,-0.629077,-1.500179,1.964434,-0.546492,-0.518125,-0.644575,-1.137644,-0.936747,-0.648127
135497,2.411307,0.443325,2.485700,1.919078,1.872797,1.930288,-1.137644,-0.320880,-0.648127
135498,0.923460,0.906065,0.400636,0.992240,0.921862,1.047891,-0.131772,0.294987,0.018164
135499,-1.038774,-0.667249,-0.641897,-1.212375,-1.197364,-1.252127,-1.137644,0.294987,2.017037


In [166]:
# Display the preprocessed test features (same columns as x_train).
x_test

Unnamed: 0,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,num_pipeline__z,cat_pipeline__cut,cat_pipeline__color,cat_pipeline__clarity
0,-0.629077,0.258230,-0.120630,-0.600482,-0.581521,-0.572248,0.874100,-1.552614,-0.648127
1,2.605374,-2.148014,-0.120630,2.126042,2.198832,1.959219,-1.137644,0.294987,-1.314417
2,-1.125026,-1.222536,0.921902,-1.374347,-1.414721,-1.469110,-0.131772,-0.936747,2.017037
3,-1.017211,-0.574701,0.921902,-1.158385,-1.161138,-1.194265,-0.131772,1.526720,2.017037
4,0.858771,0.628421,-0.641897,0.947248,0.985258,1.004495,0.874100,0.910853,-0.648127
...,...,...,...,...,...,...,...,...,...
58067,0.255007,0.535873,0.921902,0.416340,0.369414,0.425874,-1.137644,1.526720,-1.314417
58068,-0.607514,0.535873,-0.641897,-0.528495,-0.554351,-0.499920,0.874100,-1.552614,0.018164
58069,-0.823144,-0.019414,-0.641897,-0.834441,-0.862273,-0.847093,0.874100,0.294987,2.017037
58070,0.901897,-0.667249,1.443168,1.046230,0.967145,0.932167,-0.131772,1.526720,-0.648127


In [167]:
#Model training

from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [168]:
# Baseline: ordinary least squares on the preprocessed training data.
regression = LinearRegression()
regression.fit(X=x_train, y=Y_train)

In [169]:
# Fitted weights, one per preprocessed feature column. The array is 2-D
# because the model was fitted on Y_train, a single-column DataFrame.
regression.coef_


array([[ 6432.97591819,  -132.34206204,   -70.48787525, -1701.38593925,
         -494.17005097,   -76.32351645,    68.80035873,  -464.67990411,
          652.10059539]])

In [170]:
# Fitted intercept term of the linear model.
regression.intercept_

array([3976.8787389])

In [177]:
# So I will create a function that checks model performance:

import numpy as np
def evaluate_model(true,predicted):
    """Score predictions against the observed target values.

    Parameters
    ----------
    true : array-like
        Observed target values.
    predicted : array-like
        Model predictions, aligned row-for-row with ``true``.

    Returns
    -------
    tuple
        ``(mae, rmse, r2_square)``: mean absolute error, root mean squared
        error, and the R^2 score, in that order.
    """
    mae = mean_absolute_error(true, predicted)
    # MSE is computed once and only used to derive RMSE (the original computed
    # it twice and left the first result unused).
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square


In [183]:
## Train multiple models: whichever model performs best is the one we will keep.

# Candidate regressors, all with default hyper-parameters.
models = {
    "LinearRegression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "Elasticnet": ElasticNet()
}

# Parallel lists: model_list[k] is the name whose R^2 is r2_list[k].
model_list = []
r2_list = []

# Iterate over (name, estimator) pairs directly instead of rebuilding
# list(models.keys()) / list(models.values()) and indexing on every pass.
for model_name, model in models.items():
    model.fit(x_train, Y_train)

    # Predict on the held-out split and score against its true prices.
    Y_pred = model.predict(x_test)
    mae, rmse, r2_square = evaluate_model(Y_test, Y_pred)

    print(model_name)
    model_list.append(model_name)

    # NOTE: despite the header text, these metrics are computed on the test
    # split (Y_test vs. predictions for x_test), not on the training data.
    print("Model Training Performance")
    print("RMSE",rmse)
    print("MAE",mae)
    print("r2 score",r2_square*100)

    r2_list.append(r2_square)
    print("="*35)
    print('\n')


LinearRegression
Model Training Performance
RMSE 1014.6296630375463
MAE 675.0758270067483
r2 score 93.62906819996049


Lasso
Model Training Performance
RMSE 1014.6591302750638
MAE 676.2421173665509
r2 score 93.62869814082755


Ridge
Model Training Performance
RMSE 1014.6343233534415
MAE 675.1077629781366
r2 score 93.6290096749163


Elasticnet
Model Training Performance
RMSE 1533.3541245902313
MAE 1060.9432977143008
r2 score 85.44967219374031




In [184]:
# Names of the evaluated models, in the order they were trained.
model_list

['LinearRegression', 'Lasso', 'Ridge', 'Elasticnet']