## Model Training

In [52]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw diamonds data; the path is relative to the notebook's working directory.
# The preview below shows a stray "Unnamed: 0" column, which the next cell removes.
df = pd.read_csv("Diamonds Prices2022.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [3]:
# Remove the leftover "Unnamed: 0" column that came in with the CSV.
# Reassigning with errors="ignore" (instead of an in-place drop) keeps this cell
# safe to re-run: a second execution is a no-op rather than a KeyError.
df = df.drop(columns="Unnamed: 0", errors="ignore")

Drop duplicate rows from the dataset.

In [4]:
df.duplicated().sum()

149

In [5]:
# Keep only the first occurrence of each fully identical row.
df = df.drop_duplicates()

In [6]:
df.duplicated().sum()

0

In [7]:
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [8]:
## Separate the target (price) from the predictor columns
Y = df['price']
X = df.drop(columns='price')

In [9]:
X.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,4.34,4.35,2.75


In [10]:
Y.head()

0    326
1    326
2    327
3    334
4    335
Name: price, dtype: int64

In [33]:
## Partition the feature columns by dtype: object columns are treated as
## categorical, everything else as numerical.

categorical_columns = X.select_dtypes(include=["object"]).columns
numerical_columns = X.select_dtypes(exclude=["object"]).columns
print(f"Numerical columns : {numerical_columns}")
print(f"Ctegorical columns : {categorical_columns}")

Numerical columns : Index(['carat', 'depth', 'table', 'x', 'y', 'z'], dtype='object')
Ctegorical columns : Index(['cut', 'color', 'clarity'], dtype='object')


In [34]:
## Custom category order for each ordinal variable.
## These lists are passed as `categories=` to the encoder in the preprocessing
## pipeline cell, so their order fixes the order of the encoded output columns.
## NOTE(review): cut and clarity look ordered worst -> best while color (D..J)
## looks ordered best -> worst; confirm the direction before relying on these
## as ranks in a true ordinal encoding.

cut_categories = ["Fair","Good","Very Good","Premium","Ideal"]
# (sic) the misspelled name below is referenced by the pipeline cell, so it is kept.
color_categoty = ["D","E","F","G","H","I","J"]
clarity_categories = ["I1","SI2","SI1","VS2","VS1","VVS2","VVS1","IF"]

In [35]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

## Pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [47]:
# Numeric features: fill missing values with the column median, then scale to
# unit variance. with_mean=False means values are scaled but not centred.
numerical_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler(with_mean=False)),
    ]
)

# Categorical features: fill missing values with the most frequent label,
# expand each variable into indicator columns, then scale those columns.
categorical_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        # NOTE(review): the step is named 'ordinalEncoder' but it is a OneHotEncoder,
        # so the category lists only fix column order, not a numeric rank. The name
        # is kept because step names are part of the pipeline's parameter paths.
        # The list order must match categorical_columns (cut, color, clarity).
        (
            "ordinalEncoder",
            OneHotEncoder(
                categories=[cut_categories, color_categoty, clarity_categories]
            ),
        ),
        # with_mean=False is required here: the encoder emits a sparse matrix by
        # default and StandardScaler refuses to centre sparse input.
        ("scaler", StandardScaler(with_mean=False)),
    ]
)

# sparse_threshold=0 forces a dense ndarray out of the ColumnTransformer.
# With the default (0.3) the stacked result switches to a scipy sparse matrix
# whenever overall density drops below the threshold, so downstream cells that
# wrap the result in pd.DataFrame(..., columns=...) would depend on the data's
# density. Forcing dense output removes that dependency.
preprocessor = ColumnTransformer(
    [
        ("numerical_pipeline", numerical_pipeline, numerical_columns),
        ("categorical_pipeline", categorical_pipeline, categorical_columns),
    ],
    sparse_threshold=0,
)

In [48]:
## Hold out 30% of the rows for testing; the fixed seed makes the split repeatable
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.30,
    random_state=42,
)

In [49]:
# Fit the preprocessor on the training split only, then reuse the fitted
# transformer on the test split so no test statistics leak into the fit.
train_array = preprocessor.fit_transform(x_train)
feature_names = preprocessor.get_feature_names_out()

x_train = pd.DataFrame(train_array, columns=feature_names)
x_test = pd.DataFrame(preprocessor.transform(x_test), columns=feature_names)

In [None]:
x_train

Unnamed: 0,numerical_pipeline__carat,numerical_pipeline__depth,numerical_pipeline__table,numerical_pipeline__x,numerical_pipeline__y,numerical_pipeline__z,categorical_pipeline__cut_Fair,categorical_pipeline__cut_Good,categorical_pipeline__cut_Very Good,categorical_pipeline__cut_Premium,...,categorical_pipeline__color_I,categorical_pipeline__color_J,categorical_pipeline__clarity_I1,categorical_pipeline__clarity_SI2,categorical_pipeline__clarity_SI1,categorical_pipeline__clarity_VS2,categorical_pipeline__clarity_VS1,categorical_pipeline__clarity_VVS2,categorical_pipeline__clarity_VVS1,categorical_pipeline__clarity_IF
0,0.629117,43.149389,25.517266,3.799844,3.828370,3.799734,0.0,0.0,0.0,0.000000,...,0.0,0.00000,0.0,0.0,2.325065,0.000000,0.000000,0.000000,0.0,0.0
1,2.118028,43.079455,25.069595,5.735279,5.711466,5.692432,0.0,0.0,0.0,0.000000,...,0.0,0.00000,0.0,0.0,0.000000,2.382507,0.000000,0.000000,0.0,0.0
2,2.118028,43.638928,26.860280,5.602107,5.649288,5.663754,0.0,0.0,0.0,2.289703,...,0.0,0.00000,0.0,0.0,0.000000,0.000000,0.000000,3.441341,0.0,0.0
3,0.524264,43.079455,24.621924,3.604525,3.624072,3.598993,0.0,0.0,0.0,0.000000,...,0.0,0.00000,0.0,0.0,0.000000,2.382507,0.000000,0.000000,0.0,0.0
4,0.650088,43.568994,26.412609,3.853113,3.810605,3.857088,0.0,0.0,0.0,2.289703,...,0.0,0.00000,0.0,0.0,0.000000,2.382507,0.000000,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37650,2.097057,42.240244,25.964938,5.779670,5.738114,5.620739,0.0,0.0,0.0,2.289703,...,0.0,0.00000,0.0,0.0,0.000000,2.382507,0.000000,0.000000,0.0,0.0
37651,1.321146,43.149389,24.174252,4.900734,4.938686,4.903808,0.0,0.0,0.0,0.000000,...,0.0,0.00000,0.0,0.0,2.325065,0.000000,0.000000,0.000000,0.0,0.0
37652,0.671058,43.219323,24.577156,3.897504,3.926078,3.900104,0.0,0.0,0.0,0.000000,...,0.0,4.48352,0.0,0.0,2.325065,0.000000,0.000000,0.000000,0.0,0.0
37653,1.887351,43.918665,26.412609,5.442300,5.356165,5.477352,0.0,0.0,0.0,2.289703,...,0.0,4.48352,0.0,0.0,2.325065,0.000000,0.000000,0.000000,0.0,0.0


In [51]:
from sklearn.linear_model import LinearRegression,Ridge,Lasso,ElasticNet
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [53]:
def evalulateModel(true_data,prediciton):
    """Score a set of predictions against the ground-truth values.

    Parameters
    ----------
    true_data : array-like
        Observed target values.
    prediciton : array-like
        Predicted target values, aligned position-by-position with ``true_data``.

    Returns
    -------
    tuple
        ``(mae, mse, rmse, r2)``: mean absolute error, mean squared error,
        the square root of the mean squared error, and the R^2 score.
    """
    squared_error = mean_squared_error(true_data, prediciton)
    return (
        mean_absolute_error(true_data, prediciton),
        squared_error,
        np.sqrt(squared_error),
        r2_score(true_data, prediciton),
    )

In [54]:
## Traning with multiple model
models = {
    'LinearRegression' : LinearRegression(),
    'Ridge':Ridge(),
    'Lasso':Lasso(),
    'ElasticNet':ElasticNet()
} 

train_model_list = []
model_list = []
r2_list=[]

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(x_train,y_train)
    
    ## Making the predicition
    y_pred = model.predict(x_test)
    
    
    mae,mse,rmse,r2 = evalulateModel(y_test,y_pred)
    
    print(list(models.keys())[i])
    print("Model Traning Performance")
    print(f"Mean Absolute Error: {mae}")
    print(f"Mean Square Error: {mse}")
    print(f"Root Mean Square Error: {rmse}")
    print(f"R2 Score: {r2}")
    
    
    r2_list.append(r2)
    
    print('='*35)
    print('\n')

    
    
    

LinearRegression
Model Traning Performance
Mean Absolute Error: 734.5956378957804
Mean Square Error: 1243335.2684490986
Root Mean Square Error: 1115.0494466386226
R2 Score: 0.9190397001161682


Ridge
Model Traning Performance
Mean Absolute Error: 734.7479222324056
Mean Square Error: 1242251.3427458033
Root Mean Square Error: 1114.5632968772134
R2 Score: 0.9191102803950498


Lasso
Model Traning Performance
Mean Absolute Error: 735.1440690060226
Mean Square Error: 1242183.2417915426
Root Mean Square Error: 1114.5327459485175
R2 Score: 0.9191147148173808


ElasticNet
Model Traning Performance
Mean Absolute Error: 1040.5065498199885
Mean Square Error: 2514176.6704900893
Root Mean Square Error: 1585.61554939717
R2 Score: 0.836288326753803


