In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn


In [2]:
# Read the training set (path is relative to the notebook's working directory)
# and preview its first five rows.
data = pd.read_csv(filepath_or_buffer='./train.csv')
data.head(n=5)

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [3]:
# Remove the `id` column from the frame.
# errors='ignore' keeps this cell re-runnable: because the result is assigned
# back to `data`, a second execution would otherwise raise KeyError ('id' has
# already been dropped).
data = data.drop(columns='id', errors='ignore')

In [5]:
# Preview the first rows again, now that `id` has been dropped.
data.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [6]:
# The target is the `price` column; every remaining column is a feature.
y = data['price']
x = data.drop(columns=['price'])

In [7]:
# Split the feature column labels by dtype: object-typed columns are treated
# as categorical, all other columns as numeric.
cat_coloumn = x.select_dtypes(include=['object']).columns
num_coloumn = x.select_dtypes(exclude=['object']).columns

In [8]:
# Inspect the distinct values present in the `color` column.
data['color'].unique()

array(['F', 'J', 'G', 'E', 'D', 'H', 'I'], dtype=object)

In [9]:
cut_categories = ['Fair','Good','Very Good', 'Premium', 'Ideal']
clarity_categories = ['I1', 'SI2', 'SI1', 'VS2', 'VS1','VVS1', 'VVS2', 'VVS1', 'IF']
color_categories = ['D', 'E', 'F', 'G', 'H', 'I', 'J']

In [10]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# creating the pipeline

In [11]:
# Numeric features: fill missing values with the column mean (the imputer's
# default strategy, spelled out here), then standardize each column.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

In [12]:
# Categorical features: fill missing values with the most frequent label, then
# replace each label by its position in the matching ordered list.
# The lists are matched to columns by position, so their order here has to
# follow the order of the columns that are fed to this pipeline.
ordered_categories = [cut_categories, color_categories, clarity_categories]
cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder(categories=ordered_categories)),
])

In [13]:
# Route the numeric and the categorical column groups through their own
# pipelines and concatenate the results (numeric block first).
preproccesor = ColumnTransformer(
    transformers=[
        ('num pipeline', num_pipeline, num_coloumn),
        ('cat pipeline', cat_pipeline, cat_coloumn),
    ],
)

In [14]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for evaluation; the fixed seed makes the split
# repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
)


In [15]:
# Fit the preprocessor on the training features and display the transformed array.
preproccesor.fit_transform(x_train)

array([[-0.82314374, -1.12998781, -0.64189666, ...,  4.        ,
         1.        ,  6.        ],
       [ 0.94502267, -1.77782269,  0.92190185, ...,  2.        ,
         4.        ,  4.        ],
       [ 1.9584839 ,  0.16568195,  0.40063568, ...,  3.        ,
         4.        ,  3.        ],
       ...,
       [ 0.92345966,  0.90606467,  0.40063568, ...,  3.        ,
         3.        ,  3.        ],
       [-1.03877378, -0.66724861, -0.64189666, ...,  2.        ,
         3.        ,  7.        ],
       [-1.03877378, -0.01941373,  0.92190185, ...,  2.        ,
         3.        ,  1.        ]])

In [16]:
# Apply the preprocessor to the test features with transform (not fit_transform)
# and display the result.
preproccesor.transform(x_test)

array([[-0.62907669,  0.25822979, -0.12063049, ...,  4.        ,
         0.        ,  2.        ],
       [ 2.60537405, -2.14801405, -0.12063049, ...,  2.        ,
         3.        ,  1.        ],
       [-1.1250258 , -1.22253565,  0.92190185, ...,  3.        ,
         1.        ,  7.        ],
       ...,
       [-0.82314374, -0.01941373, -0.64189666, ...,  4.        ,
         3.        ,  7.        ],
       [ 0.90189666, -0.66724861,  1.44316802, ...,  3.        ,
         5.        ,  2.        ],
       [ 0.47063656,  0.90606467, -0.64189666, ...,  2.        ,
         1.        ,  3.        ]])

In [17]:
# Wrap the transformed arrays in DataFrames labelled with the preprocessor's
# output feature names.
# index=... carries the original row labels over; without it the frames get a
# fresh 0..n-1 index and no longer align by label with y_train / y_test.
# NOTE: x_train / x_test are overwritten with their transformed versions, so
# this cell must not be run twice without re-running the train/test split.
x_train = pd.DataFrame(
    preproccesor.fit_transform(x_train),
    columns=preproccesor.get_feature_names_out(),
    index=x_train.index,
)
x_test = pd.DataFrame(
    preproccesor.transform(x_test),
    columns=preproccesor.get_feature_names_out(),
    index=x_test.index,
)

In [18]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [19]:
def evaluate_model(true, predicted):
    """Score regression predictions against the observed targets.

    Parameters
    ----------
    true : array-like
        Observed target values.
    predicted : array-like
        Predicted values, in the same order as ``true``.

    Returns
    -------
    tuple
        ``(mae, rmse, r2_square)``: mean absolute error, root mean squared
        error and R^2 score, in that order.
    """
    mae = mean_absolute_error(true, predicted)
    # RMSE is the square root of one MSE evaluation; no separate, unused
    # MSE value is kept around.
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)

    return mae, rmse, r2_square

In [20]:
# Candidate linear regressors, each constructed with its default settings.
models = {
    'LinearRegression': LinearRegression(),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'ElasticNet': ElasticNet(),
}

# Parallel result lists: entry k of each list belongs to the k-th model fitted.
trained_model_list = []
model_list = []
r2_list = []

# Iterate over name/estimator pairs directly instead of rebuilding
# list(models.keys()) / list(models.values()) on every pass.
for model_name, model in models.items():
    model.fit(x_train, y_train)

    # Score on the held-out split.
    y_pred = model.predict(x_test)
    mae, rmse, r2square = evaluate_model(y_test, y_pred)

    print(model_name)
    model_list.append(model_name)
    # Keep the fitted estimator and its R^2 so the models can be compared
    # after the loop.
    trained_model_list.append(model)
    r2_list.append(r2square)

    print("MODEL PERFORMANCE")
    print("RMSE", rmse)
    print("MAE", mae)
    print("r2 score", r2square)

    print("*"*35)
    print('\n')

LinearRegression
MODEL PERFORMANCE
RMSE 1031.7453334057955
MAE 686.5778218200917
r2 score 0.934123142698193
***********************************


Ridge
MODEL PERFORMANCE
RMSE 1031.750263830884
MAE 686.6130252651042
r2 score 0.9341225130821907
***********************************


Lasso
MODEL PERFORMANCE
RMSE 1031.7789981930387
MAE 687.9956618652795
r2 score 0.9341188436400928
***********************************


ElasticNet
MODEL PERFORMANCE
RMSE 1517.898581040748
MAE 1056.6609362454367
r2 score 0.8574151588316123
***********************************


