In [2]:
import pandas as pd
import numpy as np


import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_error,mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.preprocessing import StandardScaler,OrdinalEncoder
from sklearn.impute import SimpleImputer

In [3]:
# Location of the training CSV.
# NOTE(review): this is an absolute path on one machine's E: drive; the cell
# only runs where that exact path exists.
TRAIN_CSV_PATH = 'E:\\my_complete_fsdsm_project\\notebooks\\data\\train.csv'
my_data = pd.read_csv(TRAIN_CSV_PATH)

In [4]:
# Remove the 'id' column from the frame.
# `columns=` already names the axis, so the extra `axis=1` was redundant.
# errors='ignore' makes the cell safe to re-run: once 'id' has been dropped,
# running it again is a no-op instead of raising KeyError.
my_data = my_data.drop(columns=['id'], errors='ignore')

In [5]:
# Separate the features from the target.
# The target is removed by name rather than by position, so the split stays
# correct even if 'price' is not the last column of the CSV; the remaining
# columns keep their original order.
x = my_data.drop(columns=['price'])
y = my_data['price']

In [6]:
# Split the column labels of `my_data` by dtype: object-dtype columns are
# treated as categorical, every other column as numeric.
cat_cols = my_data.select_dtypes(include=['object']).columns
num_cols = my_data.select_dtypes(exclude=['object']).columns

In [7]:
# Show the labels of the object-dtype (categorical) columns.
cat_cols

Index(['cut', 'color', 'clarity'], dtype='object')

In [12]:
# 'price' is the prediction target (it is assigned to `y`), so it must not be
# listed among the numeric feature columns.
# errors='ignore' keeps the cell re-runnable: a second run finds no 'price'
# label and leaves the index unchanged instead of raising KeyError.
num_cols = num_cols.drop('price', errors='ignore')

In [13]:
# Show the numeric column labels after 'price' has been removed.
num_cols

Index(['carat', 'depth', 'table', 'x', 'y', 'z'], dtype='object')

In [16]:
# Numeric branch: fill missing values with the column mean, then standardize
# each column with StandardScaler.
num_pipeline = Pipeline([
     ('imputer', SimpleImputer(strategy='mean')),
     ('scaler', StandardScaler()),
])

In [17]:
# Categorical branch: fill missing values with the most frequent value, then
# map each category to a numeric code.
# NOTE(review): OrdinalEncoder() is given no explicit `categories`, so the
# codes follow the encoder's default ordering -- confirm that ordering is
# meaningful for the linear models fitted later.
cat_pipeline = Pipeline([
     ('imputer', SimpleImputer(strategy='most_frequent')),
     ('encoder', OrdinalEncoder()),
])

In [18]:
# Route each column group through its own pipeline. The transformer names
# ('numerical_col', 'category_cols') become the prefixes of the output
# feature names (e.g. numerical_col__carat).
preproccesor = ColumnTransformer([
     ('numerical_col', num_pipeline, num_cols),
     ('category_cols', cat_pipeline, cat_cols),
])

In [19]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
     x, y, test_size=0.20, random_state=25
)

In [20]:
# Fit the preprocessor on the training split only and transform that split in
# the same call.
proccessed_x_train = preproccesor.fit_transform(x_train)

In [24]:
# Inspect the transformed training matrix returned by the preprocessor.
proccessed_x_train

array([[-0.56300284, -0.11351087, -0.6394654 , ...,  2.        ,
         2.        ,  5.        ],
       [-0.43311366,  0.44191924, -1.1609363 , ...,  2.        ,
         3.        ,  2.        ],
       [-1.06091137, -0.11351087, -0.6394654 , ...,  2.        ,
         4.        ,  7.        ],
       ...,
       [-1.06091137,  0.81220598, -1.6824072 , ...,  2.        ,
         4.        ,  1.        ],
       [-0.8444294 , -0.66894099, -0.6394654 , ...,  2.        ,
         1.        ,  7.        ],
       [-0.8444294 , -0.11351087, -0.6394654 , ...,  4.        ,
         3.        ,  6.        ]])

In [21]:
# Transform the test split with the already-fitted preprocessor (transform
# only, no refit on the test rows).
proccessed_x_test = preproccesor.transform(x_test)

In [31]:
# Wrap the transformed arrays in DataFrames labelled with the preprocessor's
# output feature names.
# NOTE(review): this rebinds x_train / x_test from the raw splits to the
# processed frames, so the fit_transform / transform cells look unsafe to
# re-run after this one without redoing the split first -- confirm.
x_train = pd.DataFrame(proccessed_x_train, columns=preproccesor.get_feature_names_out())
x_test = pd.DataFrame(proccessed_x_test, columns=preproccesor.get_feature_names_out())

In [32]:
# Preview the first five rows of the processed training features.
x_train.head()

Unnamed: 0,numerical_col__carat,numerical_col__depth,numerical_col__table,numerical_col__x,numerical_col__y,numerical_col__z,category_cols__cut,category_cols__color,category_cols__clarity
0,-0.563003,-0.113511,-0.639465,-0.454625,-0.470928,-0.469375,2.0,2.0,5.0
1,-0.433114,0.441919,-1.160936,-0.310282,-0.298436,-0.251773,2.0,3.0,2.0
2,-1.060911,-0.113511,-0.639465,-1.266552,-1.269839,-1.26725,2.0,4.0,7.0
3,-1.125856,0.534491,-1.160936,-1.419916,-1.406017,-1.354291,2.0,3.0,1.0
4,0.454462,-0.483798,0.403476,0.600881,0.65481,0.575115,4.0,2.0,3.0


In [33]:
# Preview the first rows of the processed test features.
x_test.head()

Unnamed: 0,numerical_col__carat,numerical_col__depth,numerical_col__table,numerical_col__x,numerical_col__y,numerical_col__z,category_cols__cut,category_cols__color,category_cols__clarity
0,-0.995967,0.071632,-0.117995,-1.167317,-1.151818,-1.136689,2.0,0.0,5.0
1,-0.931022,-0.298654,-0.639465,-1.004931,-0.997483,-1.020634,2.0,0.0,2.0
2,1.55852,-0.761513,-0.639465,1.557151,1.526349,1.445524,2.0,3.0,5.0
3,-0.151687,-0.298654,0.924947,-0.003554,0.046548,-0.005157,4.0,4.0,2.0
4,-0.151687,1.182493,-0.639465,-0.012575,0.010234,0.09639,4.0,1.0,2.0


# model building

In [42]:
def evaluate_model(true, predicted):
     """Compute regression error metrics for one set of predictions.

     Parameters
     ----------
     true
          Ground-truth target values; passed unchanged to the metric functions.
     predicted
          Predicted values to compare against ``true``.

     Returns
     -------
     tuple
          ``(mae, rmse, r2)``: mean absolute error, root mean squared error
          and R^2 score, in that order.
     """
     mae = mean_absolute_error(true, predicted)
     # RMSE is the square root of the MSE, so compute the MSE once and reuse
     # it instead of calling mean_squared_error a second time.
     mse = mean_squared_error(true, predicted)
     rmse = np.sqrt(mse)
     r2_sco = r2_score(true, predicted)

     return mae, rmse, r2_sco

In [43]:
# Candidate regressors, all with default hyperparameters, keyed by the label
# printed in the report below.
models = {
     'LinearRegression': LinearRegression(),
     'Lasso': Lasso(),
     'Ridge': Ridge(),
     'ElasticNet': ElasticNet()
}

# Parallel result lists: entry k of each list belongs to the k-th model in
# `models` (dict insertion order).
trained_model_list = []
model_list = []
r2_list = []


# Iterate over (name, estimator) pairs directly instead of rebuilding
# list(models.keys()) / list(models.values()) on every pass.
for model_name, model in models.items():
     model.fit(x_train, y_train)

     # Predictions and metrics are computed on the held-out test split.
     y_pred = model.predict(x_test)
     mae, rmse, r2_square = evaluate_model(y_test, y_pred)

     # Record the fitted estimator and its score; these lists were declared
     # but never filled before.
     trained_model_list.append(model)
     model_list.append(model_name)
     r2_list.append(r2_square)

     print(model_name)

     # The numbers below come from x_test / y_test, so label them as test
     # performance rather than training performance.
     print("MODEL TEST PERFORMANCE")
     print("RMSE", rmse)
     print("MAE", mae)
     print("R2_score", r2_square*100)

     print("=" * 35)
     print('\n')


LinearRegression
MODEL TRAINING PERFORMANCE
RMSE 1113.0464709888704
MAE 706.4677783415627
R2_score 92.53407674723239


Lasso
MODEL TRAINING PERFORMANCE
RMSE 1112.9008395312128
MAE 707.3853210263981
R2_score 92.53603030835451


Ridge
MODEL TRAINING PERFORMANCE
RMSE 1113.049975261584
MAE 706.49085571816
R2_score 92.53402973630696


ElasticNet
MODEL TRAINING PERFORMANCE
RMSE 1597.4909238981998
MAE 1106.8113656149817
R2_score 84.62080066567286


