In [44]:
import pandas as pd
import numpy as np
# Load the dataset into a pandas DataFrame
df = pd.read_csv('Diamonds.csv')
df

Unnamed: 0,carat,cut,color,clarity,price,x,y,z
0,0.23,Ideal,E,SI2,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,334,4.20,4.23,2.63
4,0.31,Good,J,SI2,335,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...
53935,0.72,Ideal,D,SI1,2757,5.75,5.76,3.50
53936,0.72,Good,D,SI1,2757,5.69,5.75,3.61
53937,0.70,Very Good,D,SI1,2757,5.66,5.68,3.56
53938,0.86,Premium,H,SI2,2757,6.15,6.12,3.74


In [45]:
df.isnull().sum()

carat      0
cut        0
color      0
clarity    0
price      0
x          0
y          0
z          0
dtype: int64

In [46]:
# Removing Outliers using z-score
from scipy import stats
df = df[(np.abs(stats.zscore(df['price'])) < 3)]
df

Unnamed: 0,carat,cut,color,clarity,price,x,y,z
0,0.23,Ideal,E,SI2,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,334,4.20,4.23,2.63
4,0.31,Good,J,SI2,335,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...
53935,0.72,Ideal,D,SI1,2757,5.75,5.76,3.50
53936,0.72,Good,D,SI1,2757,5.69,5.75,3.61
53937,0.70,Very Good,D,SI1,2757,5.66,5.68,3.56
53938,0.86,Premium,H,SI2,2757,6.15,6.12,3.74


In [47]:
# Removing Outliers using IQR
Q1 = df['price'].quantile(0.25)
Q3 = df['price'].quantile(0.75)
IQR = Q3 - Q1
df = df[~((df['price'] < (Q1 - 1.5 * IQR)) | (df['price'] > (Q3 + 1.5 * IQR)))]
df

Unnamed: 0,carat,cut,color,clarity,price,x,y,z
0,0.23,Ideal,E,SI2,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,334,4.20,4.23,2.63
4,0.31,Good,J,SI2,335,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...
53935,0.72,Ideal,D,SI1,2757,5.75,5.76,3.50
53936,0.72,Good,D,SI1,2757,5.69,5.75,3.61
53937,0.70,Very Good,D,SI1,2757,5.66,5.68,3.56
53938,0.86,Premium,H,SI2,2757,6.15,6.12,3.74


In [30]:
df.dtypes

carat      float64
cut         object
color       object
clarity     object
price        int64
x          float64
y          float64
z          float64
dtype: object

In [48]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
# Example features for scaling
features = ['carat', 'price', 'x', 'y', 'z']
# Standardization (Z-score Normalization)
scaler_standard = StandardScaler()
df_standardized = df.copy()
df_standardized[features] = scaler_standard.fit_transform(df[features])
# Min-Max Normalization
scaler_minmax = MinMaxScaler()
df_normalized = df.copy()
df_normalized[features] = scaler_minmax.fit_transform(df[features])
# Display the first few rows of the scaled data
print("Standardized Data:")
print(df_standardized.head())
print("\nNormalized Data:")
print(df_normalized.head())

Standardized Data:
      carat      cut color clarity     price         x         y         z
0 -1.292740    Ideal     E     SI2 -1.037406 -1.644363 -1.618550 -1.616972
1 -1.345930  Premium     E     SI1 -1.037406 -1.705399 -1.761083 -1.809424
2 -1.292740     Good     E     VS1 -1.037030 -1.542635 -1.526921 -1.809424
3 -1.133171  Premium     I     VS2 -1.034396 -1.390043 -1.364025 -1.296219
4 -1.079981     Good     J     SI2 -1.034020 -1.247624 -1.241853 -1.103767

Normalized Data:
      carat      cut color clarity     price         x         y         z
0  0.010309    Ideal     E     SI2  0.000000  0.414046  0.125157  0.076415
1  0.003436  Premium     E     SI1  0.000000  0.407757  0.120755  0.072642
2  0.010309     Good     E     VS1  0.000091  0.424528  0.127987  0.072642
3  0.030928  Premium     I     VS2  0.000729  0.440252  0.133019  0.082704
4  0.037801     Good     J     SI2  0.000820  0.454927  0.136792  0.086478


In [49]:
# Apply one-hot encoding to the 'cut', 'color', and 'clarity' columns
df_encoded = pd.get_dummies(df, columns=['cut', 'color', 'clarity'], drop_first=True)
df_encoded

Unnamed: 0,carat,price,x,y,z,cut_Good,cut_Ideal,cut_Premium,cut_Very Good,color_E,...,color_H,color_I,color_J,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
0,0.23,326,3.95,3.98,2.43,False,True,False,False,True,...,False,False,False,False,False,True,False,False,False,False
1,0.21,326,3.89,3.84,2.31,False,False,True,False,True,...,False,False,False,False,True,False,False,False,False,False
2,0.23,327,4.05,4.07,2.31,True,False,False,False,True,...,False,False,False,False,False,False,True,False,False,False
3,0.29,334,4.20,4.23,2.63,False,False,True,False,False,...,False,True,False,False,False,False,False,True,False,False
4,0.31,335,4.34,4.35,2.75,True,False,False,False,False,...,False,False,True,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53935,0.72,2757,5.75,5.76,3.50,False,True,False,False,False,...,False,False,False,False,True,False,False,False,False,False
53936,0.72,2757,5.69,5.75,3.61,True,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
53937,0.70,2757,5.66,5.68,3.56,False,False,False,True,False,...,False,False,False,False,True,False,False,False,False,False
53938,0.86,2757,6.15,6.12,3.74,False,False,True,False,False,...,True,False,False,False,False,True,False,False,False,False


In [51]:
from sklearn.model_selection import train_test_split
features = df.drop('price', axis=1)  # Drop the target variable
target = df['price']  # Target variable
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
# Display the shape of the resulting datasets
print("Training set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)

Training set shape: (39958, 7)
Testing set shape: (9990, 7)


In [52]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
# Encode categorical variables
features_encoded = pd.get_dummies(features)
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features_encoded, target, test_size=0.2, random_state=42)
# Initialize and train the Linear Regression model
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)
# Predict on the test set
predictions = linear_model.predict(X_test)
# Evaluate the model
mse = mean_squared_error(y_test, predictions)
print(F'Linear Regression Mean Squared Error: {mse}')

Linear Regression Mean Squared Error: 561057.6234536672


In [54]:
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import mean_squared_error

# Define the parameter distributions for Ridge and Lasso
param_distributions_ridge = {
    'alpha': np.logspace(-4, 4, 100)  
}

param_distributions_lasso = {
    'alpha': np.logspace(-4, 4, 100) 
}

# Initialize models
ridge_model = Ridge()
lasso_model = Lasso()

# Set up Randomized Search with Cross-Validation for Ridge Regression
random_search_ridge = RandomizedSearchCV(estimator=ridge_model, param_distributions=param_distributions_ridge, n_iter=50, cv=5, scoring='neg_mean_squared_error', random_state=42)
random_search_ridge.fit(X_train, y_train)

# Set up Randomized Search with Cross-Validation for Lasso Regression
random_search_lasso = RandomizedSearchCV(estimator=lasso_model, param_distributions=param_distributions_lasso, n_iter=50, cv=5, scoring='neg_mean_squared_error', random_state=42)
random_search_lasso.fit(X_train, y_train)

# Get the best parameters and scores for Ridge Regression
best_params_ridge = random_search_ridge.best_params_
best_score_ridge = -random_search_ridge.best_score_

# Get the best parameters and scores for Lasso Regression
best_params_lasso = random_search_lasso.best_params_
best_score_lasso = -random_search_lasso.best_score_

# Evaluate the models on the test set
best_ridge_model = random_search_ridge.best_estimator_
best_lasso_model = random_search_lasso.best_estimator_

ridge_predictions = best_ridge_model.predict(X_test)
lasso_predictions = best_lasso_model.predict(X_test)

mse_ridge = mean_squared_error(y_test, ridge_predictions)
mse_lasso = mean_squared_error(y_test, lasso_predictions)

print(f'Ridge Regression Best Parameters: {best_params_ridge}')
print(f'Ridge Regression Best Score (Cross-Validation MSE): {best_score_ridge}')
print(f'Ridge Regression Test Set MSE: {mse_ridge}')

print(f'\nLasso Regression Best Parameters: {best_params_lasso}')
print(f'Lasso Regression Best Score (Cross-Validation MSE): {best_score_lasso}')
print(f'Lasso Regression Test Set MSE: {mse_lasso}')

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

Ridge Regression Best Parameters: {'alpha': 2.782559402207126}
Ridge Regression Best Score (Cross-Validation MSE): 583276.0157719763
Ridge Regression Test Set MSE: 561084.6237214651

Lasso Regression Best Parameters: {'alpha': 0.9111627561154896}
Lasso Regression Best Score (Cross-Validation MSE): 580467.7887421141
Lasso Regression Test Set MSE: 561293.3430675052


  model = cd_fast.enet_coordinate_descent(


In [55]:
from sklearn.linear_model import Ridge
# Initialize the Ridge Regression model
ridge_model = Ridge(alpha=1.0)
# Train the model using the training data
ridge_model.fit(X_train, y_train)
# Predict the target variable on the test data
ridge_predictions = ridge_model.predict(X_test)

In [56]:
from sklearn.linear_model import Lasso
# Initialize the Lasso Regression model
lasso_model = Lasso(alpha=0.1)  
# Train the model using the training data
lasso_model.fit(X_train, y_train)
# Predict the target variable on the test data
lasso_predictions = lasso_model.predict(X_test)

  model = cd_fast.enet_coordinate_descent(


In [57]:
from sklearn.metrics import mean_squared_error, r2_score
# Evaluate Ridge Regression
mse_ridge = mean_squared_error(y_test, ridge_predictions)
r2_ridge = r2_score(y_test, ridge_predictions)
print(f'Ridge Regression Mean Squared Error: {mse_ridge}')
print(f'Ridge Regression R^2 Score: {r2_ridge}')

# Evaluate Lasso Regression
mse_lasso = mean_squared_error(y_test, lasso_predictions)
r2_lasso = r2_score(y_test, lasso_predictions)
print(f'Lasso Regression Mean Squared Error: {mse_lasso}')
print(f'Lasso Regression R^2 Score: {r2_lasso}')

Ridge Regression Mean Squared Error: 561048.9591886444
Ridge Regression R^2 Score: 0.9188294326115042
Lasso Regression Mean Squared Error: 561018.8427669953
Lasso Regression R^2 Score: 0.918833789748244


In [58]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
# Calculate metrics for Ridge Regression
mae_ridge = mean_absolute_error(y_test, ridge_predictions)
mse_ridge = mean_squared_error(y_test, ridge_predictions)
rmse_ridge = np.sqrt(mse_ridge)
r2_ridge = r2_score(y_test, ridge_predictions)

print(f'Ridge Regression Mean Absolute Error (MAE): {mae_ridge}')
print(f'Ridge Regression Mean Squared Error (MSE): {mse_ridge}')
print(f'Ridge Regression Root Mean Squared Error (RMSE): {rmse_ridge}')
print(f'Ridge Regression R-squared (R²): {r2_ridge}')

# Calculate metrics for Lasso Regression
mae_lasso = mean_absolute_error(y_test, lasso_predictions)
mse_lasso = mean_squared_error(y_test, lasso_predictions)
rmse_lasso = np.sqrt(mse_lasso)
r2_lasso = r2_score(y_test, lasso_predictions)

print(f'\nLasso Regression Mean Absolute Error (MAE): {mae_lasso}')
print(f'Lasso Regression Mean Squared Error (MSE): {mse_lasso}')
print(f'Lasso Regression Root Mean Squared Error (RMSE): {rmse_lasso}')
print(f'Lasso Regression R-squared (R²): {r2_lasso}')

Ridge Regression Mean Absolute Error (MAE): 519.2830293668853
Ridge Regression Mean Squared Error (MSE): 561048.9591886444
Ridge Regression Root Mean Squared Error (RMSE): 749.0320147955256
Ridge Regression R-squared (R²): 0.9188294326115042

Lasso Regression Mean Absolute Error (MAE): 518.9346656995976
Lasso Regression Mean Squared Error (MSE): 561018.8427669953
Lasso Regression Root Mean Squared Error (RMSE): 749.0119109647025
Lasso Regression R-squared (R²): 0.918833789748244


In [61]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import Ridge, Lasso
from sklearn.preprocessing import StandardScaler

# Scale features
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features_encoded)
# Define models
ridge_model = Ridge(alpha=1.0)  
lasso_model = Lasso(alpha=0.1)  
# Set up K-Fold Cross-Validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)  # 5-Fold Cross-Validation

# Evaluate Ridge Regression with Cross-Validation
ridge_scores = cross_val_score(ridge_model, features_scaled, target, cv=kf, scoring='neg_mean_squared_error')
ridge_mse_scores = -ridge_scores  # Convert to positive MSE
ridge_rmse_scores = np.sqrt(ridge_mse_scores)

print(f'Ridge Regression 5-Fold Cross-Validation MSE Scores: {ridge_mse_scores}')
print(f'Ridge Regression 5-Fold Cross-Validation RMSE Scores: {ridge_rmse_scores}')
print(f'Ridge Regression Mean RMSE: {np.mean(ridge_rmse_scores)}')

# Evaluate Lasso Regression with Cross-Validation
lasso_scores = cross_val_score(lasso_model, features_scaled, target, cv=kf, scoring='neg_mean_squared_error')
lasso_mse_scores = -lasso_scores  # Convert to positive MSE
lasso_rmse_scores = np.sqrt(lasso_mse_scores)

print(f'Lasso Regression 5-Fold Cross-Validation MSE Scores: {lasso_mse_scores}')
print(f'Lasso Regression 5-Fold Cross-Validation RMSE Scores: {lasso_rmse_scores}')
print(f'Lasso Regression Mean RMSE: {np.mean(lasso_rmse_scores)}')

Ridge Regression 5-Fold Cross-Validation MSE Scores: [561063.29875665 553833.35195452 592548.98708351 590081.47698063
 596332.86153167]
Ridge Regression 5-Fold Cross-Validation RMSE Scores: [749.0415868  744.19980647 769.77203579 768.16760995 772.22591353]
Ridge Regression Mean RMSE: 760.6813905097517


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Lasso Regression 5-Fold Cross-Validation MSE Scores: [561081.217246   553904.72826491 592556.3350215  587509.28126472
 596120.73279534]
Lasso Regression 5-Fold Cross-Validation RMSE Scores: [749.05354765 744.24776    769.77680858 766.49154024 772.08855243]
Lasso Regression Mean RMSE: 760.3316417799793


  model = cd_fast.enet_coordinate_descent(
