In [1]:
import pandas as pd
import numpy as np

In [2]:
df=pd.read_csv('./data/gemstone.csv')
df.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [3]:
df=df.drop('id',axis=1)
df.sample()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
171194,0.37,Ideal,H,VS1,62.0,55.0,4.68,4.65,2.88,788


In [4]:
# Independent Features
X=df.drop('price',axis=1)
X.sample()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
147329,0.55,Ideal,E,VS2,61.1,57.0,5.27,5.29,3.23


In [5]:
# Dependent Feature
y=df[['price']]
y

Unnamed: 0,price
0,13619
1,13387
2,2772
3,666
4,14453
...,...
193568,1130
193569,2874
193570,3036
193571,681


In [6]:
# Segregating features for preprocessing

categorical_cols=X.select_dtypes(include='object').columns
categorical_cols

Index(['cut', 'color', 'clarity'], dtype='object')

In [7]:

numerical_cols=X.select_dtypes(exclude='object',).columns
numerical_cols

Index(['carat', 'depth', 'table', 'x', 'y', 'z'], dtype='object')

In [8]:
# ranking of categorical features
cut_categories = ['Fair', 'Good', 'Very Good','Premium','Ideal']
color_categories = ['D', 'E', 'F', 'G', 'H', 'I', 'J']
clarity_categories = ['I1','SI2','SI1','VS2','VS1','VVS2','VVS1','IF']


In [9]:
# importing libraries for preprocessing
from sklearn.impute import SimpleImputer 
from sklearn.preprocessing import StandardScaler 
from sklearn.preprocessing import OrdinalEncoder

In [10]:
# Libraries for pipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer


In [11]:
# Numerical Pipeline
num_pipeline=Pipeline(steps=[
   ('imputer',SimpleImputer(strategy='median')),
    ( 'scalar',StandardScaler())
])


# Categorical pipeline

cat_pipeline=Pipeline(steps=[
    ('imputer',SimpleImputer(strategy='most_frequent')),
    ('ordinalencoder',OrdinalEncoder(categories=[cut_categories,color_categories,clarity_categories])),
    ('scaler',StandardScaler())
])



# Combining transformations into a preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num_pipeline', num_pipeline, numerical_cols),
        ('cat_pipeline', cat_pipeline, categorical_cols)],
    remainder='passthrough'  # Keeps the remaining columns as it is
)



In [12]:
# Train test split

from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.30,random_state=30)

In [13]:
X_train=pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())
X_test=pd.DataFrame(preprocessor.transform(X_test),columns=preprocessor.get_feature_names_out())

In [14]:
X_train.head()

Unnamed: 0,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,num_pipeline__z,cat_pipeline__cut,cat_pipeline__color,cat_pipeline__clarity
0,-0.975439,-0.849607,-0.121531,-1.042757,-1.08097,-1.12315,0.874076,1.528722,1.352731
1,0.235195,1.833637,-0.121531,0.318447,0.279859,0.485354,-2.144558,-0.935071,-0.646786
2,0.494617,0.815855,0.3998,0.570855,0.606458,0.673737,-0.132136,0.296826,0.686225
3,-1.018676,0.260701,0.921131,-1.214034,-1.24427,-1.195605,-0.132136,0.296826,0.01972
4,-0.953821,-0.664555,-0.642862,-1.069801,-1.044681,-1.094168,0.874076,2.14467,1.352731


In [15]:
## Model Training

from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error


In [16]:
regression=LinearRegression()
regression.fit(X_train,y_train)

In [17]:
# Intercept 
regression.intercept_

array([3970.76628955])

In [18]:
# Regression coeffients of features
regression.coef_



array([[ 6433.66003594,  -132.75843566,   -70.42922179, -1720.30971463,
         -499.29302619,   -63.39317848,    72.44537247,  -460.41604642,
          650.76431652]])

In [19]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [20]:
# Initialize models
models = {
    'Linear Regression': LinearRegression(),
    'Lasso Regression': Lasso(),
    'Ridge Regression': Ridge(),
    'ElasticNet Regression': ElasticNet()
}

In [21]:
# Train and evaluate each model
for name, model in models.items():
    # Train model
    model.fit(X_train, y_train)
    
    # Predict on test data
    y_pred = model.predict(X_test)
    
    # Calculate metrics
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    
    # Adjusted R²
    n = X_test.shape[0]
    p = X_test.shape[1]
    adj_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)
    
    # Results
    print(f"Model: {name}")
    print('='*40)
    print(f"  MAE: {mae:.6f}")
    print(f"  MSE: {mse:.6f}")
    print(f"  RMSE: {rmse:.6f}")
    print(f"  R2: {r2:.6f}")
    print(f"  Adjusted R2: {adj_r2*100:.6f}\n")

    

Model: Linear Regression
  MAE: 674.025512
  MSE: 1028002.759813
  RMSE: 1013.904709
  R2: 0.936891
  Adjusted R2: 93.688104



Model: Lasso Regression
  MAE: 675.071692
  MSE: 1027949.455969
  RMSE: 1013.878423
  R2: 0.936894
  Adjusted R2: 93.688432

Model: Ridge Regression
  MAE: 674.055580
  MSE: 1028005.229368
  RMSE: 1013.905927
  R2: 0.936891
  Adjusted R2: 93.688089

Model: ElasticNet Regression
  MAE: 1060.736876
  MSE: 2351365.382290
  RMSE: 1533.416246
  R2: 0.855649
  Adjusted R2: 85.562711



## Optimizing Model Performance with GridSearchCV and Cross-Validation

In [22]:
from sklearn.model_selection import GridSearchCV, cross_val_score

In [23]:
# Define parameter grids for GridSearchCV
param_grids = {
    'Lasso Regression': {'alpha': [0.001, 0.01, 0.1, 1, 10, 100]},
    'Ridge Regression': {'alpha': [0.001, 0.01, 0.1, 1, 10, 100]},
    'ElasticNet Regression': {'alpha': [0.001, 0.01, 0.1, 1, 10, 100], 'l1_ratio': [0.1, 0.5, 0.9]}
}

# Initialize storage for results
results = {}

# Train, tune, and evaluate each model
for name, model in models.items():
    if name in param_grids:
        # Grid Search CV
        grid_search = GridSearchCV(model, param_grids[name], cv=5, scoring='neg_mean_squared_error')
        grid_search.fit(X_train, y_train.values.ravel())
        best_model = grid_search.best_estimator_
        best_params = grid_search.best_params_
        best_score = grid_search.best_score_
    else:
        best_model = model
        best_params = 'N/A'
        best_score = 'N/A'
    
    # Cross-validation for model performance
    cv_scores = cross_val_score(best_model, X_train, y_train.values.ravel(), cv=5, scoring='neg_mean_squared_error')
    mean_cv_score = np.mean(cv_scores)
    
    # Fit the best model to calculate R²
    best_model.fit(X_train, y_train)
    y_pred = best_model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    
    # Store results
    results[name] = {
        'Best Params': best_params,
        'Best CV Score': best_score,
        'Mean CV Score': mean_cv_score,
        'R² Score': r2
    }
# Print results
for name, metrics in results.items():
    print(f"Model: {name}")
    print(f"  Best Parameters: {metrics['Best Params']}")
    print(f"  Best CV Score (Negative MSE): {metrics['Best CV Score']}")
    print(f"  Mean CV Score (Negative MSE): {metrics['Mean CV Score']}")
    print(f"  R² Score: {metrics['R² Score']*100:.6f}\n")



  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

Model: Linear Regression
  Best Parameters: N/A
  Best CV Score (Negative MSE): N/A
  Mean CV Score (Negative MSE): -1039319.7677805178
  R² Score: 93.689082

Model: Lasso Regression
  Best Parameters: {'alpha': 1}
  Best CV Score (Negative MSE): -1035390.1954545463
  Mean CV Score (Negative MSE): -1035390.1954545463
  R² Score: 93.689410

Model: Ridge Regression
  Best Parameters: {'alpha': 10}
  Best CV Score (Negative MSE): -1039226.994215674
  Mean CV Score (Negative MSE): -1039226.994215674
  R² Score: 93.688884

Model: ElasticNet Regression
  Best Parameters: {'alpha': 0.001, 'l1_ratio': 0.9}
  Best CV Score (Negative MSE): -1037908.3153496258
  Mean CV Score (Negative MSE): -1037908.3153496258
  R² Score: 93.689180



  model = cd_fast.enet_coordinate_descent(


In [24]:
# Determine and print the best model based on mean CV score
best_model_name = max(results, key=lambda k: results[k]['Mean CV Score'])
print(f"Best Model based on Cross-Validation: {best_model_name}")


Best Model based on Cross-Validation: Lasso Regression
