In [1]:
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


In [2]:
# Read the gemstone records from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer="gemstone.csv")

# Preview the top five rows as a quick sanity check of the columns and values.
df.head(n=5)


Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [3]:
# Separate the prediction target ('price') from the remaining columns.
y = df['price']
X = df.drop(columns=['price'])

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [4]:
# Column groups consumed by the preprocessing step.
numerical_features = ['carat', 'depth', 'table', 'x', 'y', 'z']
categorical_features = ['cut', 'color', 'clarity']

# Numeric columns are forwarded unchanged; categorical columns are one-hot
# encoded, and categories not seen during fit are ignored instead of raising.
# Only the columns named above reach the model: ColumnTransformer's default
# remainder='drop' discards everything else in X (e.g. 'id', 'Unnamed: 0').
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ]
)


In [5]:
# Chain the preprocessing step and a 100-tree random forest regressor into a
# single estimator so that the same transformations run at fit and predict time.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', RandomForestRegressor(n_estimators=100, random_state=42)),
    ]
)

# Fit the whole pipeline (preprocessing + model) on the training split.
pipeline.fit(X_train, y_train)


In [6]:
# Predict prices for the held-out test rows.
y_pred = pipeline.predict(X_test)

# Score the predictions against the true test prices.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

# Emit the report in one call; sep="\n" puts each metric on its own line.
print(
    "Model Evaluation:",
    f" - Mean Squared Error: {mse:.2f}",
    f" - Mean Absolute Error: {mae:.2f}",
    f" - R^2 Score: {r2:.2f}",
    sep="\n",
)


Model Evaluation:
 - Mean Squared Error: 369655.19
 - Mean Absolute Error: 310.09
 - R^2 Score: 0.98


In [7]:
# Serialize the fitted pipeline (preprocessing + model) to model.pkl in the
# working directory; the context manager closes the file handle afterwards.
with open("model.pkl", mode="wb") as model_file:
    pickle.dump(pipeline, model_file)

print("Model trained and saved as model.pkl successfully!")


Model trained and saved as model.pkl successfully!
