In [13]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score

In [14]:
# Fetch seaborn's bundled "diamonds" example dataset as a DataFrame.
df = sns.load_dataset("diamonds")

In [15]:
df  # Display the loaded dataset; the rendered output abbreviates the middle rows with "..."

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...,...
53935,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50
53936,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61
53937,0.70,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56
53938,0.86,Premium,H,SI2,61.0,58.0,2757,6.15,6.12,3.74


**Goal:** Predict the price of a diamond.

In [16]:
# Summarise column dtypes and non-null counts to see which columns need encoding
# and whether any values are missing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype   
---  ------   --------------  -----   
 0   carat    53940 non-null  float64 
 1   cut      53940 non-null  category
 2   color    53940 non-null  category
 3   clarity  53940 non-null  category
 4   depth    53940 non-null  float64 
 5   table    53940 non-null  float64 
 6   price    53940 non-null  int64   
 7   x        53940 non-null  float64 
 8   y        53940 non-null  float64 
 9   z        53940 non-null  float64 
dtypes: category(3), float64(6), int64(1)
memory usage: 3.0 MB


In [17]:
# Numeric columns of the frame. Note this includes the target `price`; the name
# is not used by the other cells shown in this notebook.
numerical_values = df.select_dtypes(include='number').columns

# Select the categorical columns (cut, color, clarity) explicitly by dtype.
# The previous `exclude='number'` selection also matches the boolean dummy
# columns this cell creates, so re-running the cell would one-hot encode the
# dummies a second time and corrupt the feature names. Selecting `category`
# makes the cell idempotent: on a re-run nothing is left to encode.
category_values = df.select_dtypes(include='category').columns

# One-hot encode the categoricals. drop_first=True drops the first level of each
# category so the remaining indicator columns are not perfectly collinear.
df = pd.get_dummies(df, columns=category_values, drop_first=True)

In [18]:
# Display the frame after one-hot encoding to confirm the new indicator columns.
df

Unnamed: 0,carat,depth,table,price,x,y,z,cut_Premium,cut_Very Good,cut_Good,...,color_H,color_I,color_J,clarity_VVS1,clarity_VVS2,clarity_VS1,clarity_VS2,clarity_SI1,clarity_SI2,clarity_I1
0,0.23,61.5,55.0,326,3.95,3.98,2.43,False,False,False,...,False,False,False,False,False,False,False,False,True,False
1,0.21,59.8,61.0,326,3.89,3.84,2.31,True,False,False,...,False,False,False,False,False,False,False,True,False,False
2,0.23,56.9,65.0,327,4.05,4.07,2.31,False,False,True,...,False,False,False,False,False,True,False,False,False,False
3,0.29,62.4,58.0,334,4.20,4.23,2.63,True,False,False,...,False,True,False,False,False,False,True,False,False,False
4,0.31,63.3,58.0,335,4.34,4.35,2.75,False,False,True,...,False,False,True,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53935,0.72,60.8,57.0,2757,5.75,5.76,3.50,False,False,False,...,False,False,False,False,False,False,False,True,False,False
53936,0.72,63.1,55.0,2757,5.69,5.75,3.61,False,False,True,...,False,False,False,False,False,False,False,True,False,False
53937,0.70,62.8,60.0,2757,5.66,5.68,3.56,False,True,False,...,False,False,False,False,False,False,False,True,False,False
53938,0.86,61.0,58.0,2757,6.15,6.12,3.74,True,False,False,...,True,False,False,False,False,False,False,False,True,False


In [19]:
# Setting the training and testing split.

# Features are every column except the target; the target is the price.
X = df.drop(columns='price')
y = df['price']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [20]:
# Baseline model: fit a linear regression on the training split,
# then predict prices for the held-out test split.
model = LinearRegression().fit(X_train, y_train)
predictions = model.predict(X_test)

# Score the test-set predictions: mean absolute error (in price units) and R^2.
mae = mean_absolute_error(y_true=y_test, y_pred=predictions)
r2 = r2_score(y_true=y_test, y_pred=predictions)

print(f"Mean Absolute Error: ${mae:.2f}")
print(f"R2 Score: {r2:.4f}")

Mean Absolute Error: $737.15
R2 Score: 0.9189


In [21]:
# Random forest regressor on the same split; the fixed seed makes the fit repeatable.
# NOTE: this rebinds `model`, `predictions`, `mae` and `r2` from the linear-regression cell.
forest = RandomForestRegressor(random_state=42)
model = forest.fit(X_train, y_train)
predictions = model.predict(X_test)

# Score the test-set predictions with the same metrics as the baseline.
mae = mean_absolute_error(y_true=y_test, y_pred=predictions)
r2 = r2_score(y_true=y_test, y_pred=predictions)

print(f"Mean Absolute Error: ${mae:.2f}")
print(f"R2 Score: {r2:.4f}")

Mean Absolute Error: $277.05
R2 Score: 0.9805


In [22]:
# Put the model's error in context by expressing it relative to the average price.
mean_price = df['price'].mean()
print(f"Average Price: ${mean_price:.2f}")

# Calculate error percentage from the live `mae` variable rather than a number
# copied from earlier printed output, so the figure cannot go stale when the
# model, seed or split changes. `mae` is whatever the most recently executed
# model cell computed (the random forest when the cells run top to bottom).
error_percent = (mae / mean_price) * 100
print(f"Error Percentage: {error_percent:.2f}%")

Average Price: $3932.80
Error Percentage: 7.04%


In [32]:
# Let's try finding a predicted value.
# First list the feature columns the model was trained on, in order.
feature_names = list(X_train.columns)
print(feature_names)

['carat', 'depth', 'table', 'x', 'y', 'z', 'cut_Premium', 'cut_Very Good', 'cut_Good', 'cut_Fair', 'color_E', 'color_F', 'color_G', 'color_H', 'color_I', 'color_J', 'clarity_VVS1', 'clarity_VVS2', 'clarity_VS1', 'clarity_VS2', 'clarity_SI1', 'clarity_SI2', 'clarity_I1']


In [31]:
# A real training row, kept for reference only; it is not used in the prediction below.
my_diamond = X_train.iloc[[0]].copy()

# 1. Describe the diamond to price. Only the non-zero indicator flags need to be
#    listed. Leaving every flag of a group at 0 selects the level dropped by
#    drop_first=True, i.e. the one absent from the training columns
#    (cut=Ideal, color=D, clarity=IF).
new_diamond = pd.DataFrame({
    'carat': [1.2],
    'depth': [61.5],
    'table': [55.0],
    'x': [6.8],
    'y': [6.8],
    'z': [4.2],

    # Set the flags found in the training column list
    'cut_Premium': [1],    # We want Premium
    'color_H': [1],        # We want Color H
    'clarity_VS1': [1]     # We want Clarity VS1
})

# Guard against typos: reindex() below silently drops any column the model does
# not know, which would price a different diamond than intended without warning.
unknown_columns = new_diamond.columns.difference(X_train.columns)
if len(unknown_columns) > 0:
    raise ValueError(f"Unknown feature columns: {unknown_columns.tolist()}")

# 2. Align with the model: same columns in the same order as the training data
#    (fills missing columns like 'cut_Fair' with 0).
final_input = new_diamond.reindex(columns=X_train.columns, fill_value=0)

# 3. Predict with whichever estimator `model` currently refers to.
price = model.predict(final_input)
print(f"Predicted Price (Premium): ${price[0]:.2f}")

Predicted Price (Premium): $6856.86
