In [7]:
import pandas as pd 

#Read in the Diamonds Data Set.
data = pd.read_csv('diamonds.csv')
df = pd.DataFrame(data)
df.drop(columns=['Unnamed: 0'], inplace=True)
df.head(50)


Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75
5,0.24,Very Good,J,VVS2,62.8,57.0,336,3.94,3.96,2.48
6,0.24,Very Good,I,VVS1,62.3,57.0,336,3.95,3.98,2.47
7,0.26,Very Good,H,SI1,61.9,55.0,337,4.07,4.11,2.53
8,0.22,Fair,E,VS2,65.1,61.0,337,3.87,3.78,2.49
9,0.23,Very Good,H,VS1,59.4,61.0,338,4.0,4.05,2.39


In [2]:
# Define a mapping of values to numbers
print(df['cut'].unique())
value_mapping = {'Fair': 1, 'Good': 2, 'Very Good': 3, 'Premium': 4, 'Ideal': 5}

# Map the values in the categorical column to numbers
df['cut'] = df['cut'].map(value_mapping)

['Ideal' 'Premium' 'Good' 'Very Good' 'Fair']


In [3]:
#print(df['clarity'].unique())
#print(df['color'].unique())
# OHE for the unit_id, process_id, and operator_id
df_encoded = pd.get_dummies(df, columns=['color', 'clarity'], drop_first=True)

#find One-Hot Encoded columns that contain the word color and change their type to int 
color_columns = [col for col in df_encoded.columns if 'color' in col.lower()]
df_encoded[color_columns] = df_encoded[color_columns].astype(int)

#find One-Hot Encoded columns that contain the word clarity and change their type to int 
color_columns = [col for col in df_encoded.columns if 'clarity' in col.lower()]
df_encoded[color_columns] = df_encoded[color_columns].astype(int)

df_encoded.head(50)




Unnamed: 0,carat,cut,depth,table,price,x,y,z,color_E,color_F,...,color_H,color_I,color_J,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
0,0.23,5,61.5,55.0,326,3.95,3.98,2.43,1,0,...,0,0,0,0,0,1,0,0,0,0
1,0.21,4,59.8,61.0,326,3.89,3.84,2.31,1,0,...,0,0,0,0,1,0,0,0,0,0
2,0.23,2,56.9,65.0,327,4.05,4.07,2.31,1,0,...,0,0,0,0,0,0,1,0,0,0
3,0.29,4,62.4,58.0,334,4.2,4.23,2.63,0,0,...,0,1,0,0,0,0,0,1,0,0
4,0.31,2,63.3,58.0,335,4.34,4.35,2.75,0,0,...,0,0,1,0,0,1,0,0,0,0
5,0.24,3,62.8,57.0,336,3.94,3.96,2.48,0,0,...,0,0,1,0,0,0,0,0,0,1
6,0.24,3,62.3,57.0,336,3.95,3.98,2.47,0,0,...,0,1,0,0,0,0,0,0,1,0
7,0.26,3,61.9,55.0,337,4.07,4.11,2.53,0,0,...,1,0,0,0,1,0,0,0,0,0
8,0.22,1,65.1,61.0,337,3.87,3.78,2.49,1,0,...,0,0,0,0,0,0,0,1,0,0
9,0.23,3,59.4,61.0,338,4.0,4.05,2.39,0,0,...,1,0,0,0,0,0,1,0,0,0


In [4]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Assuming 'X' contains your features and 'y' contains the target variable
# Split the data into training and testing sets
model_data = df_encoded.drop(columns=['price'])

X_train, X_test, y_train, y_test = train_test_split(model_data, df_encoded['price'], test_size=0.2, random_state=42)

# Create a KNN Regressor instance
knn_regressor = KNeighborsRegressor(n_neighbors=5)  # You can adjust the number of neighbors (k)

# Fit the model to the training data
knn_regressor.fit(X_train, y_train)

# Make predictions on the test set
predictions = knn_regressor.predict(X_test)

df_pred = pd.DataFrame({'Actual': y_test, 'Predicted': predictions})
df_pred.head(50)

Unnamed: 0,Actual,Predicted
1388,559,607.0
50052,2201,2302.6
41645,1238,1083.4
42377,1304,1128.4
17244,6901,10583.4
1608,3011,3568.0
46398,1765,1979.6
45493,1679,1829.4
49385,2102,2390.6
10460,4789,6981.2


In [28]:
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, predictions)

print(f'Mean Squared Error: {mse:.2f}')
print(f'Root Mean Squared Error: {rmse:.2f}')
print(f'R-squared: {r2:.2f}')

Mean Squared Error: 916916.95
Root Mean Squared Error: 957.56
R-squared: 0.94


In [10]:
for col in df.columns: 
    print(col)
    print(type(col))
    print(f'Max: {df[col].max()}, Min: {df[col].min()}')

carat
<class 'str'>
Max: 5.01, Min: 0.2
cut
<class 'str'>
Max: Very Good, Min: Fair
color
<class 'str'>
Max: J, Min: D
clarity
<class 'str'>
Max: VVS2, Min: I1
depth
<class 'str'>
Max: 79.0, Min: 43.0
table
<class 'str'>
Max: 95.0, Min: 43.0
price
<class 'str'>
Max: 18823, Min: 326
x
<class 'str'>
Max: 10.74, Min: 0.0
y
<class 'str'>
Max: 58.9, Min: 0.0
z
<class 'str'>
Max: 31.8, Min: 0.0
