In [2]:
import sqlite3
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Read the diamonds_train.db file
conn = sqlite3.connect('diamonds_train.db')

# Fetch the required tables
df_dim = pd.read_sql_query('SELECT * FROM diamonds_dimensions', conn)
df_trans = pd.read_sql_query('SELECT * FROM diamonds_transactional', conn)
df_prop = pd.read_sql_query('SELECT * FROM diamonds_properties', conn)
df_cut = pd.read_sql_query('SELECT * FROM diamonds_cut', conn)
df_color = pd.read_sql_query('SELECT * FROM diamonds_color', conn)
df_clarity = pd.read_sql_query('SELECT * FROM diamonds_clarity', conn)
df_city = pd.read_sql_query('SELECT * FROM diamonds_city', conn)

# Merge the tables based on index_id
df = pd.merge(df_dim, df_trans, on='index_id')
df = pd.merge(df, df_prop, on='index_id')
df = pd.merge(df, df_cut, on='cut_id')
df = pd.merge(df, df_color, on='color_id')
df = pd.merge(df, df_clarity, on='clarity_id')
df = pd.merge(df, df_city, on='city_id')

# Check for null values in the dataset
print(df.isnull().sum())

# Fill missing values with mean
df.fillna(df.select_dtypes(include='number').mean(), inplace=True)

# Split into X and y
X = df[['carat', 'cut', 'color', 'clarity', 'x', 'y', 'z', 'depth', 'table', 'city']]
y = df['price']

# Encode categorical variables
encoder = OneHotEncoder(sparse=False)
X_encoded = encoder.fit_transform(X[['cut', 'color', 'clarity', 'city']])

# Get column names for encoded features
encoded_cols = encoder.get_feature_names_out(['cut', 'color', 'clarity', 'city'])
X_encoded_df = pd.DataFrame(X_encoded, columns=encoded_cols)

# Scale numerical features
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X[['carat', 'x', 'y', 'z', 'depth', 'table']]))
X_scaled.columns = ['carat', 'x', 'y', 'z', 'depth', 'table']

# Concatenate encoded categorical variables and scaled numerical features
X_processed = pd.concat([X_encoded_df, X_scaled], axis=1)

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=42)

# Train the model
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# Evaluate the model
rmse = mean_squared_error(y_test, model.predict(X_test), squared=False)
print("RMSE:", rmse)

index_id      0
depth         0
table         0
x             0
y             0
z             0
price         0
city_id       0
carat         0
cut_id        0
color_id      0
clarity_id    0
cut           0
color         0
clarity       0
city          0
dtype: int64




RMSE: 589.8332466739099


In [3]:
diamantes_test = pd.read_csv('diamonds_test.csv')
diamantes_test

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,city
0,0,0.79,Very Good,F,SI1,62.7,60.0,5.82,5.89,3.67,Amsterdam
1,1,1.20,Ideal,J,VS1,61.0,57.0,6.81,6.89,4.18,Surat
2,2,1.57,Premium,H,SI1,62.2,61.0,7.38,7.32,4.57,Kimberly
3,3,0.90,Very Good,F,SI1,63.8,54.0,6.09,6.13,3.90,Kimberly
4,4,0.50,Very Good,F,VS1,62.9,58.0,5.05,5.09,3.19,Amsterdam
...,...,...,...,...,...,...,...,...,...,...,...
13480,13480,0.57,Ideal,E,SI1,61.9,56.0,5.35,5.32,3.30,Amsterdam
13481,13481,0.71,Ideal,I,VS2,62.2,55.0,5.71,5.73,3.56,New York City
13482,13482,0.70,Ideal,F,VS1,61.6,55.0,5.75,5.71,3.53,Tel Aviv
13483,13483,0.70,Very Good,F,SI2,58.8,57.0,5.85,5.89,3.45,Surat


In [4]:
# Check for null values in the dataset
print(diamantes_test.isnull().sum())

# Drop rows with null values
diamantes_test.dropna(inplace=True)

# Remove commas and convert numeric columns to float
numeric_cols = ['carat', 'x', 'y', 'z', 'depth', 'table']
for col in numeric_cols:
    diamantes_test[col] = diamantes_test[col].astype(str).str.replace(',', '').astype(float)

# Reorder the columns to match the order used during training
columns_order = ['carat', 'cut', 'color', 'clarity', 'x', 'y', 'z', 'depth', 'table', 'city']
diamantes_test = diamantes_test[columns_order]

id         0
carat      0
cut        0
color      0
clarity    0
depth      0
table      0
x          0
y          0
z          0
city       0
dtype: int64


In [5]:
X_test.columns

Index(['cut_Fair', 'cut_Good', 'cut_Ideal', 'cut_Premium', 'cut_Very Good',
       'color_D', 'color_E', 'color_F', 'color_G', 'color_H', 'color_I',
       'color_J', 'clarity_I1', 'clarity_IF', 'clarity_SI1', 'clarity_SI2',
       'clarity_VS1', 'clarity_VS2', 'clarity_VVS1', 'clarity_VVS2',
       'city_Amsterdam', 'city_Antwerp', 'city_Dubai', 'city_Kimberly',
       'city_Las Vegas', 'city_London', 'city_Luxembourg', 'city_Madrid',
       'city_New York City', 'city_Paris', 'city_Surat', 'city_Tel Aviv',
       'city_Zurich', 'carat', 'x', 'y', 'z', 'depth', 'table'],
      dtype='object')

In [6]:
diamantes_test.columns

Index(['carat', 'cut', 'color', 'clarity', 'x', 'y', 'z', 'depth', 'table',
       'city'],
      dtype='object')

In [9]:
# Scale numerical features
scaler = StandardScaler()
diamantes_test[['carat', 'x', 'y', 'z', 'depth', 'table']] = scaler.fit_transform(diamantes_test[['carat', 'x', 'y', 'z', 'depth', 'table']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  diamantes_test[['carat', 'x', 'y', 'z', 'depth', 'table']] = scaler.fit_transform(diamantes_test[['carat', 'x', 'y', 'z', 'depth', 'table']])


In [10]:
# Encode categorical variables in the testing data
X_test_encoded = encoder.transform(X_test[['cut', 'color', 'clarity', 'city']])

# Get column names for encoded features in the testing data
encoded_cols_test = encoder.get_feature_names_out(['cut', 'color', 'clarity', 'city'])
X_test_encoded_df = pd.DataFrame(X_test_encoded, columns=encoded_cols_test)

# Concatenate encoded categorical variables and scaled numerical features for testing data
X_test_processed = pd.concat([X_test_encoded_df, X_test[['carat', 'x', 'y', 'z', 'depth', 'table']]], axis=1)

# Evaluate the model with the modified testing data
rmse = mean_squared_error(y_test, model.predict(X_test_processed), squared=False)
print("RMSE:", rmse)

KeyError: "None of [Index(['cut', 'color', 'clarity', 'city'], dtype='object')] are in the [columns]"

In [None]:
y_final = model.predict(diamantes_test)
y_final