In [53]:
import sqlite3
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder

# Read the diamonds_train.db file
conn = sqlite3.connect('diamonds_train.db')

# Fetch the required tables
df_dim = pd.read_sql_query('SELECT * FROM diamonds_dimensions', conn)
df_trans = pd.read_sql_query('SELECT * FROM diamonds_transactional', conn)
df_prop = pd.read_sql_query('SELECT * FROM diamonds_properties', conn)
df_cut = pd.read_sql_query('SELECT * FROM diamonds_cut', conn)
df_color = pd.read_sql_query('SELECT * FROM diamonds_color', conn)
df_clarity = pd.read_sql_query('SELECT * FROM diamonds_clarity', conn)
df_city = pd.read_sql_query('SELECT * FROM diamonds_city', conn)

# Merge the tables based on index_id
df = pd.merge(df_dim, df_trans, on='index_id')
df = pd.merge(df, df_prop, on='index_id')
df = pd.merge(df, df_cut, on='cut_id')
df = pd.merge(df, df_color, on='color_id')
df = pd.merge(df, df_clarity, on='clarity_id')
df = pd.merge(df, df_city, on='city_id')

# Create X and y
X = df[['carat', 'cut', 'color', 'clarity', 'x', 'y', 'z', 'depth', 'table', 'city']]
y = df['price']

# Convert non-numeric columns to numeric using OrdinalEncoder
ordinal_features = ['cut', 'color', 'clarity', 'city']
ordinal_encoder = OrdinalEncoder()
X[ordinal_features] = ordinal_encoder.fit_transform(X[ordinal_features])

# Convert 'city' column to string type
X['city'] = X['city'].astype(str)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[ordinal_features] = ordinal_encoder.fit_transform(X[ordinal_features])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['city'] = X['city'].astype(str)


In [54]:
# Scale numerical features
numeric_features = ['carat', 'x', 'y', 'z', 'depth', 'table']
scaler = StandardScaler()
X[numeric_features] = scaler.fit_transform(X[numeric_features])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Pipeline for preprocessing and modeling
preprocessing = ColumnTransformer(
    [('vectorizer', CountVectorizer(), 'city')],
    remainder='passthrough'
)

model = Pipeline([
    ('preprocessing', preprocessing),
    ('feature_selection', SelectKBest(score_func=f_regression)),
    ('regressor', RandomForestRegressor())
])

# GridSearchCV for hyperparameter tuning
param_grid = {
    'feature_selection__k': [5, 7, 10],
    'regressor__n_estimators': [100, 200],
    'regressor__max_depth': [None, 10, 20]
}

grid_search = GridSearchCV(model, param_grid, scoring='neg_root_mean_squared_error', cv=5)
grid_search.fit(X_train, y_train)

# Best model from GridSearchCV
best_model = grid_search.best_estimator_

# Evaluate on test set
y_pred = best_model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Root Mean Squared Error:", rmse)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[numeric_features] = scaler.fit_transform(X[numeric_features])


Root Mean Squared Error: 593.9563115316485


In [55]:
diamantes_test = pd.read_csv('diamonds_test.csv')
diamantes_test

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,city
0,0,0.79,Very Good,F,SI1,62.7,60.0,5.82,5.89,3.67,Amsterdam
1,1,1.20,Ideal,J,VS1,61.0,57.0,6.81,6.89,4.18,Surat
2,2,1.57,Premium,H,SI1,62.2,61.0,7.38,7.32,4.57,Kimberly
3,3,0.90,Very Good,F,SI1,63.8,54.0,6.09,6.13,3.90,Kimberly
4,4,0.50,Very Good,F,VS1,62.9,58.0,5.05,5.09,3.19,Amsterdam
...,...,...,...,...,...,...,...,...,...,...,...
13480,13480,0.57,Ideal,E,SI1,61.9,56.0,5.35,5.32,3.30,Amsterdam
13481,13481,0.71,Ideal,I,VS2,62.2,55.0,5.71,5.73,3.56,New York City
13482,13482,0.70,Ideal,F,VS1,61.6,55.0,5.75,5.71,3.53,Tel Aviv
13483,13483,0.70,Very Good,F,SI2,58.8,57.0,5.85,5.89,3.45,Surat


In [56]:
# Check for null values in the dataset
print(diamantes_test.isnull().sum())

# Drop rows with null values
diamantes_test.dropna(inplace=True)

# Remove commas and convert numeric columns to float
numeric_cols = ['carat', 'x', 'y', 'z', 'depth', 'table']
for col in numeric_cols:
    diamantes_test[col] = diamantes_test[col].astype(str).str.replace(',', '').astype(float)

# Reorder the columns to match the order used during training
columns_order = ['carat', 'cut', 'color', 'clarity', 'x', 'y', 'z', 'depth', 'table', 'city']
diamantes_test = diamantes_test[columns_order]

id         0
carat      0
cut        0
color      0
clarity    0
depth      0
table      0
x          0
y          0
z          0
city       0
dtype: int64


In [57]:
# Encode categorical variables

from sklearn.preprocessing import OrdinalEncoder

# Initialize the encoders
encoder_cut = OrdinalEncoder()
encoder_color = OrdinalEncoder()
encoder_clarity = OrdinalEncoder()
encoder_city = OrdinalEncoder()

diamantes_test.loc[:, 'cut'] = encoder_cut.fit_transform(diamantes_test[['cut']])
diamantes_test.loc[:, 'color'] = encoder_color.fit_transform(diamantes_test[['color']])
diamantes_test.loc[:, 'clarity'] = encoder_clarity.fit_transform(diamantes_test[['clarity']])
diamantes_test.loc[:, 'city'] = encoder_city.fit_transform(diamantes_test[['city']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  diamantes_test.loc[:, 'cut'] = encoder_cut.fit_transform(diamantes_test[['cut']])
  diamantes_test.loc[:, 'cut'] = encoder_cut.fit_transform(diamantes_test[['cut']])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  diamantes_test.loc[:, 'color'] = encoder_color.fit_transform(diamantes_test[['color']])
  diamantes_test.loc[:, 'color'] = encoder_color.fit_transform(diamantes_test[['color']])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

Se

In [58]:
diamantes_test.columns

Index(['carat', 'cut', 'color', 'clarity', 'x', 'y', 'z', 'depth', 'table',
       'city'],
      dtype='object')

In [69]:
diamantes_test = diamantes_test[['carat', 'cut', 'color', 'clarity', 'x', 'y', 'z', 'depth', 'table','city']]

In [70]:
# Scale numerical features
scaler = StandardScaler()
diamantes_test[['carat', 'x', 'y', 'z', 'depth', 'table']] = scaler.fit_transform(diamantes_test[['carat', 'x', 'y', 'z', 'depth', 'table']])

In [86]:
# Perform one-hot encoding on the 'cut' column in the test dataset
diamantes_test_encoded = pd.get_dummies(diamantes_test, columns=['cut'])

# Ensure the columns in the test dataset match the columns used during training
missing_cols = set(X_train.columns) - set(diamantes_test_encoded.columns)
for col in missing_cols:
    diamantes_test_encoded[col] = 0

# Reorder columns to match the training dataset
diamantes_test_encoded = diamantes_test_encoded[X_train.columns]

# Scale numerical features
diamantes_test_encoded[['carat', 'x', 'y', 'z', 'depth', 'table']] = scaler.transform(diamantes_test_encoded[['carat', 'x', 'y', 'z', 'depth', 'table']])

# Make predictions
y_final = model.predict(diamantes_test_encoded)

ValueError: could not convert string to float: 'F'

In [73]:
y_final = model.predict(diamantes_test)
y_final

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- cut_Fair
- cut_Good
- cut_Ideal
- cut_Premium
- cut_Very Good
Feature names seen at fit time, yet now missing:
- cut


In [49]:
submission.to_csv("./submi_test02.csv", index=False)