In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR


In [None]:
# The raw spreadsheet lives under ../data/raw relative to the working directory.
raw_path_parts = ('..', 'data', 'raw', 'ring_details.xlsx')
raw_data_path = os.path.join(*raw_path_parts)
df0 = pd.read_excel(raw_data_path)

In [None]:
# Preview the first rows of the freshly loaded sheet.
df0.head()

In [None]:
# Columns that are removed before any further processing.
unused_columns = [
    'Image',
    'Reference',
    'Series:',
    'Model:',
    'Model Variant:',
    'Category:',
    'Style Type:',
    'Range:',
    'Stone Type:',
    'Stone Description:',
    'Diamond Round / Brilliant Cut - Various mm:',
    '£ Per g',
]
df0 = df0.drop(columns=unused_columns)
df0.columns

In [None]:
# Column dtypes and non-null counts after dropping the unused columns.
df0.info()

In [None]:
# Number of missing values per column.
df0.isna().sum()

In [None]:
# Show the rows that have no 'Setting Style:' value.
df0[df0['Setting Style:'].isna()]

In [None]:
# Rows lacking a setting style or a price are discarded, then the index is
# renumbered from zero.
required_columns = ['Setting Style:', 'Price']
df0 = df0.dropna(subset=required_columns)
df0 = df0.reset_index(drop=True)
df0.isna().sum()

In [None]:
# Data Extraction
# Pull the first number out of the weight and carat columns and store it as float.
# The values are cast to str before the regex is applied, so cells that are
# already numeric are parsed instead of raising (all-numeric column) or turning
# into NaN (mixed column). This also makes the cell safe to re-run.
# Missing values contain no digits once stringified and therefore stay NaN.
number_pattern = r'(\d+\.?\d*)'
for numeric_column in ['Guide Weight:', 'Stone Size (ct):']:
    df0[numeric_column] = (
        df0[numeric_column]
        .astype(str)
        .str.extract(number_pattern, expand=False)
        .astype(float)
    )

# Strip the "mm" unit suffix; the remaining size text is parsed later, per stone shape.
df0['Stone Size (mm):'] = df0['Stone Size (mm):'].replace({' mm': '', 'mm':''}, regex=True)

In [None]:
# Re-check dtypes and non-null counts after the numeric extraction above.
df0.info()

In [None]:
# Continue on a copy so that df0 keeps the state reached so far.
df = df0.copy()

In [None]:
# Preview the working copy.
df.head()

In [None]:
def _parse_mm(value):
    """Convert one size token to float; return None when it is not a number."""
    try:
        return float(value)
    except (TypeError, ValueError):
        return None


def reorganize_stone_size(row):
    """Split a row's 'Stone Size (mm):' value into named dimensions.

    Parameters
    ----------
    row : mapping
        Must provide 'Stone Shape:' and 'Stone Size (mm):'.

    Returns
    -------
    dict
        * square-type cuts (Cushion, Princess / Square, Asscher):
          ``{'Width': s, 'Length': s}`` from a single number;
        * 'Round / Brilliant Cut': ``{'Diameter': d}`` from a single number;
        * any other shape: ``{'Length': l, 'Width': w}`` parsed from
          ``'<length> x <width>'``.

        A size that is missing, not text, or not in the expected format
        yields ``None`` for the affected dimensions instead of raising, so
        one malformed row cannot abort a whole ``DataFrame.apply``.
    """
    stone_shape = row['Stone Shape:']
    stone_size = row['Stone Size (mm):']

    if stone_shape in ['Cushion Cut', 'Princess / Square Cut', 'Asscher Cut']:
        # Single number: the stone is as long as it is wide.
        side = _parse_mm(stone_size)
        return {'Width': side, 'Length': side}

    if stone_shape == 'Round / Brilliant Cut':
        # Single number, interpreted as the diameter.
        return {'Diameter': _parse_mm(stone_size)}

    # Remaining shapes are written as 'length x width'.
    # The isinstance guard keeps NaN (a float) away from the `in` test.
    if isinstance(stone_size, str) and 'x' in stone_size:
        parts = stone_size.split('x')
        if len(parts) == 2:
            length, width = (_parse_mm(part) for part in parts)
            if length is not None and width is not None:
                return {'Length': length, 'Width': width}

    # Missing or unexpected format.
    return {'Length': None, 'Width': None}

# Parse each row's size text into a dict of dimensions (one dict per row).
stone_size_data = df.apply(reorganize_stone_size, axis=1)

# Expand those dicts into a frame with one column per dimension name.
stone_size_df = pd.DataFrame(stone_size_data.tolist())

# Attach the dimension columns and retire the raw size text in one step.
df = pd.concat([df, stone_size_df], axis=1).drop(columns=['Stone Size (mm):'])

# Preview the reshaped frame.
df.head()

In [None]:
processed_path = os.path.join('..', 'data', 'processed', 'cleaned_ring_data.csv')
# Create the output folder on first run; to_csv does not create missing directories.
os.makedirs(os.path.dirname(processed_path), exist_ok=True)
df.to_csv(processed_path, index=False)
print(f"Cleaned data saved to: {processed_path}")

In [None]:
# Data Cleaning & EDA

# 1. Fill missing values
# A missing carat value is treated as 0. The fill is written back to the existing
# 'Stone Size (ct):' column: assigning it to a differently spelled name would leave
# the NaNs in place and add a second, perfectly collinear carat column to the
# feature matrix that is built from `df` later on.
carat_col = 'Stone Size (ct):'
df[carat_col] = df[carat_col].fillna(0)

# 2. Exploratory Data Analysis (EDA)
fig, (ax_price, ax_carat) = plt.subplots(1, 2, figsize=(12, 5))

sns.histplot(df['Price'], kde=True, ax=ax_price)
ax_price.set_title('Price Distribution')

sns.scatterplot(data=df, x=carat_col, y='Price', hue='Alloy:', alpha=0.5, ax=ax_carat)
ax_carat.set_xlabel('Stone Size (ct)')
ax_carat.set_title('Stone Size vs Price')

# Larger stand-alone version of the carat/price scatter.
plt.figure(figsize=(10, 5))
sns.scatterplot(data=df, x=carat_col, y='Price', hue='Alloy:', alpha=0.6)
plt.xlabel('Stone Size (ct)')
plt.title('Relationship between Stone Carat and Price')
plt.show()

print("Data cleaning completed and EDA visualization generated.")

In [None]:
# A dimension that does not apply to a stone's shape is recorded as 0.
dimension_cols = ['Diameter', 'Length', 'Width']
df[dimension_cols] = df[dimension_cols].fillna(0)

In [None]:
# Rows for which no dimension could be parsed at all.
# The previous cell replaces missing dimensions with 0, so an `isna()` test can
# never match here. Instead, a row counts as missing when none of its three
# dimensions is non-zero (NaN is treated like 0, so the check also works before the fill).
dimension_cols = ['Diameter', 'Length', 'Width']
missing_data = df[(df[dimension_cols].fillna(0) == 0).all(axis=1)]
missing_data

In [None]:
# One-hot encode every object-typed column; the first level of each is dropped.
obj_cols = df.select_dtypes(include=['object']).columns
df = pd.get_dummies(
    df,
    columns=obj_cols,
    drop_first=True,
)
df.head()

In [None]:
# Target is 'Price'; every other column is a feature.
y = df['Price']
X = df.drop(columns=['Price'])

# First hold out 20% as the test set, then take 30% of the remainder as the
# validation set. Both splits use the same seed.
split_seed = 4
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=split_seed
)
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train, y_train, test_size=0.3, random_state=split_seed
)

In [None]:
# Sanity check: row counts of the train / validation / test features and targets.
len(X_tr), len(X_val), len(y_tr), len(y_val), len(X_test), len(y_test)

In [None]:
# Candidate regressors, keyed by the names that `param_grid` looks up.
# The random forest is seeded (same seed as the data splits) so that repeated
# runs pick the same model and report the same scores.
models = {
    'RandomForest': RandomForestRegressor(random_state=4),
    'LinearRegression': LinearRegression(),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'SVR': SVR()}

In [None]:
# Standardize the features: the scaler is fitted on the training rows only and
# that same fitted transform is then applied to the validation and test rows.
scaler = StandardScaler()
X_tr_scaled = scaler.fit_transform(X_tr)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Replace any NaNs left in the scaled matrices with 0.0.
X_tr_scaled = np.nan_to_num(X_tr_scaled, nan=0.0)
X_val_scaled = np.nan_to_num(X_val_scaled, nan=0.0)
X_test_scaled = np.nan_to_num(X_test_scaled, nan=0.0)

# Model log(Price). The log targets are rebuilt from the untransformed `y` using
# each split's row labels, so re-running this cell does not apply the log a
# second time to already transformed targets.
assert y.index.is_unique, "row labels must be unique to look targets up by label"
y_tr = np.log(y.loc[X_tr.index])
y_val = np.log(y.loc[X_val.index])
y_test = np.log(y.loc[X_test.index])

# Single-step pipeline; the grid below swaps the estimator in and out.
# Scaling is not part of the pipeline because it was done above.
pipeline = Pipeline([
    ('model', None)  # Placeholder for the model
])

# One grid entry per candidate model, with that model's hyper-parameters.
param_grid = [
    {'model': [models['RandomForest']], 'model__n_estimators': [50, 100, 200]},
    {'model': [models['LinearRegression']]},
    {'model': [models['Ridge']], 'model__alpha': [0.1, 1.0, 10.0]},
    {'model': [models['Lasso']], 'model__alpha': [0.1, 1.0, 10.0]},
    {'model': [models['SVR']], 'model__C': [0.1, 1.0, 10.0], 'model__kernel': ['linear', 'rbf']}
]

# 5-fold cross-validated search, scored by negative MSE on the log targets.
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1, verbose=2)
grid_search.fit(X_tr_scaled, y_tr)

# best_score_ is a negative MSE; flip the sign so the printed value is the MSE.
print("Best Model:", grid_search.best_estimator_)
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", -grid_search.best_score_)

In [None]:
sns.set_context("talk", font_scale=0.8)

# Estimator selected by the grid search (the pipeline's only step).
best_model = grid_search.best_estimator_.named_steps['model']

# Models with fitted weights expose them as `coef_`.
if hasattr(best_model, 'coef_'):
    feature_names = X.columns
    # A linear-kernel SVR stores coef_ with shape (1, n_features); flatten it so
    # every candidate model yields one weight per feature for the DataFrame.
    betas = np.ravel(best_model.coef_)

    # One row per feature, largest coefficient first.
    beta_df = pd.DataFrame({'Feature': feature_names, 'Beta': betas})
    beta_df = beta_df.sort_values(by='Beta', ascending=False)

    print(beta_df)

    plt.figure(figsize=(10, 6))
    sns.barplot(x='Beta', y='Feature', data=beta_df)
    plt.title('Feature Importances (Beta Coefficients)')
    plt.show()
else:
    print("The best model does not have beta coefficients.")

# Tree-based models such as RandomForest expose `feature_importances_` instead.
if hasattr(best_model, 'feature_importances_'):
    importances = best_model.feature_importances_

    # One row per feature, most important first.
    importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': importances})
    importance_df = importance_df.sort_values(by='Importance', ascending=False)

    # A bare expression inside an `if` block is not rendered by the notebook,
    # so the table is printed explicitly (as the coefficient table is above).
    print(importance_df)

    plt.figure(figsize=(10, 6))
    sns.barplot(x='Importance', y='Feature', data=importance_df)
    plt.title('Feature Importances')
    plt.show()
else:
    print("The best model does not have feature importances.")

In [None]:
# Baseline: ordinary least squares on the standardized training features.
# LinearRegression, mean_squared_error and r2_score are already imported in the
# notebook's first cell, so they are not imported again here.
linear_model = LinearRegression()
linear_model.fit(X_tr_scaled, y_tr)

# Predict on the validation set
y_val_pred = linear_model.predict(X_val_scaled)

# The targets are log-transformed before this cell, so RMSE and R2 describe the
# fit on the log(Price) scale, not in price units.
rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
r2 = r2_score(y_val, y_val_pred)

print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R2 Score: {r2}")

In [None]:
# Plot actual vs predicted values for the linear regression model
plt.figure(figsize=(10, 6))
plt.scatter(y_val, y_val_pred, alpha=0.6, color='blue', label='Predicted vs Actual')

# Diagonal y = x: a perfect model would put every point on this line.
# The dash style is passed by keyword only. Combining a fmt string that names a
# colour ('k--') with color='red' makes matplotlib warn that the colour is
# redundantly defined.
ideal_range = [y_val.min(), y_val.max()]
plt.plot(ideal_range, ideal_range, linestyle='--', lw=2, color='red', label='Ideal Fit')

plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.title('Linear Regression: Actual vs Predicted Values')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Feature names and the fitted linear-regression coefficients, in matching order.
feature_names = X.columns
betas = linear_model.coef_

# Show floats with six fixed decimals instead of scientific notation.
pd.set_option('display.float_format', '{:.6f}'.format)

# One row per feature, largest coefficient first.
beta_df = (
    pd.DataFrame({'Feature': feature_names, 'Beta': betas})
    .sort_values(by='Beta', ascending=False)
)
beta_df



In [None]:
# Slightly smaller "talk" fonts so the feature labels stay legible.
sns.set_context("talk", font_scale=0.8)

# Horizontal bar per feature; the tall canvas keeps the rows from crowding.
fig, ax = plt.subplots(figsize=(10, 12))
sns.barplot(x='Beta', y='Feature', data=beta_df, ax=ax)
ax.set_title('Linear Regression: Feature Importances (Beta Coefficients)')
ax.set_xlabel('Beta Coefficient')
ax.set_ylabel('Feature')
ax.grid(True)
fig.tight_layout()  # keep labels from overlapping
plt.show()

In [None]:
# Plot the 20 most important features of the grid-search winner.
# The importances are read from `best_model` here and guarded with hasattr, so the
# cell also runs when the winning model has no `feature_importances_`.
if hasattr(best_model, 'feature_importances_'):
    top_importances = (
        pd.DataFrame({'Feature': X.columns, 'Importance': best_model.feature_importances_})
        .sort_values(by='Importance', ascending=False)
        .head(20)
    )

    plt.figure(figsize=(12, 8))
    sns.barplot(x='Importance', y='Feature', data=top_importances)
    plt.title('Top 20 Feature Importances')
    plt.xlabel('Importance')
    plt.ylabel('Feature')
    plt.tight_layout()
    plt.show()
else:
    print("The best model does not have feature importances.")

In [None]:
# Compare the grid-search winner with the plain linear regression on the
# validation set. `best_model` is whichever estimator the search selected, so the
# printed label uses its actual class name instead of assuming a random forest.
# The `_rf` variable names are kept for backward compatibility.
best_model_name = type(best_model).__name__

# Validation metrics of the grid-search winner
y_val_pred_rf = best_model.predict(X_val_scaled)
rmse_rf = np.sqrt(mean_squared_error(y_val, y_val_pred_rf))
r2_rf = r2_score(y_val, y_val_pred_rf)

# Validation metrics of the linear regression, recomputed from its predictions
rmse_lr = np.sqrt(mean_squared_error(y_val, y_val_pred))
r2_lr = r2_score(y_val, y_val_pred)

# Differences (linear regression minus grid-search winner)
rmse_diff = rmse_lr - rmse_rf
r2_diff = r2_lr - r2_rf

print(f"RMSE Difference (LinearRegression - {best_model_name}): {rmse_diff}")
print(f"R² Difference (LinearRegression - {best_model_name}): {r2_diff}")

In [None]:
# Write the fitted linear model out as an equation.
# The model is a LinearRegression trained on log-transformed prices and on
# standardized features, so the left-hand side is log(Price), not a logit.
# Names and coefficients are read straight from `X` and `linear_model` so the
# equation always describes this model.
intercept = linear_model.intercept_  # Intercept of the model
terms = "".join(
    f" + ({beta:.4f} * {feature})"
    for feature, beta in zip(X.columns, linear_model.coef_)
)
formula = f"Linear Regression Formula: log(Price) = {intercept:.4f}" + terms

print(formula)

In [None]:
# Create a DataFrame for the beta coefficients
beta_df = pd.DataFrame({
    'Feature': feature_names,
    'Beta': betas
})

# Order the rows by coefficient magnitude, strongest effect first, regardless of sign.
# Sorting the signed values would put strong negative effects at the bottom.
magnitude_order = beta_df['Beta'].abs().sort_values(ascending=False).index
beta_df = beta_df.reindex(magnitude_order)

# Display the DataFrame
beta_df

In [None]:
# Final check of the linear model on the held-out test set.
# The targets are log-transformed, so the metrics are on the log(Price) scale.
y_test_pred = linear_model.predict(X_test_scaled)
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))
r2_test = r2_score(y_test, y_test_pred)

# Validation-minus-test gaps. These use `rmse` and `r2`, the validation metrics
# computed when the linear model was fitted, for both the table and the
# differences, so the cell does not depend on the model-comparison cell.
rmse_diff = rmse - rmse_test
r2_diff = r2 - r2_test

# Summary table: one row per split plus the gap between them
metrics_df = pd.DataFrame({
    'Metric': ['Validation', 'Test', 'Difference'],
    'R2': [r2, r2_test, r2_diff],
    'RMSE': [rmse, rmse_test, rmse_diff]
})

# Display the DataFrame
metrics_df
