In [2]:
# Getting Data
from sys import path
# Extend the module search path with the directory two levels up so that the
# project-local getTrainingData module can be imported below.
# NOTE(review): '../..' is a relative entry, so it is resolved against the
# kernel's current working directory — confirm the notebook is always run
# from its own folder.
path.append('../..')
from getTrainingData import getData

# getData() returns a pair that is unpacked into the training and testing sets.
# NOTE(review): presumably both are pandas DataFrames that include a
# 'product_price' column — confirm against getTrainingData.
trainingData, testingData = getData()
# Last expression of the cell: render the training set as the cell output.
trainingData



# Polynomial Regression

In [3]:
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
import numpy as np

# Features (independent variables): every column other than product_price
X_train = trainingData.drop(columns='product_price')
X_test = testingData.drop(columns='product_price')

# Target (dependent variable): the product_price column of each split
y_train = trainingData['product_price']
y_test = testingData['product_price']

# Show the test feature matrix as the cell output
X_test



## Creating and training the Model

In [4]:
from sklearn.decomposition import PCA

# Project the feature matrix onto its two leading principal components.
# The PCA is fitted on the training split and then applied unchanged to the
# test split.
pca = PCA(n_components=2)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# Only the first principal component is used as the single input of the
# polynomial fit; the full two-component projection stays available in
# X_train_pca / X_test_pca.
condensedXTrain, condensedXTest = X_train_pca[:, 0], X_test_pca[:, 0]

# Report how much of the variance each component retains, and their total
print(f"Explained variance ratio: {pca.explained_variance_ratio_}")
print(f"Total variance explained: {sum(pca.explained_variance_ratio_):.2f}")

# Least-squares fit of a degree-3 polynomial: price as a function of the
# first principal component
poly_degree = 3
preg = np.poly1d(np.polyfit(x=condensedXTrain, y=y_train, deg=poly_degree))

# Evaluate the fitted polynomial on the test split and show the predictions
y_predictions = preg(condensedXTest)
y_predictions





## Plotting Predictions

In [5]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Four-panel diagnostic figure for the polynomial model on the test split.
# The explicit figure/axes interface keeps every call tied to its own panel.
fig, ((ax_scatter, ax_fit), (ax_pca, ax_err)) = plt.subplots(2, 2, figsize=(15, 10))

# Plot 1: Scatter plot of actual vs predicted (evaluation plot).
# The dashed diagonal is y = x, where a perfect prediction would land.
price_bounds = [y_test.min(), y_test.max()]
ax_scatter.scatter(y_test, y_predictions, color='b', alpha=0.7)
ax_scatter.plot(price_bounds, price_bounds, color='k', linestyle='--')
ax_scatter.set_xlabel('Actual Price')
ax_scatter.set_ylabel('Predicted Price')
ax_scatter.set_title('Actual vs Predicted Prices')

# Plot 2: Polynomial curve fit visualization.
# The curve is evaluated on an evenly spaced grid spanning the test inputs,
# so no sorting of the test points is needed to draw it smoothly.
x_curve = np.linspace(condensedXTest.min(), condensedXTest.max(), 100)
y_curve = preg(x_curve)
ax_fit.scatter(condensedXTest, y_test, color='b', s=30, alpha=0.5, label='Actual data')
ax_fit.plot(x_curve, y_curve, color='r', linewidth=2, label=f'Polynomial fit (degree {poly_degree})')
ax_fit.set_xlabel('Condensed Feature (PCA)')
ax_fit.set_ylabel('Price')
ax_fit.set_title('Polynomial Regression Fit')
ax_fit.legend()

# Plot 3: Visualization in 2D PCA space with color representing price
sc = ax_pca.scatter(X_test_pca[:, 0], X_test_pca[:, 1], c=y_test, cmap='viridis', alpha=0.7)
fig.colorbar(sc, ax=ax_pca, label='Price')
ax_pca.set_xlabel('PCA Component 1')
ax_pca.set_ylabel('PCA Component 2')
ax_pca.set_title('Data Distribution in PCA Space')

# Plot 4: Prediction errors (actual minus predicted); the dashed line marks zero error
errors = y_test - y_predictions
ax_err.hist(errors, bins=20, color='skyblue', edgecolor='black')
ax_err.axvline(x=0, color='r', linestyle='--')
ax_err.set_xlabel('Prediction Error')
ax_err.set_ylabel('Frequency')
ax_err.set_title('Distribution of Prediction Errors')

fig.tight_layout()
plt.show()

# Print the R² score to evaluate model accuracy
r2 = r2_score(y_test, y_predictions)
print(f"R² Score: {r2:.4f}")

# MAE and RMSE as additional evaluation metrics, in the units of the target
mae = mean_absolute_error(y_test, y_predictions)
rmse = np.sqrt(mean_squared_error(y_test, y_predictions))
print(f"Mean Absolute Error: {mae:.2f}")
print(f"Root Mean Squared Error: {rmse:.2f}")





# Conclusion
Polynomial regression performs even worse for our application.