# In [None]:
# Import necessary libraries
import pandas as pd  # For data handling
import numpy as np  # For numerical calculations
import matplotlib.pyplot as plt  # For visualization
from sklearn.svm import SVR  # For Support Vector Regression
from sklearn.model_selection import train_test_split  # For splitting data
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error  # For evaluation metrics
from sklearn.preprocessing import StandardScaler  # For feature scaling

# Load the dataset
# For this example, we'll assume the dataset is stored in a CSV file.
# Replace the file name below with the path to your actual dataset
# df = pd.read_csv("house_prices.csv")  # Load dataset
# Example dataset for illustration (assuming the dataset has 'features' and 'target')
# df = pd.read_csv('path_to_your_dataset.csv')

# 1. Define features and target variable
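# Assume columns: 'location', 'square_footage', 'bedrooms', 'price'
# NOTE: 'location' is presumably categorical; if so it must be encoded (e.g. one-hot) before scaling/SVR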
# X contains the independent features, and y contains the dependent target (price)
# X = df[['location', 'square_footage', 'bedrooms']]  # Features
# y = df['price']  # Target variable

# For the sake of the demonstration, we'll use a mock dataset.
# A seeded generator keeps the mock data identical on every run, so the
# metrics computed from it are reproducible.
rng = np.random.default_rng(42)
X = rng.random((100, 3))  # Mock feature matrix (100 samples, 3 features), values in [0, 1)
y = 50000 + 10000 * X[:, 0] + 5000 * X[:, 1] + 2000 * X[:, 2]  # Target variable (price), noise-free linear in X

# 2. Split the dataset into training and testing sets (70% train / 30% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# 3. Preprocess the data: scale features AND target (important for SVM models)
# The feature scaler is fitted on the training split only, then reused on the
# test split, so no test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)  # Fit and transform training data
X_test_scaled = scaler.transform(X_test)  # Transform test data (to avoid data leakage)

# SVR's default C=1.0 and epsilon=0.1 are expressed on the scale of the target.
# With prices in the tens of thousands the regularisation bound keeps the
# coefficients far too small and the model underfits badly, so the target is
# standardised for training and predictions are mapped back afterwards.
y_scaler = StandardScaler()
y_train_scaled = y_scaler.fit_transform(np.asarray(y_train).reshape(-1, 1)).ravel()

# 4. Build an SVM regression model
# Using a linear kernel for simplicity, but can be adjusted for other kernels (e.g., RBF, polynomial)
svm_regressor = SVR(kernel='linear')
svm_regressor.fit(X_train_scaled, y_train_scaled)  # Train on the standardised target

# 5. Predictions using the trained model
# The model predicts in standardised units; invert the target scaling so that
# y_pred is in the same units as y_test (price) for the metrics below.
y_pred = y_scaler.inverse_transform(
    svm_regressor.predict(X_test_scaled).reshape(-1, 1)
).ravel()

# 6. Evaluate model performance using multiple regression metrics
# All metrics compare the held-out targets (y_test) with the predictions (y_pred).
# They are computed together here and reported below, question by question.
mse = mean_squared_error(y_test, y_pred)  # mean of squared errors (squared target units)
r2 = r2_score(y_test, y_pred)  # coefficient of determination
mae = mean_absolute_error(y_test, y_pred)  # mean of absolute errors (target units)
rmse = np.sqrt(mse)  # square root of MSE (target units)

# **Q1: Best regression metric for predicting house price**
# Typical choices for house price prediction are **Mean Squared Error (MSE)** and **R-squared (R²)**,
# so both are reported.
print(f"Mean Squared Error (MSE): {mse}")
print(f"R-squared (R²): {r2}")

# **Q2: Decision between MSE and R-squared for predicting actual price**
# When the goal is to predict the actual price as accurately as possible, **MSE** is the better choice:
# it directly measures how far predictions fall from the true values and penalizes large errors heavily,
# whereas R² only reports the share of the target's variance that the model explains.

# **Q3: Scenario with significant outliers**
# With significant outliers, **Mean Absolute Error (MAE)** is usually preferable to MSE, because
# errors are not squared and a few large deviations therefore dominate the score much less.
print(f"Mean Absolute Error (MAE): {mae}")

# **Q4: Choosing between MSE and RMSE when values are close**
# RMSE is simply the square root of MSE, so both always rank models in the same order.
# MSE is expressed in squared units of the target, while RMSE is in the target's own units
# (house price), which makes **RMSE** the easier one to interpret and report.
print(f"Root Mean Squared Error (RMSE): {rmse}")

# **Q5: Evaluate the model with different kernels (Linear, Polynomial, RBF)**
# To measure how well the model explains the variance in the target variable, we would primarily use **R-squared (R²)**.
# The R² value explains the proportion of variance in the dependent variable that is predictable from the independent variables.

# SVR's default C and epsilon are expressed on the scale of the target, so
# fitting on raw prices makes every kernel underfit. Each kernel is therefore
# trained on a standardised copy of y_train (scaler fitted on the training
# split only) and its predictions are mapped back to price units before scoring.
kernel_target_scaler = StandardScaler()
y_train_std = kernel_target_scaler.fit_transform(np.asarray(y_train).reshape(-1, 1)).ravel()

# Experimenting with other kernels (e.g., Polynomial, RBF) and evaluating with R²
svm_poly = SVR(kernel='poly', degree=3)
svm_poly.fit(X_train_scaled, y_train_std)
y_pred_poly = kernel_target_scaler.inverse_transform(
    svm_poly.predict(X_test_scaled).reshape(-1, 1)
).ravel()
r2_poly = r2_score(y_test, y_pred_poly)
print(f"R-squared (R²) with Polynomial Kernel: {r2_poly}")

svm_rbf = SVR(kernel='rbf')
svm_rbf.fit(X_train_scaled, y_train_std)
y_pred_rbf = kernel_target_scaler.inverse_transform(
    svm_rbf.predict(X_test_scaled).reshape(-1, 1)
).ravel()
r2_rbf = r2_score(y_test, y_pred_rbf)
print(f"R-squared (R²) with RBF Kernel: {r2_rbf}")

# Conclusion:
# - For predicting house prices, MSE is an informative measure of prediction error, while R² is widely used to summarise overall fit.
# - For datasets with outliers, MAE is often preferred because it is less sensitive to large individual errors.
# - RMSE is preferable to MSE when the error should be reported in the same units as the target.
# - R-squared is the metric to use for the proportion of explained variance, e.g. when comparing models with different kernels.
