In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

# Load the dataset
# The file is read as whitespace-separated text and the column labels are
# supplied explicitly through `names=`.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data"
column_names = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model_year', 'origin', 'car_name']
# `delim_whitespace=True` is deprecated (pandas >= 2.2 emits a FutureWarning);
# `sep=r'\s+'` is the documented equivalent and also works on older versions.
df = pd.read_csv(url, sep=r'\s+', names=column_names)

# Clean the frame in a single chained expression:
#   1. remove the `car_name` column,
#   2. convert `horsepower` to a numeric dtype (unparseable entries become NaN),
#   3. discard every row that contains a missing value.
df = (
    df.drop(columns='car_name')
    .assign(horsepower=lambda frame: pd.to_numeric(frame['horsepower'], errors='coerce'))
    .dropna()
)

# The target is the `mpg` column; every remaining column is used as a feature
y = df['mpg']
X = df.drop(columns='mpg')

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the scaler on the training rows only, then apply that same fitted
# transformation to both the training and the test features
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Configure the Gradient Boosting Regressor and fit it on the scaled training
# data in one expression (`fit` returns the estimator itself)
model_gb = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.1,
    random_state=42,
).fit(X_train_scaled, y_train)

# Predict the target for the held-out (scaled) test features
y_pred_gb = model_gb.predict(X_test_scaled)

# Score the predictions with mean squared error and report the value
mse_gb = mean_squared_error(y_true=y_test, y_pred=y_pred_gb)
print(f'Mean Squared Error (Gradient Boosting): {mse_gb}')

# Example prediction
# The input values are keyed by feature name and assembled into a one-row
# DataFrame laid out in the training column order (`X.columns`). Passing a bare
# NumPy array to a scaler that was fitted on a DataFrame makes scikit-learn warn
# that "X does not have valid feature names", and it ties each value to a
# feature purely by position, which breaks silently if the columns change.
example_features_gb = {
    'cylinders': 6,
    'displacement': 225,
    'horsepower': 100,
    'weight': 3233,
    'acceleration': 15.4,
    'model_year': 76,
    'origin': 1,
}
example_input_gb = pd.DataFrame([example_features_gb], columns=X.columns)  # Example input data for prediction
example_input_gb_scaled = scaler.transform(example_input_gb)
predicted_mpg_gb = model_gb.predict(example_input_gb_scaled)
print(f'Predicted MPG for the example input (Gradient Boosting): {predicted_mpg_gb[0]}')

# Scatter plot of true test-set values (x) against the model's predictions (y),
# drawn through the explicit figure/axes interface
fig_scatter, ax_scatter = plt.subplots(figsize=(10, 6))
ax_scatter.scatter(y_test, y_pred_gb, alpha=0.6)
ax_scatter.set_title('True vs. Predicted MPG (Gradient Boosting)')
ax_scatter.set_xlabel('True MPG')
ax_scatter.set_ylabel('Predicted MPG')
plt.show()

# Horizontal bar chart of the fitted model's per-feature importances, labelled
# with the feature column names and drawn on an explicit axes object
feature_importance = model_gb.feature_importances_
feature_names = X.columns
fig_importance, ax_importance = plt.subplots(figsize=(10, 6))
sns.barplot(x=feature_importance, y=feature_names, ax=ax_importance)
ax_importance.set_title('Feature Importance (Gradient Boosting)')
ax_importance.set_xlabel('Importance')
ax_importance.set_ylabel('Features')
plt.show()

Mean Squared Error (Gradient Boosting): 6.232502768358345
Predicted MPG for the example input (Gradient Boosting): 19.23407630490937


