In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_olivetti_faces, load_breast_cancer, load_iris, fetch_california_housing
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import warnings
warnings.filterwarnings("ignore")  # Suppress warnings for cleaner output

# Function to perform linear regression and plot actual vs predicted
def run_linear_regression(X, y, dataset_name):
    """Fit a linear regression and plot actual vs. predicted test targets.

    The data is split 80/20 with a fixed ``random_state`` of 42, a
    ``LinearRegression`` model is fitted on the training part, and the
    mean squared error and R² score on the held-out part are printed.
    A scatter plot of actual vs. predicted values (with a ``y = x``
    reference line) is saved to ``<name>_plot.png``, where ``<name>`` is
    ``dataset_name`` lower-cased with spaces replaced by underscores, and
    then shown.

    Args:
        X: Feature matrix handed to ``train_test_split``.
        y: Target values, one per row of ``X``.
        dataset_name: Label used in the printed report, the plot texts and
            the output file name.

    Returns:
        A tuple ``(y_test, y_pred)`` with the held-out targets and the
        model's predictions for them.
    """
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Initialize and train model
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Predict and evaluate
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"\n{dataset_name}:")
    print(f"Mean Squared Error: {mse:.4f}")
    print(f"R² Score: {r2:.4f}")

    # Bounds of the y = x reference line. NumPy reductions are used instead
    # of the builtin min()/max(), which would iterate the arrays element by
    # element in Python and fail on anything that is not one-dimensional.
    min_val = min(np.min(y_test), np.min(y_pred))
    max_val = max(np.max(y_test), np.max(y_pred))

    # Plot actual vs predicted
    plt.figure(figsize=(8, 6))
    plt.scatter(y_test, y_pred, color='blue', alpha=0.6, label='Actual vs Predicted')
    # Plot reference line (y=x)
    plt.plot([min_val, max_val], [min_val, max_val], color='red', linestyle='--', label='Perfect Fit (y=x)')
    plt.xlabel(f'Actual {dataset_name}')
    plt.ylabel(f'Predicted {dataset_name}')
    plt.title(f'{dataset_name}: Actual vs Predicted (R² = {r2:.4f})')
    plt.legend()
    plt.grid(True)
    # Save before showing: the figure must still be current when it is written
    plt.savefig(f'{dataset_name.replace(" ", "_").lower()}_plot.png')
    plt.show()

    return y_test, y_pred

# Collects the (y_test, y_pred) pair returned for each dataset, keyed by name
results = dict()

# 1. Olivetti Faces Dataset
# The dataset's `target` array is cast to float and used as the regression target.
faces = fetch_olivetti_faces()
X_faces, y_faces = faces.data, faces.target.astype(float)
results['Olivetti Faces'] = run_linear_regression(
    X_faces, y_faces, "Olivetti Faces"
)

# 2. Breast Cancer Dataset
# The regression target here is the row-wise sum of all feature columns,
# i.e. an exact linear combination of the inputs; the dataset's own
# `target` labels are not used.
cancer = load_breast_cancer()
X_cancer = cancer.data
y_cancer = np.sum(X_cancer, axis=1)
results['Breast Cancer'] = run_linear_regression(
    X_cancer, y_cancer, "Breast Cancer"
)

# 3. Boston Housing Dataset
# The raw file is read as whitespace-separated values after skipping the
# first 22 lines. Every even-indexed row is joined with the first two values
# of the odd-indexed row that follows it to form one feature row; the third
# value of each odd-indexed row is the target.
data_url = "http://lib.stat.cmu.edu/datasets/boston"
# Raw string: "\s" in a normal string literal is an invalid escape sequence.
raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)
X_boston = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
y_boston = raw_df.values[1::2, 2]
results['Boston Housing'] = run_linear_regression(X_boston, y_boston, "Boston Housing")

# 4. Iris Dataset
# The first feature column is the regression target, predicted from the
# remaining columns. The target column is removed from X: leaving it in
# lets the model copy it straight through and report a meaningless
# perfect fit.
iris = load_iris()
y_iris = iris.data[:, 0]
X_iris = iris.data[:, 1:]
results['Iris'] = run_linear_regression(X_iris, y_iris, "Iris")

# 5. California Housing Dataset
# Uses the dataset's own feature matrix and target as provided by the loader.
california = fetch_california_housing()
X_california, y_california = california.data, california.target
results['California Housing'] = run_linear_regression(
    X_california, y_california, "California Housing"
)

# 6. Synthetic Dataset
# 100 samples of 3 uniform features; the target is 2*x0 + 1.5*x1 - x2 plus
# Gaussian noise with standard deviation 0.1. The global NumPy RNG is seeded
# so the draws (features first, then noise) repeat on every run.
np.random.seed(42)
X_synthetic = np.random.rand(100, 3)
synthetic_noise = np.random.normal(0, 0.1, 100)
y_synthetic = (
    2 * X_synthetic[:, 0] + 1.5 * X_synthetic[:, 1] - X_synthetic[:, 2] + synthetic_noise
)
results['Synthetic Dataset'] = run_linear_regression(
    X_synthetic, y_synthetic, "Synthetic Dataset"
)

The history saving thread hit an unexpected error (OperationalError('database or disk is full')).History will not be written to the database.
downloading Olivetti faces from https://ndownloader.figshare.com/files/5976027 to C:\Users\Ajit\scikit_learn_data


OSError: [Errno 28] No space left on device