# EE214 — Assignment 1 (Part 2)

Comparative Study of Polynomial vs Gaussian Basis Functions on the Concrete Compressive Strength Dataset

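_This notebook is designed to run end-to-end. If the Concrete dataset cannot be found locally or downloaded, it falls back to a synthetic dataset of the same shape (1030 samples, 8 features, 1 target) so that every cell still executes; results obtained from the synthetic data are not meaningful for the real dataset._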

In [None]:

# Setup: imports and reproducibility
import os
import sys
import urllib.request
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
# Silence library warnings so they do not clutter the rendered output.
# NOTE(review): this is a blanket filter, so deprecation and numerical
# warnings are hidden as well.
warnings.filterwarnings('ignore')

# Seed NumPy's global RNG so any random draws made later are reproducible.
RANDOM_SEED = 0
np.random.seed(RANDOM_SEED)

# Record the runtime environment. `sys.version` is read from `sys` directly
# rather than through `os.sys`, which only works because `os` happens to
# import `sys` internally.
print("Python version:", sys.version)
print("Working directory:", os.getcwd())


In [None]:

# Attempt to load the dataset from local files or download it.
data = None  # set by try_load() when a source could be read
local_xls = 'Concrete_Data.xls'
local_csv = 'Concrete_Data.csv'
uci_xls_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/concrete/compressive/Concrete_Data.xls'
uci_csv_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/concrete/compressive/Concrete_Data.csv'


def _read_csv_any_header(path):
    """Read a CSV file whether or not its first line is a header row.

    The file is first read with ``header=None``. If no value in the first row
    can be parsed as a number, that row is taken to be a header and the file
    is re-read with ``header=0`` so the column dtypes are inferred from the
    data rows only. Headerless files are returned exactly as
    ``pd.read_csv(path, header=None)`` would return them.
    """
    frame = pd.read_csv(path, header=None)
    first_row = pd.to_numeric(frame.iloc[0], errors='coerce')
    if first_row.isna().all():
        frame = pd.read_csv(path, header=0)
    return frame


def try_load():
    """Try each data source in turn and store the first success in ``data``.

    Order of attempts: local xls, local csv, xls download, csv download.
    Every failure is printed and the next source is tried, so this function
    does not raise for read or download errors.

    Returns
    -------
    bool
        True if the module-level ``data`` was set from one of the sources,
        False if every source failed.
    """
    global data
    # Try local xls
    if os.path.exists(local_xls):
        print("Loading local file:", local_xls)
        try:
            data = pd.read_excel(local_xls)
            return True
        except Exception as e:
            print("Failed to read local xls:", e)
    # Try local csv
    if os.path.exists(local_csv):
        print("Loading local file:", local_csv)
        try:
            data = _read_csv_any_header(local_csv)
            return True
        except Exception as e:
            print("Failed to read local csv:", e)
    # Try downloading (may fail in offline environments)
    try:
        print("Trying to download from UCI (xls)...")
        urllib.request.urlretrieve(uci_xls_url, local_xls)
        data = pd.read_excel(local_xls)
        print("Downloaded and loaded", local_xls)
        return True
    except Exception as e:
        print("Could not download xls:", e)
    try:
        print("Trying to download from UCI (csv)...")
        urllib.request.urlretrieve(uci_csv_url, local_csv)
        data = _read_csv_any_header(local_csv)
        print("Downloaded and loaded", local_csv)
        return True
    except Exception as e:
        print("Could not download csv:", e)
    return False

# Concise column names used for every load path: 8 features followed by the
# target. They are assigned by position.
# NOTE(review): this assumes the loaded file lists its columns in this order — confirm.
col_names = ['Cement','BlastFurnaceSlag','FlyAsh','Water','Superplasticizer','CoarseAggregate','FineAggregate','Age','Concrete_compressive_strength']

loaded = try_load()
if not loaded:
    print("\nWARNING: Concrete dataset not found or download failed. Creating a synthetic dataset with same shape so notebook runs end-to-end.\n")
    # According to UCI: dataset has 8 features + 1 target, 1030 samples.
    n_samples = 1030
    n_features = 8
    X_synth = np.random.rand(n_samples, n_features) * 10
    # Create a synthetic target with some nonlinearity
    y_synth = (3.5*X_synth[:,0] - 1.2*X_synth[:,1]**2 + 2.2*np.sin(X_synth[:,2]) + np.random.randn(n_samples)*2.5)
    data = pd.DataFrame(np.column_stack((X_synth, y_synth)), columns=col_names)
elif isinstance(data, pd.DataFrame) and data.shape[1] == len(col_names):
    # Rename positionally whether or not the file carried a header, so the
    # column names do not depend on the exact spelling or whitespace of the
    # original header and are identical across all load paths.
    data.columns = col_names
else:
    # Best effort: keep going, but make the unexpected layout visible.
    print(f"WARNING: expected {len(col_names)} columns but got shape {getattr(data, 'shape', None)}; column names left unchanged.")
print("Data shape:", data.shape)
data.head()


In [None]:

# Preprocess: features, target, split, standardize
feature_cols = data.columns[:-1].tolist()   # every column except the last
target_col = data.columns[-1]               # last column is the target
X = data[feature_cols].values.astype(float)
y = data[target_col].values.astype(float)

# Train-test split (80/20) on the raw features. Splitting happens BEFORE
# scaling so that test rows cannot influence the scaler's mean and variance.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_SEED)

# Standardize features: statistics are learned from the training rows only
# and then applied unchanged to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full matrix scaled with the training statistics; kept so the name
# `X_scaled` remains defined after this cell.
X_scaled = scaler.transform(X)
print("Train shape:", X_train.shape, "Test shape:", X_test.shape)


In [None]:

# Polynomial basis sweep: for each degree, expand the features, fit a linear
# model on the expanded training set, and record train/test MSE.
poly_degrees = [1, 3, 5, 7]
poly_train_errors, poly_test_errors = [], []

for deg in poly_degrees:
    # The expansion is fitted on the training split and reused for the test split.
    poly = PolynomialFeatures(degree=deg, include_bias=True)
    X_train_poly, X_test_poly = poly.fit_transform(X_train), poly.transform(X_test)

    model = LinearRegression().fit(X_train_poly, y_train)
    y_train_pred, y_test_pred = model.predict(X_train_poly), model.predict(X_test_poly)

    train_mse = mean_squared_error(y_train, y_train_pred)
    test_mse = mean_squared_error(y_test, y_test_pred)
    poly_train_errors += [train_mse]
    poly_test_errors += [test_mse]
    print(f"Degree {deg}: train MSE={train_mse:.4f}, test MSE={test_mse:.4f}")


In [None]:

# Gaussian basis regression (use only the first feature as in skeleton)
def create_gaussian_design_matrix(x, n_bases, sigma=1.0):
    """Build a design matrix of a bias column plus Gaussian basis functions.

    Column 0 is all ones. Column ``j`` (for ``1 <= j <= n_bases``) holds
    ``exp(-0.5 * ((x - mu_j) / sigma) ** 2)`` with fixed centers ``mu_j = j``.

    Parameters
    ----------
    x : array-like of shape (n_samples,)
        Scalar input per sample. Lists and tuples are accepted as well as
        NumPy arrays; values are converted to float.
    n_bases : int
        Number of Gaussian basis functions (centers 1, 2, ..., n_bases).
    sigma : float, default 1.0
        Width shared by all basis functions.

    Returns
    -------
    numpy.ndarray of shape (n_samples, n_bases + 1)
        The bias column followed by one column per basis function.
    """
    x = np.asarray(x, dtype=float)
    centers = np.arange(1, n_bases + 1, dtype=float)  # mu_j = j
    X_design = np.ones((x.shape[0], n_bases + 1))     # bias + n_bases
    # Broadcast (n_samples, 1) against (n_bases,) to fill all basis columns at once.
    X_design[:, 1:] = np.exp(-0.5 * ((x[:, None] - centers) / sigma) ** 2)
    return X_design

# The Gaussian bases take a scalar input: use column 0 of the (standardized)
# train and test feature matrices.
x_train_feature, x_test_feature = X_train[:, 0], X_test[:, 0]

gaussian_ns = [5, 7, 10, 15]
gauss_train_errors, gauss_test_errors = [], []

for n in gaussian_ns:
    # Same basis construction for both splits, then an ordinary linear fit.
    X_train_gauss = create_gaussian_design_matrix(x_train_feature, n, sigma=1.0)
    X_test_gauss = create_gaussian_design_matrix(x_test_feature, n, sigma=1.0)

    model = LinearRegression().fit(X_train_gauss, y_train)
    y_train_pred, y_test_pred = model.predict(X_train_gauss), model.predict(X_test_gauss)

    train_mse = mean_squared_error(y_train, y_train_pred)
    test_mse = mean_squared_error(y_test, y_test_pred)
    gauss_train_errors += [train_mse]
    gauss_test_errors += [test_mse]
    print(f"Gaussian bases {n}: train MSE={train_mse:.4f}, test MSE={test_mse:.4f}")


In [None]:

# Plot comparison
def _show_mse_comparison(poly_errors, gauss_errors, poly_fmt, gauss_fmt, split):
    """Draw and show one polynomial-vs-Gaussian MSE figure.

    `split` is the label ('Test' or 'Train') inserted into the legend, the
    y-axis label and the title; `poly_fmt` / `gauss_fmt` are matplotlib
    format strings for the two curves.
    """
    _, ax = plt.subplots(figsize=(10, 6))
    ax.plot(poly_degrees, poly_errors, poly_fmt, label=f'Polynomial {split} MSE')
    ax.plot(gaussian_ns, gauss_errors, gauss_fmt, label=f'Gaussian {split} MSE')
    ax.set_xlabel('Model Complexity (Degree or # Gaussian Bases)')
    ax.set_ylabel(f'{split} Mean Squared Error')
    ax.set_title(f'Polynomial vs Gaussian Basis: {split} MSE Comparison')
    ax.legend()
    ax.grid(True)
    plt.show()


_show_mse_comparison(poly_test_errors, gauss_test_errors, 'ro-', 'bo-', 'Test')

# Also show training curves
_show_mse_comparison(poly_train_errors, gauss_train_errors, 'r--o', 'b--o', 'Train')


In [None]:

# Save numeric results (CSV) and summary figures (PNG) for easy inspection.

# Preferred output location. If it cannot be created or written to, fall back
# to an `outputs` folder under the working directory instead of raising.
OUTPUT_DIR = '/mnt/data'
try:
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    if not os.access(OUTPUT_DIR, os.W_OK):
        raise PermissionError(f"{OUTPUT_DIR} is not writable")
except OSError as e:
    unavailable_dir = OUTPUT_DIR
    OUTPUT_DIR = os.path.join(os.getcwd(), 'outputs')
    print(f"Cannot write to {unavailable_dir} ({e}); saving to {OUTPUT_DIR} instead.")
    os.makedirs(OUTPUT_DIR, exist_ok=True)

results = pd.DataFrame({
    'poly_degree': poly_degrees,
    'poly_train_mse': poly_train_errors,
    'poly_test_mse': poly_test_errors
})
results_gauss = pd.DataFrame({
    'gauss_n': gaussian_ns,
    'gauss_train_mse': gauss_train_errors,
    'gauss_test_mse': gauss_test_errors
})
poly_csv_path = os.path.join(OUTPUT_DIR, 'poly_results.csv')
gauss_csv_path = os.path.join(OUTPUT_DIR, 'gauss_results.csv')
results.to_csv(poly_csv_path, index=False)
results_gauss.to_csv(gauss_csv_path, index=False)
print(f'Saved results to {poly_csv_path} and {gauss_csv_path}')


def _save_mse_figure(path, poly_errors, gauss_errors, poly_fmt, gauss_fmt, split):
    """Render one polynomial-vs-Gaussian MSE figure and write it to `path`.

    `split` is the label ('Test' or 'Train') used in the legend, y-axis label
    and title. The figure is closed even if saving fails, so it neither stays
    open nor gets displayed inline.
    """
    fig = plt.figure(figsize=(10, 6))
    try:
        plt.plot(poly_degrees, poly_errors, poly_fmt, label=f'Polynomial {split} MSE')
        plt.plot(gaussian_ns, gauss_errors, gauss_fmt, label=f'Gaussian {split} MSE')
        plt.xlabel('Model Complexity (Degree or # Gaussian Bases)')
        plt.ylabel(f'{split} Mean Squared Error')
        plt.title(f'Polynomial vs Gaussian Basis: {split} MSE Comparison')
        plt.legend()
        plt.grid(True)
        plt.savefig(path)
    finally:
        plt.close(fig)


# Save summary figures
figpath1 = os.path.join(OUTPUT_DIR, 'test_mse_comparison.png')
figpath2 = os.path.join(OUTPUT_DIR, 'train_mse_comparison.png')
_save_mse_figure(figpath1, poly_test_errors, gauss_test_errors, 'ro-', 'bo-', 'Test')
_save_mse_figure(figpath2, poly_train_errors, gauss_train_errors, 'r--o', 'b--o', 'Train')
print(f'Saved figures to {figpath1} and {figpath2}')


## Notes

The notebook saves results and figures into `/mnt/data/`. After running it locally (or in this environment), you can download the CSVs, PNGs, and the notebook itself.