In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from scipy.optimize import minimize
np.set_printoptions(precision=8, suppress=True)

# Ordinary Least Squares (Using Cross Validation)

In [2]:
data = pd.read_csv('C:/Users/91959/Desktop/CODE'
                '/Robust-Penalized-Empirical-Likelihood-Estimation-Method-for-Linear-Regression/Data/Alcohol.csv')

# Independent variables (features) - all columns except the first (Alcohol) and last (ln (Sol)exp)
X = data.iloc[:, 1:-1].values  # Exclude the first column (Alcohol) and the last column (ln (Sol)exp)
# Dependent variable (target) - last column (ln (Sol)exp)
y = data.iloc[:, -1].values

In [3]:
# Set up k-fold cross-validation
k = 5  # number of folds
kf = KFold(n_splits=k, shuffle=True, random_state=42)

# Lists to store results OLS
ols_cv_betas = []
ols_cv_r2_scores = []

In [4]:
# Perform cross-validation
for fold, (train_idx, test_idx) in enumerate(kf.split(X), 1):
    # Split data for this fold
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # Add bias term to training data
    X_train_bias = np.column_stack((np.ones(X_train.shape[0]), X_train))

    # Calculate beta using normal equation
    XT = X_train_bias.T
    XT_X = np.dot(XT, X_train_bias)
    XT_X_inv = np.linalg.inv(XT_X)
    XT_y = np.dot(XT, y_train)
    beta = np.dot(XT_X_inv, XT_y)

    # Add bias term to test data
    X_test_bias = np.column_stack((np.ones(X_test.shape[0]), X_test))

    # Make predictions
    y_pred = np.dot(X_test_bias, beta)

    # Calculate R-squared
    y_test_mean = np.mean(y_test)
    SS_res = np.sum((y_test - y_pred) ** 2)
    SS_tot = np.sum((y_test - y_test_mean) ** 2)
    r2 = 1 - (SS_res / SS_tot)

    # Store results
    ols_cv_betas.append(beta)
    ols_cv_r2_scores.append(r2)

In [5]:
# Calculate overall mean and standard deviation of results
ols_mean_beta = np.mean(ols_cv_betas, axis=0)
ols_std_beta = np.std(ols_cv_betas, axis=0)
ols_mean_r2 = np.mean(ols_cv_r2_scores)
ols_std_r2 = np.std(ols_cv_r2_scores)

print("\nOverall Results [OLS]:")
print(f"Mean R-squared: {ols_mean_r2:.4f} (±{ols_std_r2:.4f})")
print("\nMean Beta Coefficients:")
feature_names = ['Intercept'] + list(data.columns[1:-1])
for name, beta_mean, beta_std in zip(feature_names, ols_mean_beta, ols_std_beta):
    print(f"{name}: {beta_mean:.4f} (±{beta_std:.4f})")


Overall Results [OLS]:
Mean R-squared: 0.9275 (±0.0373)

Mean Beta Coefficients:
Intercept: 29.1271 (±25.9525)
SAG: -0.0047 (±0.0596)
V: -0.0164 (±0.0542)
Log P: -2.4590 (±0.7336)
P: 27.1147 (±26.0920)
RM: -1.0957 (±1.3066)
Mass: -3.1410 (±3.5617)


Intercept: 29.1271 (±25.9525) - This wide range suggests the OLS model is not very stable across different subsets of data.

# Maximum Likelihood Estimation (Using Cross Validation)

In [6]:
# Set up k-fold cross-validation
k = 5  # number of folds
kf2 = KFold(n_splits=k, shuffle=True, random_state=44)

# Lists to store results MLE
mle_cv_betas = []
mle_cv_r2_scores = []

In [7]:
# Perform cross-validation
for fold, (train_idx, test_idx) in enumerate(kf2.split(X), 1):
    # Split data for this fold
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # Add bias term to training data
    X_train_bias = np.column_stack((np.ones(X_train.shape[0]), X_train))

    # Get initial guess using OLS
    ols_beta = np.linalg.inv(X_train_bias.T @ X_train_bias) @ X_train_bias.T @ y_train
    initial_sigma = np.std(y_train - X_train_bias @ ols_beta)
    initial_params = np.concatenate([ols_beta, [np.log(initial_sigma)]])

    # Define the objective function (negative log likelihood)
    def objective(params):
        beta = params[:-1]
        sigma = np.exp(params[-1])
        y_pred = X_train_bias @ beta
        n = len(y_train)
        return n/2 * np.log(2 * np.pi) + n * np.log(sigma) + np.sum((y_train - y_pred)**2) / (2 * sigma**2)

    # Minimize negative log likelihood
    result = minimize(objective, initial_params, method='BFGS')

    # Extract parameters
    beta = result.x[:-1]
    sigma = np.exp(result.x[-1])

    # Add bias term to test data
    X_test_bias = np.column_stack((np.ones(X_test.shape[0]), X_test))

    # Make predictions
    y_pred = X_test_bias @ beta

    # Calculate R-squared
    y_test_mean = np.mean(y_test)
    SS_res = np.sum((y_test - y_pred) ** 2)
    SS_tot = np.sum((y_test - y_test_mean) ** 2)
    r2 = 1 - (SS_res / SS_tot)

    # Store results
    mle_cv_betas.append(beta)
    mle_cv_r2_scores.append(r2)

In [8]:
# Calculate overall mean and standard deviation of results
mle_mean_beta = np.mean(mle_cv_betas, axis=0)
mle_std_beta = np.std(mle_cv_betas, axis=0)
mle_mean_r2 = np.mean(mle_cv_r2_scores)
mle_std_r2 = np.std(mle_cv_r2_scores)

print("\nOverall Results [MLE]:")
print(f"Mean R-squared: {mle_mean_r2:.4f} (±{mle_std_r2:.4f})")
print("\nMean Beta Coefficients:")
feature_names = ['Intercept'] + list(data.columns[1:-1])
for name, beta_mean, beta_std in zip(feature_names, mle_mean_beta, mle_std_beta):
    print(f"{name}: {beta_mean:.4f} (±{beta_std:.4f})")


Overall Results [MLE]:
Mean R-squared: 0.8417 (±0.2565)

Mean Beta Coefficients:
Intercept: 29.8309 (±25.2362)
SAG: 0.0162 (±0.0248)
V: -0.0307 (±0.0209)
Log P: -2.6767 (±0.8164)
P: 29.6235 (±21.0984)
RM: -1.3026 (±1.6532)
Mass: -3.3857 (±3.2111)


---