In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Read the benchmark table from disk.
# NOTE(review): the column labels in this script's recorded output are numeric
# strings (e.g. '484', '1480'), so the CSV presumably has no header row and its
# first data row is being consumed as the header -- confirm, and if so consider
# passing header=None to read_csv.
data = pd.read_csv('GPUbenchmark.csv')

# Seed NumPy's global RNG before the random 80/20 split. train_test_split is
# called without random_state, so it presumably draws from this global state.
np.random.seed(1)
train_data, val_data = train_test_split(data, test_size=0.2)


def _features_and_target(frame):
    """Return ``(X, y)`` for *frame*: columns 1.. as X, column 0 as y."""
    return frame.iloc[:, 1:].values, frame.iloc[:, 0].values


# Feature matrices and target vectors for the training and validation sets
X_train, y_train = _features_and_target(train_data)
X_val, y_val = _features_and_target(val_data)

# Forward selection algorithm
#
# Greedy search: starting from the empty set, each round adds the one remaining
# feature whose inclusion yields the lowest mean squared error on the
# validation set. The search runs until every feature has been added, and the
# subset with the lowest validation MSE seen along the way is kept as the
# final answer.
p = X_train.shape[1]  # Number of candidate features
selection_order = []  # Column indices in the order the greedy search added them
best_model = None  # Fitted model with the lowest validation MSE so far
best_model_mse = float('inf')
best_model_features = []  # Column indices that best_model was fitted on

for _ in range(p):
    best_feature = None
    best_mse = float('inf')
    best_candidate_model = None

    for feature in range(p):
        if feature in selection_order:
            continue

        # Try the current subset extended by this one feature. A new list is
        # built so the running selection is never mutated mid-evaluation.
        candidate_features = selection_order + [feature]

        model = LinearRegression()
        model.fit(X_train[:, candidate_features], y_train)

        # Calculate mean squared error on the validation set
        y_val_pred = model.predict(X_val[:, candidate_features])
        mse = mean_squared_error(y_val, y_val_pred)

        # Keep the fitted model together with its score so the winner of this
        # round does not have to be refitted afterwards.
        if mse < best_mse:
            best_feature = feature
            best_mse = mse
            best_candidate_model = model

    # No candidate produced a comparable score; stop instead of appending None.
    if best_feature is None:
        break

    selection_order.append(best_feature)

    # Remember the subset alongside the model: later rounds keep growing
    # selection_order, so it must be snapshotted here to stay in sync with
    # best_model.
    if best_mse < best_model_mse:
        best_model = best_candidate_model
        best_model_mse = best_mse
        best_model_features = list(selection_order)

# The selected features are the ones best_model was actually fitted on, not
# every feature the search eventually visited.
selected_features = best_model_features

# Get the names of the selected features (column 0 is the target, hence [1:])
selected_feature_names = data.columns[1:][selected_features]

# Print the best model and selected features
print("Best Model:")
print(best_model)
print("Selected Features:")
print(selected_feature_names)


Best Model:
LinearRegression()
Selected Features:
Index(['484', '11.1', '158', '1480', '1582', '11'], dtype='object')
