In [None]:
import pandas as pd
from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from itertools import combinations
import numpy as np


In [None]:
# Identify the Google Sheets workbook and the tab to download.
sheet_id = "11wZvIszOgW-be-Vzs8FTop7NaJn8pNJQeqfcxr2Kh70"
sheet_name = "Sheet1"

# Build the gviz query URL; `tqx=out:csv` requests the tab as CSV text.
url = (
    "https://docs.google.com/spreadsheets/d/"
    + sheet_id
    + "/gviz/tq?tqx=out:csv&sheet="
    + sheet_name
)

# Parse the CSV response directly into the working DataFrame.
table = pd.read_csv(filepath_or_buffer=url)

In [None]:
# Increasing the number of features
#
# Derive extra descriptor columns from the raw ones: powers of `rcov`,
# squares of `d_occ` / `d_unocc`, and products / ratios of `eps_d` with
# `IE`, `q (e)` and `EN`. Whole-column (vectorized) arithmetic is used
# instead of element-wise `.apply(lambda ...)`.
# The assignment order below fixes the column order of `table`, which any
# positional slicing of `table` relies on, so keep it stable.

# Powers of rcov.
table['rcov^2'] = table['rcov'] ** 2
table['rcov^3'] = table['rcov'] ** 3
table['rcov^-1'] = 1 / table['rcov']  # reciprocal of rcov

# Squares of d_occ and d_unocc.
table['d_occ^2'] = table['d_occ'] ** 2
table['d_unocc^2'] = table['d_unocc'] ** 2

# Products of eps_d with IE, q (e) and EN.
table['eps_d x ie'] = table['eps_d'] * table['IE']
table['eps_d x q(e)'] = table['eps_d'] * table['q (e)']
table['eps_d x en'] = table['eps_d'] * table['EN']

# Ratios of eps_d to IE, q (e) and EN.
table['eps_d / ie'] = table['eps_d'] / table['IE']
table['eps_d / q(e)'] = table['eps_d'] / table['q (e)']
table['eps_d / en'] = table['eps_d'] / table['EN']

# Display the augmented frame as the cell output.
table

Unnamed: 0,Size,Atom,G_h (eV),rcov,rd,Ef,IE,EN,eps_d,n,...,rcov^3,rcov^-1,d_occ^2,d_unocc^2,eps_d x ie,eps_d x q(e),eps_d x en,eps_d / ie,eps_d / q(e),eps_d / en
0,Periodic Graphene,Sc,-0.190,1.70,0.310,-8.282,6.561,1.36,1.064,3,...,4.913000,0.588235,0.7225,17.3889,6.980904,1.744960,1.44704,0.162170,0.648780,0.782353
1,Periodic Graphene,Ti,-0.500,1.60,0.280,-8.326,6.827,1.54,0.998,4,...,4.096000,0.625000,0.6889,12.8164,6.813346,1.401192,1.53692,0.146184,0.710826,0.648052
2,Periodic Graphene,V,-0.270,1.53,0.260,-7.494,6.746,1.63,1.316,5,...,3.581577,0.653595,1.4884,6.6049,8.877736,1.764756,2.14508,0.195079,0.981357,0.807362
3,Periodic Graphene,Cr,0.310,1.39,0.250,-6.604,6.766,1.66,0.944,6,...,2.685619,0.719424,5.5696,0.9604,6.387104,1.192272,1.56704,0.139521,0.747427,0.568675
4,Periodic Graphene,Mn,0.430,1.39,0.230,-6.520,7.434,1.55,-0.513,7,...,2.685619,0.719424,1.7689,6.0516,-3.813642,-0.656127,-0.79515,-0.069007,-0.401095,-0.330968
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94,Small nanographene,Ag,2.048,1.45,0.385,-3.331,7.576,1.93,-4.020,11,...,3.048625,0.689655,0.0009,0.2025,-30.455520,-3.107460,-7.75860,-0.530623,-5.200517,-2.082902
95,Small nanographene,Cd,0.245,1.44,0.370,-2.291,8.994,1.69,-2.120,12,...,2.985984,0.694444,0.1849,0.0001,-19.067280,-2.266280,-3.58280,-0.235713,-1.983162,-1.254438
96,Small nanographene,Hf,-1.127,1.75,0.630,-9.987,6.824,1.30,0.453,4,...,5.359375,0.571429,0.2916,4.6656,3.091272,0.711663,0.58890,0.066383,0.288351,0.348462
97,Small nanographene,Ta,-1.092,1.70,0.605,-9.573,7.887,1.50,0.120,5,...,4.913000,0.588235,1.0201,3.7636,0.946440,0.192600,0.18000,0.015215,0.074766,0.080000


In [None]:
# Exhaustive best-subset search.
#
# Fits an ordinary least-squares model on every combination of SUBSET_SIZE
# feature columns and ranks the combinations by their error on the same data
# they were fitted on (no hold-out split or cross-validation is performed
# here, so the score is an in-sample error).
#
# The number of fits is C(n_features, SUBSET_SIZE), which grows quickly with
# the number of columns in X.

# Positional split: column 2 is the regression target, columns 3+ are features.
X = table.iloc[:, 3:]
y = table.iloc[:, [2]]

# Number of features per candidate subset.
SUBSET_SIZE = 4

# Every combination of SUBSET_SIZE column positions within X.
feature_subsets = [
    list(subset) for subset in combinations(range(X.shape[1]), SUBSET_SIZE)
]

# Labels are looked up once, directly from X, so positions in a subset map
# straight to names without any offset arithmetic.
feature_labels = X.columns.tolist()
result_columns = ['MSE'] + [f'Feature {k}' for k in range(1, SUBSET_SIZE + 1)]

# Lowest score seen so far; stays at +inf if there are no subsets to evaluate.
best_mse = np.inf

# One row per subset; collected in a list and turned into a DataFrame once,
# because DataFrame.append was removed in pandas 2.0 and growing a frame
# row by row is quadratic.
records = []
for subset in feature_subsets:
    X_subset = X.iloc[:, subset]

    # Train a linear regression model using the current subset of features.
    model = LinearRegression()
    model.fit(X_subset, y)

    # NOTE: the square root is taken, so this value is the RMSE. It is stored
    # under the historical 'MSE' key so existing consumers keep working.
    y_pred = model.predict(X_subset)
    mse = np.sqrt(mean_squared_error(y, y_pred))

    best_mse = min(best_mse, mse)
    records.append([mse] + [feature_labels[i] for i in subset])

# Explicit columns keep the frame well-formed (and sortable) even when no
# subset could be formed, e.g. when X has fewer than SUBSET_SIZE columns.
best_subset = pd.DataFrame(records, columns=result_columns)

sorted_features = best_subset.sort_values('MSE', ascending=True)

print(sorted_features.head(10))

NameError: ignored