In [2]:
import pandas as pd
import warnings
import ast

# Silence every warning for the rest of the session. Note that this also hides
# library deprecation warnings, so API removals will surface only as hard errors.
warnings.filterwarnings('ignore')

In [8]:
# Show all columns when displaying wide frames. The full option name is used
# because pandas only resolves abbreviated names while they match exactly one
# option, which can stop being true when new options are added.
pd.set_option('display.max_columns', None)

## Silicon:

In [20]:
# Load the raw dataset. The path is relative, so the CSV has to sit in the
# notebook's working directory.
data = pd.read_csv('SiliconProjectBandgap_dataset.csv')

In [4]:
# (rows, columns) of the raw frame.
data.shape

(5000, 29)

In [14]:
# Column names of the raw frame; 'band_gap' is separated out as its own series below.
data.columns

Index(['atomic_numbers', 'volume_per_atom', 'empty_volume', 'metals_fraction',
       'distance_matrix', 'van_der_waals_radius', 'electrical_resistivity',
       'velocity_of_sound', 'reflectivity', 'poissons_ratio', 'molar_volume',
       'thermal_conductivity', 'melting_point', 'critical_temperature',
       'superconduction_temperature', 'liquid_range', 'bulk_modulus',
       'youngs_modulus', 'brinell_hardness', 'rigidity_modulus',
       'vickers_hardness', 'density_of_solid',
       'coefficient_of_linear_thermal_expansion', 'average_ionic_radius',
       'average_cationic_radius', 'average_anionic_radius',
       'spacegroup_numbers', 'formula', 'band_gap'],
      dtype='object')

In [66]:
# Work on a copy so the column drops below never touch the raw `data` frame.
df = data.copy()


In [67]:
# Keep the 'band_gap' column aside as its own series. It is read from the raw
# `data` frame (of which `df` is a copy) so this cell still works when re-run
# after 'band_gap' has been dropped from `df`.
band_gap = data['band_gap'].copy()

In [68]:
# Build the feature frame by removing 'band_gap' and the two columns that are
# not expanded below. Dropping from the raw `data` frame (rather than
# `df = df.drop(...)`) keeps the cell idempotent: a second run no longer raises
# KeyError for columns that are already gone.
df = data.drop(columns=['spacegroup_numbers', 'formula', 'band_gap'])

In [69]:
def safe_literal_eval(val):
    """Parse a string containing a Python literal; leave anything else untouched.

    Parameters
    ----------
    val : object
        A single cell value. Only ``str`` values are handed to
        ``ast.literal_eval``.

    Returns
    -------
    object
        The evaluated literal (list, tuple, number, ...) when ``val`` is a
        string that parses; otherwise ``val`` itself, unchanged.

    Notes
    -----
    Strings that cannot be parsed are printed before being returned, so
    malformed cells remain visible. Non-string values are passed through
    silently.
    """
    # Non-string cells (e.g. floats, NaN, None) are not text to parse.
    # ast.literal_eval rejects them with ValueError, which would otherwise
    # print one line per such cell.
    if not isinstance(val, str):
        return val

    try:
        return ast.literal_eval(val)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # literal_eval is documented to raise any of these on malformed input.
        print(val)
        return val

In [80]:
# Turn every cell's text into the Python object it spells out.
# DataFrame.applymap was deprecated in pandas 2.1 in favour of DataFrame.map,
# so pick whichever element-wise method this pandas version provides.
_apply_elementwise = df.map if hasattr(pd.DataFrame, 'map') else df.applymap
sample = _apply_elementwise(safe_literal_eval)

In [92]:
def expand_df(df, cols_to_expand, suffixes=('mean', 'max', 'min', 'std')):
    """Spread list-valued columns into one scalar column per list position.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose cells in ``cols_to_expand`` hold either a scalar or a
        sequence of values.
    cols_to_expand : iterable of str
        Names of the columns to expand. They are removed from the result and
        replaced by their expanded counterparts.
    suffixes : sequence of str, optional
        Names appended (as ``<column>_<suffix>``) when a column expands to
        exactly ``len(suffixes)`` positions. Defaults to the four statistics
        ``mean``, ``max``, ``min``, ``std``.

    Returns
    -------
    pandas.DataFrame
        The columns of ``df`` that were not expanded, followed by the expanded
        columns in the order given, sharing ``df``'s index.

    Notes
    -----
    Naming rules per expanded column:

    * ``len(suffixes)`` positions -> ``<column>_<suffix>``
    * one position (scalar cells) -> ``<column>`` (name unchanged)
    * any other width             -> ``<column>_1``, ``<column>_2``, ...
    """
    cols_to_expand = list(cols_to_expand)
    suffixes = list(suffixes)
    expanded_dfs = []

    for column in cols_to_expand:
        # Keep the caller's index. With a fresh RangeIndex the axis=1 concat
        # below would align on labels and produce NaN-padded, mismatched rows
        # whenever `df` is not indexed 0..n-1 (e.g. after filtering).
        expanded_df = pd.DataFrame(df[column].tolist(), index=df.index)

        num_columns = expanded_df.shape[1]

        if num_columns == len(suffixes):
            expanded_df.columns = [f'{column}_{suffix}' for suffix in suffixes]
        elif num_columns == 1:
            expanded_df.columns = [column]
        else:
            expanded_df.columns = [f'{column}_{i+1}' for i in range(num_columns)]

        expanded_dfs.append(expanded_df)

    # Untouched columns first, then every expanded block, side by side.
    return pd.concat([df.drop(columns=cols_to_expand)] + expanded_dfs, axis=1)


In [93]:
# Expand every parsed column into scalar feature columns. The helper defined
# in this notebook is `expand_df`; the previous call to `expand_df_` referred
# to a name that is never defined and fails with NameError on a fresh kernel.
X = expand_df(sample, sample.columns)

In [94]:
# (rows, columns) of the expanded feature frame.
X.shape

(5000, 95)

In [95]:
# Names produced by the expansion (`<column>_mean/max/min/std` for four-value columns).
X.columns

Index(['atomic_numbers_mean', 'atomic_numbers_max', 'atomic_numbers_min',
       'atomic_numbers_std', 'volume_per_atom', 'empty_volume',
       'metals_fraction', 'distance_matrix_mean', 'distance_matrix_max',
       'distance_matrix_min', 'distance_matrix_std',
       'van_der_waals_radius_mean', 'van_der_waals_radius_max',
       'van_der_waals_radius_min', 'van_der_waals_radius_std',
       'electrical_resistivity_mean', 'electrical_resistivity_max',
       'electrical_resistivity_min', 'electrical_resistivity_std',
       'velocity_of_sound_mean', 'velocity_of_sound_max',
       'velocity_of_sound_min', 'velocity_of_sound_std', 'reflectivity_mean',
       'reflectivity_max', 'reflectivity_min', 'reflectivity_std',
       'poissons_ratio_mean', 'poissons_ratio_max', 'poissons_ratio_min',
       'poissons_ratio_std', 'molar_volume_mean', 'molar_volume_max',
       'molar_volume_min', 'molar_volume_std', 'thermal_conductivity_mean',
       'thermal_conductivity_max', 'thermal_conduct

In [96]:
# Per-column dtype and non-null count of the expanded features.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 95 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   atomic_numbers_mean                           5000 non-null   float64
 1   atomic_numbers_max                            5000 non-null   int64  
 2   atomic_numbers_min                            5000 non-null   int64  
 3   atomic_numbers_std                            5000 non-null   float64
 4   volume_per_atom                               5000 non-null   float64
 5   empty_volume                                  5000 non-null   float64
 6   metals_fraction                               5000 non-null   float64
 7   distance_matrix_mean                          5000 non-null   float64
 8   distance_matrix_max                           5000 non-null   float64
 9   distance_matrix_min                           5000 non-null   f