In [2]:
import ast
import warnings
from pathlib import Path

import pandas as pd

warnings.filterwarnings('ignore')

In [8]:
# Show every column when a DataFrame is displayed (None removes the limit).
# The option key is spelled out in full: abbreviated keys are resolved by
# pattern matching and can become ambiguous if pandas adds similar options.
pd.set_option('display.max_columns', None)

## Load data:

In [None]:
# Read the silicon and germanium band-gap tables from the local data folder.
si_data = pd.read_csv('./data/SiliconProjectBandgap_dataset.csv')
ge_data = pd.read_csv('./data/GermaniumProjectBandgap_dataset.csv')

In [67]:
# Take an independent copy of the 'band_gap' column from each dataset.
band_gap_si = si_data.loc[:, 'band_gap'].copy()
band_gap_ge = ge_data.loc[:, 'band_gap'].copy()

In [68]:
# Build the feature tables from the full datasets. The drop has to run on the
# DataFrames: the 'band_gap' Series extracted above has no column axis, so
# calling .drop(..., axis=1) on it raises a ValueError.
# NOTE(review): the band_gap_* names are kept because the following cells read
# them, but from here on they hold the remaining columns, not the band-gap values.
non_feature_cols = ['spacegroup_numbers', 'formula', 'band_gap']
band_gap_si = si_data.drop(columns=non_feature_cols)
band_gap_ge = ge_data.drop(columns=non_feature_cols)


In [69]:
def safe_literal_eval(val):
    """Parse a string holding a Python literal, falling back to the input itself.

    Parameters
    ----------
    val : object
        Value to convert. Strings are handed to ``ast.literal_eval``;
        anything else is returned unchanged.

    Returns
    -------
    object
        The evaluated literal, or ``val`` itself when it is not a string or
        cannot be parsed as a literal.
    """
    # Non-string values have nothing to parse; return them silently instead of
    # routing them through literal_eval and printing each one on failure.
    if not isinstance(val, str):
        return val
    try:
        return ast.literal_eval(val)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # literal_eval can raise any of these on malformed input (for example
        # TypeError for "{[1]: 2}"). Echo the offending text so it can be
        # inspected, then keep the raw string.
        print(val)
        return val

In [80]:
# Convert every cell from its string form into a Python object.
# Series.map is applied column by column instead of DataFrame.applymap, which
# has been deprecated since pandas 2.1 in favour of DataFrame.map; this form
# runs on both older and newer pandas versions.
sample_si = band_gap_si.apply(lambda col: col.map(safe_literal_eval))
sample_ge = band_gap_ge.apply(lambda col: col.map(safe_literal_eval))

In [92]:
def expand_df(df, cols_to_expand):
    """Spread list-valued columns of ``df`` into one column per list position.

    Each column named in ``cols_to_expand`` is turned into a small frame via
    ``pd.DataFrame(df[column].tolist())`` and renamed according to its width:

    * 4 columns  -> ``<column>_mean``, ``<column>_max``, ``<column>_min``,
      ``<column>_std`` (in that order)
    * 1 column   -> keeps the original name ``<column>``
    * otherwise  -> ``<column>_1`` ... ``<column>_n``

    NOTE(review): the 4-wide naming assumes the values are stored as
    (mean, max, min, std) - confirm against the dataset.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is not modified.
    cols_to_expand : iterable of column labels
        Columns of ``df`` to expand.

    Returns
    -------
    pandas.DataFrame
        ``df`` without ``cols_to_expand``, followed by the expanded columns,
        carrying the same index as ``df``.
    """
    suffixes = ['mean', 'max', 'min', 'std']
    # Materialise once so the same labels drive both the loop and the final
    # drop, even if the caller passed a one-shot iterable.
    cols_to_expand = list(cols_to_expand)

    expanded_parts = []
    for column in cols_to_expand:
        # Reuse df's index so the new columns line up with the untouched ones
        # in the concat below; a fresh 0..n-1 index would misalign any frame
        # with a different index and fill the result with NaN.
        expanded = pd.DataFrame(df[column].tolist(), index=df.index)
        width = expanded.shape[1]

        if width == len(suffixes):
            expanded.columns = [f'{column}_{suffix}' for suffix in suffixes]
        elif width == 1:
            expanded.columns = [column]
        else:
            expanded.columns = [f'{column}_{i + 1}' for i in range(width)]

        expanded_parts.append(expanded)

    # Untouched columns first, then the expanded ones in the order requested.
    return pd.concat([df.drop(columns=cols_to_expand)] + expanded_parts, axis=1)


In [93]:
# Expand every column of each parsed table.
silicon = expand_df(df=sample_si, cols_to_expand=sample_si.columns)
germanium = expand_df(df=sample_ge, cols_to_expand=sample_ge.columns)

In [None]:
# to_csv does not create missing folders, so make sure the output directory
# exists before writing (a no-op when it is already there).
cleaned_dir = Path('./data/cleaned')
cleaned_dir.mkdir(parents=True, exist_ok=True)

# The index is still written as the first CSV column, as before.
silicon.to_csv(cleaned_dir / 'silicon.csv')
germanium.to_csv(cleaned_dir / 'germanium.csv')
