In [1]:
import pandas as pd
import warnings
import ast

warnings.filterwarnings('ignore')

In [2]:
# Show every column when a frame is displayed (None = no column limit).
# The full option key is spelled out: abbreviated keys are resolved by
# pattern matching and can stop being unique if pandas adds a similar option.
pd.set_option('display.max_columns', None)

## Load data:

In [3]:
# Read the two raw CSV exports (silicon first, germanium second) into frames.
si_data, ge_data = map(
    pd.read_csv,
    (
        '../data/SiliconProjectBandgap_dataset.csv',
        '../data/GermaniumProjectBandgap_dataset.csv',
    ),
)

In [4]:
# Keep independent copies of the 'band_gap' column of each raw table so the
# values survive the column drop performed on the feature frames.
band_gap_si, band_gap_ge = (
    raw_table['band_gap'].copy() for raw_table in (si_data, ge_data)
)

In [5]:
# Feature frames: the raw tables without the space-group, formula and
# band-gap columns. `drop` returns new frames; the raw tables are untouched.
si_df = si_data.drop(columns=['spacegroup_numbers', 'formula', 'band_gap'])
ge_df = ge_data.drop(columns=['spacegroup_numbers', 'formula', 'band_gap'])


In [6]:
def safe_literal_eval(val):
    """Parse ``val`` as a Python literal, falling back to ``val`` itself.

    Parameters
    ----------
    val : object
        A single cell value. Strings are passed to ``ast.literal_eval``;
        anything else (numbers, ``None``, NaN, already-parsed containers) is
        returned unchanged without attempting to parse it.

    Returns
    -------
    object
        The evaluated literal when ``val`` is a string holding a valid Python
        literal, otherwise ``val`` unchanged.

    Notes
    -----
    Strings that cannot be parsed are printed so they can be inspected, then
    returned as-is. This function never raises for malformed input.
    """
    # Non-strings have nothing to parse; returning early avoids driving the
    # common case through a raised-and-caught exception (and printing it).
    if not isinstance(val, str):
        return val
    try:
        return ast.literal_eval(val)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # literal_eval may raise any of these on malformed input, e.g.
        # TypeError for an unhashable dict key such as "{[1]: 2}".
        print(val)
        return val

In [7]:
# Parse every cell of the feature frames with safe_literal_eval.
# DataFrame.applymap has been deprecated since pandas 2.1 in favour of
# DataFrame.map; pick whichever element-wise method the installed pandas has
# so this cell runs on both older and newer releases.
_ELEMENTWISE_METHOD = 'map' if hasattr(pd.DataFrame, 'map') else 'applymap'

sample_si = getattr(si_df, _ELEMENTWISE_METHOD)(safe_literal_eval)
sample_ge = getattr(ge_df, _ELEMENTWISE_METHOD)(safe_literal_eval)

In [8]:
def expand_df(df, cols_to_expand):
    """Split sequence-valued columns of ``df`` into one column per position.

    Each column named in ``cols_to_expand`` is converted with
    ``pd.DataFrame(df[column].tolist())`` and the resulting columns are named
    according to how many there are:

    * exactly 4 -> ``{column}_mean``, ``{column}_max``, ``{column}_min``,
      ``{column}_std`` (assumes the values are stored in that order —
      TODO confirm against the dataset);
    * exactly 1 -> the original column name is kept;
    * anything else -> ``{column}_1``, ``{column}_2``, ...

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame. It is not modified.
    cols_to_expand : list-like of column labels
        Columns of ``df`` to expand.

    Returns
    -------
    pandas.DataFrame
        The columns of ``df`` that were not expanded, followed by the expanded
        columns in the order given by ``cols_to_expand``, on the same index as
        ``df``.
    """
    suffixes = ['mean', 'max', 'min', 'std']
    expanded_dfs = []

    for column in cols_to_expand:
        # Build the expanded frame on df's own index. With the default
        # RangeIndex, the axis=1 concat below aligns on labels and would
        # produce misaligned, NaN-padded rows whenever df is not indexed 0..n-1.
        expanded_df = pd.DataFrame(df[column].tolist(), index=df.index)

        num_columns = expanded_df.shape[1]

        if num_columns == 4:
            expanded_df.columns = [f'{column}_{suffix}' for suffix in suffixes]
        elif num_columns == 1:
            expanded_df.columns = [column]
        else:
            expanded_df.columns = [f'{column}_{i+1}' for i in range(num_columns)]

        expanded_dfs.append(expanded_df)

    # Untouched columns first, then every expanded block side by side.
    return pd.concat([df.drop(columns=cols_to_expand)] + expanded_dfs, axis=1)


In [9]:
# Expand every column of the parsed frames into per-position columns.
silicon = expand_df(df=sample_si, cols_to_expand=sample_si.columns)
germanium = expand_df(df=sample_ge, cols_to_expand=sample_ge.columns)

In [11]:
import os

In [12]:
# Folder that receives the cleaned CSV files.
directory = "../data/cleaned"

# Create it (and any missing parents); an already existing folder is fine.
os.makedirs(name=directory, exist_ok=True)


In [14]:
# Write the expanded feature tables without the index column.
silicon.to_csv(path_or_buf='../data/cleaned/silicon.csv', index=False)
germanium.to_csv(path_or_buf='../data/cleaned/germanium.csv', index=False)


In [15]:
# Write the band-gap target series next to the feature tables, index omitted.
for _target_path, _band_gap in (
    ('../data/cleaned/bandgapSi.csv', band_gap_si),
    ('../data/cleaned/bandgapGe.csv', band_gap_ge),
):
    _band_gap.to_csv(_target_path, index=False)
