In [1]:
"""
Enhanced Feature Engineering for Superconductor Data
====================================================
Author: Ankita Biswas
Project: Dashboard for Superconductors
Date: December 2025

This script improves upon the Kaggle code by:
1. Using our pre-cleaned, high-quality data
2. Better formula normalization handling
3. Enhanced material classification
4. More robust feature generation
5. Proper handling of oxygen variability
"""



### Import Packages

In [2]:
import pandas as pd
import numpy as np
import ast
import warnings
warnings.filterwarnings('ignore')

### Configuration

In [3]:
INPUT_FILE = '/home/digifort/Documents/Data_Management_F25/supercon/clean_data/superconductors_tier2_standard.csv'  # Our cleaned data
OUTPUT_DIR = '/home/digifort/Documents/Data_Management_F25/supercon/feature_engineered_data/'
OUTPUT_FILE = '/home/digifort/Documents/Data_Management_F25/supercon/feature_engineered_data/superconductors_with_features.csv'

print("=" * 70)
print("SUPERCONDUCTOR FEATURE ENGINEERING PIPELINE")
print("=" * 70)

SUPERCONDUCTOR FEATURE ENGINEERING PIPELINE


### Load data

In [4]:
print("\n[1/7] Loading cleaned data...")
df = pd.read_csv(INPUT_FILE)
print(f"Loaded {len(df)} records")

# Convert string representation of sets back to actual sets
df['elements'] = df['elements'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else set())

print(f"\nInitial columns ({len(df.columns)}): {list(df.columns)}")


[1/7] Loading cleaned data...
Loaded 26248 records

Initial columns (24): ['data_number', 'common_formula', 'chemical_formula', 'structure_name', 'tc_unit', 'tc_value', 'journal_reference', 'missing_formula', 'missing_tc', 'missing_critical', 'formula_normalized', 'has_oxygen_var', 'elements', 'n_elements', 'tc_kelvin', 'tc_has_uncertainty', 'tc_validation', 'tc_validation_reason', 'is_duplicate_formula', 'is_duplicate_exact', 'quality_tier', 'publication_year', 'is_high_tc', 'material_family']


In [5]:
print("\n[2/7] Selecting relevant columns...")

# Keep only columns needed for feature engineering and analysis
columns_to_keep = [
    # Core data
    'data_number',           # Keep for reference
    'chemical_formula',      # Original formula
    'formula_normalized',    # Cleaned formula
    'tc_kelvin',            # Target variable (Tc)
    
    # Composition info
    'elements',             # Set of elements
    'n_elements',           # Number of elements
    
    # Quality flags (useful for filtering later)
    'quality_tier',         # Quality classification
    'has_oxygen_var',       # Oxygen variability flag
    'is_duplicate_formula', # Duplicate flag
    'is_high_tc',          # High-Tc flag
    
    # Material info
    'material_family',      # Cuprate/iron-based/etc.
    'publication_year',     # Discovery year
    
    # Validation info
    'tc_validation',        # Valid/too_low/too_high
]

# Columns to DROP (not needed for feature engineering)
columns_to_drop = [
    'common_formula',       # Redundant with chemical_formula
    'structure_name',       # Not used in features
    'tc_unit',             # Always Kelvin
    'tc_value',            # Replaced by tc_kelvin
    'journal_reference',    # Not used in features
    'missing_formula',      # All False in tier2
    'missing_tc',          # All False in tier2
    'missing_critical',    # All False in tier2
    'tc_has_uncertainty',  # Keep only tc_kelvin
    'tc_validation_reason', # Redundant with tc_validation
    'is_duplicate_exact',  # Keep only is_duplicate_formula
]


[2/7] Selecting relevant columns...


In [6]:
df = df[columns_to_keep]
print(f"Kept {len(columns_to_keep)} relevant columns")
print(f"Dropped {len(columns_to_drop)} redundant columns")

Kept 13 relevant columns
Dropped 11 redundant columns


In [7]:
df

Unnamed: 0,data_number,chemical_formula,formula_normalized,tc_kelvin,elements,n_elements,quality_tier,has_oxygen_var,is_duplicate_formula,is_high_tc,material_family,publication_year,tc_validation
0,3,Ba0.1La1.9Ag0.1Cu0.9O4-Y,Ba0.1La1.9Ag0.1Cu0.9O4-Y,26.00,"{La, Cu, Ba, Ag, O}",5,tier2_standard,True,True,False,cuprate,1987.0,valid
1,4,Ba0.1La1.9Cu1O4-Y,Ba0.1La1.9Cu1O4-Y,19.00,"{Cu, Ba, O, La}",4,tier2_standard,True,False,False,cuprate,1987.0,valid
2,5,Ba0.15La1.85Cu1O4-Y,Ba0.15La1.85Cu1O4-Y,22.00,"{Cu, Ba, O, La}",4,tier2_standard,True,True,False,cuprate,1987.0,valid
3,6,Ba0.3La1.7Cu1O4-Y,Ba0.3La1.7Cu1O4-Y,23.00,"{Cu, Ba, O, La}",4,tier2_standard,True,False,False,cuprate,1987.0,valid
4,7,Ba0.5La1.5Cu1O4-Y,Ba0.5La1.5Cu1O4-Y,23.00,"{Cu, Ba, O, La}",4,tier2_standard,True,False,False,cuprate,1987.0,valid
...,...,...,...,...,...,...,...,...,...,...,...,...,...
26243,159088,Ta1S2,Ta1S2,2.75,"{Ta, S}",2,tier1_strict,False,True,False,other,2020.0,valid
26244,159089,Ta1Se3,Ta1Se3,2.00,"{Ta, Se}",2,tier2_standard,False,True,False,other,2020.0,valid
26245,159090,Nb1Ir2B2,Nb1Ir2B2,7.20,"{B, Ir, Nb}",3,tier1_strict,False,False,False,niobium,2020.0,valid
26246,159091,Ta1Ir2B2,Ta1Ir2B2,5.10,"{Ta, B, Ir}",3,tier1_strict,False,False,False,other,2020.0,valid


### Handling duplicates

In [8]:

print("\n[3/7] Handling duplicate measurements...")

initial_count = len(df)

# Better approach: use transform and drop_duplicates instead of apply
# First, calculate aggregated Tc values
grouped = df.groupby('formula_normalized')['tc_kelvin'].agg(['mean', 'std', 'count']).reset_index()
grouped.columns = ['formula_normalized', 'tc_kelvin_agg', 'tc_std', 'n_measurements']

# For groups with high variability, remove outliers
for idx, row in grouped.iterrows():
    if row['tc_std'] > 5 and row['n_measurements'] > 1:
        formula = row['formula_normalized']
        formula_data = df[df['formula_normalized'] == formula]['tc_kelvin']
        mean_tc = formula_data.mean()
        std_tc = formula_data.std()
        
        # Remove outliers
        mask = (formula_data >= mean_tc - 2*std_tc) & (formula_data <= mean_tc + 2*std_tc)
        if mask.sum() > 0:
            corrected_mean = formula_data[mask].mean()
            grouped.at[idx, 'tc_kelvin_agg'] = corrected_mean

# Fill NaN std with 0
grouped['tc_std'] = grouped['tc_std'].fillna(0.0)

# Take first occurrence of each formula (to preserve metadata)
df_unique = df.drop_duplicates(subset='formula_normalized', keep='first').copy()

# Merge the aggregated Tc values
df_unique = df_unique.drop(columns=['tc_kelvin'])  # Remove original Tc
df_unique = df_unique.merge(grouped[['formula_normalized', 'tc_kelvin_agg', 'tc_std', 'n_measurements']], 
                             on='formula_normalized', how='left')
df_unique.rename(columns={'tc_kelvin_agg': 'tc_kelvin'}, inplace=True)

print(f"Records before aggregation: {initial_count}")
print(f"Unique formulas after aggregation: {len(df_unique)}")
print(f"Reduced by: {initial_count - len(df_unique)} ({100*(initial_count - len(df_unique))/initial_count:.1f}%)")

# Add measurement count column if not already present
if 'n_measurements' not in df_unique.columns:
    df_unique['n_measurements'] = 1
    df_unique['tc_std'] = 0.0

df = df_unique


[3/7] Handling duplicate measurements...
Records before aggregation: 26248
Unique formulas after aggregation: 17740
Reduced by: 8508 (32.4%)


### Clean formulas for MatMiner

In [9]:
print("\n[4/7] Preparing formulas for feature generation...")

def clean_formula_for_matminer(formula):
    """
    Clean formula for matminer compatibility:
    - Remove oxygen variability markers (-Y, -Z, -X)
    - Handle variable stoichiometry
    - Keep only valid element symbols and numbers
    """
    if pd.isna(formula):
        return None
    
    import re
    
    # Remove suffixes like -Y, -Z, -X
    formula = re.sub(r'-[XYZ]$', '', formula)
    
    # Remove +X patterns
    formula = re.sub(r'\+[XYZ]', '', formula)
    
    # Remove trailing variable indicators
    formula = formula.rstrip('xyzXYZ')
    
    # Handle some common issues
    # Replace Oz with O (common typo/notation)
    formula = formula.replace('Oz', 'O')
    
    return formula.strip()

df['formula_clean'] = df['formula_normalized'].apply(clean_formula_for_matminer)

# Remove any formulas that couldn't be cleaned
before_clean = len(df)
df = df[df['formula_clean'].notna() & (df['formula_clean'] != '')]
after_clean = len(df)

print(f"Removed {before_clean - after_clean} formulas that couldn't be cleaned")
print(f"Formulas ready for featurization: {len(df)}")

# Show examples
print("\nExample cleaned formulas:")
for i, (orig, clean) in enumerate(zip(df['formula_normalized'].head(5), 
                                       df['formula_clean'].head(5))):
    print(f"  {i+1}. {orig:40s} → {clean}")


[4/7] Preparing formulas for feature generation...
Removed 0 formulas that couldn't be cleaned
Formulas ready for featurization: 17740

Example cleaned formulas:
  1. Ba0.1La1.9Ag0.1Cu0.9O4-Y                 → Ba0.1La1.9Ag0.1Cu0.9O4
  2. Ba0.1La1.9Cu1O4-Y                        → Ba0.1La1.9Cu1O4
  3. Ba0.15La1.85Cu1O4-Y                      → Ba0.15La1.85Cu1O4
  4. Ba0.3La1.7Cu1O4-Y                        → Ba0.3La1.7Cu1O4
  5. Ba0.5La1.5Cu1O4-Y                        → Ba0.5La1.5Cu1O4


### Enhanced material classification

In [10]:
print("\n[5/7] Enhancing material classification...")

def classify_superconductor_enhanced(elements_set):
    """
    Enhanced material classification based on composition.
    More detailed than the original material_family.
    """
    if not elements_set or len(elements_set) == 0:
        return 'Unknown'
    
    # Convert to set if needed
    if isinstance(elements_set, str):
        elements_set = ast.literal_eval(elements_set)
    
    # Alkali metals
    alkali_metals = {'Li', 'Na', 'K', 'Rb', 'Cs', 'Fr'}
    
    # Classification logic (order matters - most specific first)
    
    # Hydrogen-rich (H > 50% atomic fraction would need composition, use presence for now)
    if 'H' in elements_set and len(elements_set) <= 3:
        return 'Hydrogen-rich'
    
    # Organic (alkali + C)
    if any(elem in alkali_metals for elem in elements_set) and 'C' in elements_set:
        return 'Organic'
    
    # Elemental
    if len(elements_set) == 1:
        return 'Elemental'
    
    # Cuprates (Cu + O, most important high-Tc)
    if 'Cu' in elements_set and 'O' in elements_set:
        return 'Cuprate'
    
    # Iron-based
    if 'Fe' in elements_set:
        # Sub-classify iron-based
        if 'As' in elements_set:
            return 'Iron-pnictide'  # FeAs-based
        elif 'Se' in elements_set or 'Te' in elements_set:
            return 'Iron-chalcogenide'  # FeSe/FeTe-based
        else:
            return 'Iron-based'
    
    # Heavy fermion (Ce, U, Pu compounds)
    if any(elem in {'Ce', 'U', 'Pu', 'Np'} for elem in elements_set):
        return 'Heavy-fermion'
    
    # Bismuthates
    if 'Bi' in elements_set and 'O' in elements_set:
        return 'Bismuthate'
    
    # Borocarbides
    if 'B' in elements_set and 'C' in elements_set:
        return 'Borocarbide'
    
    # Niobium compounds
    if 'Nb' in elements_set:
        if 'N' in elements_set:
            return 'Niobium-nitride'
        elif 'Se' in elements_set:
            return 'Niobium-selenide'
        else:
            return 'Niobium-based'
    
    # Mercury-based (often high-Tc)
    if 'Hg' in elements_set and 'O' in elements_set:
        return 'Mercury-cuprate'  # Usually also cuprates
    
    # Magnesium diboride family
    if 'Mg' in elements_set and 'B' in elements_set:
        return 'MgB2-type'
    
    # Ruthenates
    if 'Ru' in elements_set and 'O' in elements_set:
        return 'Ruthenate'
    
    # Cobaltates
    if 'Co' in elements_set and 'O' in elements_set:
        return 'Cobaltate'
    
    # Default
    return 'Other'

df['category_detailed'] = df['elements'].apply(classify_superconductor_enhanced)

print("\nDetailed material categories:")
print(df['category_detailed'].value_counts())




[5/7] Enhancing material classification...

Detailed material categories:
category_detailed
Cuprate              7524
Other                5896
Iron-pnictide        1131
Niobium-based        1023
Heavy-fermion         562
Iron-chalcogenide     322
Borocarbide           308
Bismuthate            254
Iron-based            206
MgB2-type             141
Niobium-nitride        82
Organic                73
Hydrogen-rich          69
Elemental              61
Niobium-selenide       47
Cobaltate              33
Ruthenate               4
Mercury-cuprate         4
Name: count, dtype: int64


### Install and import MatMiner if needed

In [11]:
print("\n[6/7] Setting up matminer for feature generation...")

try:
    from matminer.featurizers.conversions import StrToComposition
    from matminer.featurizers import composition as cf
    from matminer.featurizers.base import MultipleFeaturizer
    from matminer.featurizers.composition import ElementProperty
    from pymatgen.core.composition import Composition
    print("✓ Matminer and pymatgen already installed")
except ImportError:
    print("Installing matminer and pymatgen...")
    import subprocess
    import sys
    
    # Install matminer
    subprocess.check_call([sys.executable, "-m", "pip", "install", "matminer", "-q"])
    
    # Uninstall and reinstall specific pymatgen version
    #subprocess.check_call([sys.executable, "-m", "pip", "uninstall", "pymatgen", "-y", "-q"])
    #subprocess.check_call([sys.executable, "-m", "pip", "install", "pymatgen==2023.3.10", "-q"])
    
    # Import after installation
    from matminer.featurizers.conversions import StrToComposition
    from matminer.featurizers import composition as cf
    from matminer.featurizers.base import MultipleFeaturizer
    from matminer.featurizers.composition import ElementProperty
    from pymatgen.core.composition import Composition
    print("Matminer and pymatgen installed successfully")




[6/7] Setting up matminer for feature generation...
Installing matminer and pymatgen...
Matminer and pymatgen installed successfully


### Generate composition based features using Matminer

In [12]:
print("\n[7/7] Generating composition-based features...")
print("This may take several minutes...")

# Convert formula strings to Composition objects
print("\n  Converting formulas to compositions...")
df = StrToComposition().featurize_dataframe(df, "formula_clean", ignore_errors=True)

# Count how many failed
n_failed = df['composition'].isna().sum()
if n_failed > 0:
    print(f"  Warning: {n_failed} formulas failed conversion ({100*n_failed/len(df):.1f}%)")
    print(f"  Removing failed conversions...")
    df = df.dropna(subset=['composition'])
    print(f"  Remaining records: {len(df)}")

# Generate features
print("\n  Generating composition features...")
print("  This includes:")
print("    - Stoichiometry features")
print("    - Element property statistics (magpie)")
print("    - Valence orbital properties")
print("    - Ion properties")
print("    - Transition metal fraction")

feature_calculators = MultipleFeaturizer([
    cf.Stoichiometry(),                              # Basic stoichiometry
    cf.ElementProperty.from_preset("magpie"),        # 132 features from magpie
    cf.ValenceOrbital(props=['avg']),               # Valence orbital properties
    cf.IonProperty(fast=True),                      # Ion properties
    cf.TMetalFraction()                             # Transition metal content
])

feature_labels = feature_calculators.feature_labels()
print(f"\n  Generating {len(feature_labels)} features...")

df = feature_calculators.featurize_dataframe(df, col_id='composition', ignore_errors=True)

# Remove any rows where feature generation failed
initial_len = len(df)
df = df.dropna()
final_len = len(df)

if initial_len - final_len > 0:
    print(f"  Removed {initial_len - final_len} records with failed features")

print(f"\n Feature generation complete!")
print(f"  Final dataset: {final_len} records x {len(df.columns)} columns")
print(f"  Features generated: {len(feature_labels)}")


[7/7] Generating composition-based features...
This may take several minutes...

  Converting formulas to compositions...


StrToComposition:   0%|          | 0/17740 [00:00<?, ?it/s]

  Removing failed conversions...
  Remaining records: 16124

  Generating composition features...
  This includes:
    - Stoichiometry features
    - Element property statistics (magpie)
    - Valence orbital properties
    - Ion properties
    - Transition metal fraction

  Generating 146 features...


MultipleFeaturizer:   0%|          | 0/16124 [00:00<?, ?it/s]

  Removed 279 records with failed features

 Feature generation complete!
  Final dataset: 15845 records x 164 columns
  Features generated: 146


### Save data

In [13]:
print(f"\n[8/8] Saving feature-engineered data...")

import os
os.makedirs(OUTPUT_DIR, exist_ok=True)

def safe_save_csv(df, filepath):
    """Safely save dataframe, handling sets and complex objects"""
    df_save = df.copy()
    
    # Convert complex types to strings
    for col in df_save.columns:
        if df_save[col].dtype == 'object':
            sample = df_save[col].dropna().iloc[0] if len(df_save[col].dropna()) > 0 else None
            if sample is not None and not isinstance(sample, str):
                df_save[col] = df_save[col].astype(str)
    
    # Try CSV save
    try:
        df_save.to_csv(filepath, index=False)
        return True, "CSV"
    except Exception as e:
        # Fallback to pickle
        pickle_path = filepath.replace('.csv', '.pkl')
        df.to_pickle(pickle_path)
        return True, "PICKLE"

# Save full dataset
output_path = os.path.join(OUTPUT_DIR, OUTPUT_FILE)
success, format_type = safe_save_csv(df, output_path)

if format_type == "CSV":
    print(f" Saved CSV: {output_path}")
else:
    pickle_path = output_path.replace('.csv', '.pkl')
    print(f" CSV save failed due to pandas compatibility issue")
    print(f" Saved as pickle instead: {pickle_path}")
    print(f"  To load: df = pd.read_pickle('{pickle_path}')")

# Save feature list
feature_list_path = os.path.join(OUTPUT_DIR, 'feature_list.txt')
with open(feature_list_path, 'w') as f:
    f.write("COMPOSITION-BASED FEATURES\n")
    f.write("=" * 70 + "\n\n")
    f.write(f"Total features: {len(feature_labels)}\n\n")
    f.write("Feature categories:\n")
    f.write("  1. Stoichiometry features\n")
    f.write("  2. Element property statistics (magpie preset)\n")
    f.write("  3. Valence orbital properties\n")
    f.write("  4. Ion properties\n")
    f.write("  5. Transition metal fraction\n\n")
    f.write("Feature list:\n")
    f.write("-" * 70 + "\n")
    for i, feat in enumerate(feature_labels, 1):
        f.write(f"{i:3d}. {feat}\n")

print(f" Saved: {feature_list_path}")

# Save summary statistics
summary_path = os.path.join(OUTPUT_DIR, 'summary_statistics.txt')
with open(summary_path, 'w') as f:
    f.write("FEATURE ENGINEERING SUMMARY\n")
    f.write("=" * 70 + "\n\n")
    f.write(f"Input file: {INPUT_FILE}\n")
    f.write(f"Output file: {OUTPUT_FILE}\n\n")
    
    f.write("Data Statistics:\n")
    f.write(f"  Final records: {len(df)}\n")
    f.write(f"  Total columns: {len(df.columns)}\n")
    f.write(f"  Composition features: {len(feature_labels)}\n\n")
    
    f.write("Tc Statistics:\n")
    f.write(f"  Mean: {df['tc_kelvin'].mean():.2f} K\n")
    f.write(f"  Median: {df['tc_kelvin'].median():.2f} K\n")
    f.write(f"  Std: {df['tc_kelvin'].std():.2f} K\n")
    f.write(f"  Min: {df['tc_kelvin'].min():.2f} K\n")
    f.write(f"  Max: {df['tc_kelvin'].max():.2f} K\n\n")
    
    f.write("Material Categories:\n")
    for cat, count in df['category_detailed'].value_counts().items():
        f.write(f"  {cat}: {count} ({100*count/len(df):.1f}%)\n")

print(f" Saved: {summary_path}")



[8/8] Saving feature-engineered data...
 Saved CSV: /home/digifort/Documents/Data_Management_F25/supercon/feature_engineered_data/superconductors_with_features.csv
 Saved: /home/digifort/Documents/Data_Management_F25/supercon/feature_engineered_data/feature_list.txt
 Saved: /home/digifort/Documents/Data_Management_F25/supercon/feature_engineered_data/summary_statistics.txt


### Summary

In [14]:
print("\n" + "=" * 70)
print("FEATURE ENGINEERING COMPLETE!")
print("=" * 70)
print(f"\nFinal dataset shape: {df.shape}")
print(f"  Records: {len(df)}")
print(f"  Features: {len(feature_labels)}")
print(f"  Total columns: {len(df.columns)}")

print(f"\nOutput files:")
print(f"  1. {output_path}")
print(f"  2. {feature_list_path}")
print(f"  3. {summary_path}")

print(f"\nTop features by correlation with Tc:")
# Calculate correlations with Tc
correlations = df[feature_labels].corrwith(df['tc_kelvin']).abs().sort_values(ascending=False)
print(correlations.head(10).to_string())

print("\n" + "=" * 70)
print("Next steps:")
print("  1. Explore features in feature_engineered_data/")
print("  2. Build predictive models")
print("  3. Create dashboard with feature-enhanced data")
print("=" * 70)


FEATURE ENGINEERING COMPLETE!

Final dataset shape: (15845, 164)
  Records: 15845
  Features: 146
  Total columns: 164

Output files:
  1. /home/digifort/Documents/Data_Management_F25/supercon/feature_engineered_data/superconductors_with_features.csv
  2. /home/digifort/Documents/Data_Management_F25/supercon/feature_engineered_data/feature_list.txt
  3. /home/digifort/Documents/Data_Management_F25/supercon/feature_engineered_data/summary_statistics.txt

Top features by correlation with Tc:
MagpieData avg_dev GSvolume_pa          0.678444
max ionic char                          0.660387
MagpieData range Electronegativity      0.651378
MagpieData range GSvolume_pa            0.635451
avg ionic char                          0.625714
0-norm                                  0.617598
MagpieData maximum Electronegativity    0.612725
MagpieData range CovalentRadius         0.608645
MagpieData maximum GSvolume_pa          0.606707
MagpieData avg_dev Electronegativity    0.601974

Next steps:
 

In [18]:
df.head(20)

Unnamed: 0,data_number,chemical_formula,formula_normalized,elements,n_elements,quality_tier,has_oxygen_var,is_duplicate_formula,is_high_tc,material_family,...,MagpieData avg_dev SpaceGroupNumber,MagpieData mode SpaceGroupNumber,avg s valence electrons,avg p valence electrons,avg d valence electrons,avg f valence electrons,compound possible,max ionic char,avg ionic char,transition metal fraction
0,3,Ba0.1La1.9Ag0.1Cu0.9O4-Y,Ba0.1La1.9Ag0.1Cu0.9O4-Y,"{La, Cu, Ba, Ag, O}",5,tier2_standard,True,True,False,cuprate,...,94.77551,12.0,1.857143,2.285714,1.7,0.0,False,0.803211,0.164889,0.142857
1,4,Ba0.1La1.9Cu1O4-Y,Ba0.1La1.9Cu1O4-Y,"{Cu, Ba, O, La}",4,tier2_standard,True,False,False,cuprate,...,94.77551,12.0,1.857143,2.285714,1.7,0.0,False,0.803211,0.164951,0.142857
2,5,Ba0.15La1.85Cu1O4-Y,Ba0.15La1.85Cu1O4-Y,"{Cu, Ba, O, La}",4,tier2_standard,True,True,False,cuprate,...,95.061224,12.0,1.857143,2.285714,1.692857,0.0,False,0.803211,0.165284,0.142857
3,6,Ba0.3La1.7Cu1O4-Y,Ba0.3La1.7Cu1O4-Y,"{Cu, Ba, O, La}",4,tier2_standard,True,False,False,cuprate,...,95.918367,12.0,1.857143,2.285714,1.671429,0.0,False,0.803211,0.166278,0.142857
4,7,Ba0.5La1.5Cu1O4-Y,Ba0.5La1.5Cu1O4-Y,"{Cu, Ba, O, La}",4,tier2_standard,True,False,False,cuprate,...,97.061224,12.0,1.857143,2.285714,1.642857,0.0,False,0.803211,0.167587,0.142857
5,8,Ba1La1Cu1O4-Y,Ba1La1Cu1O4-Y,"{Cu, Ba, O, La}",4,tier2_standard,True,False,False,cuprate,...,99.918367,12.0,1.857143,2.285714,1.571429,0.0,False,0.803211,0.170782,0.142857
6,11,Sr0.1La1.9Cu1O4-Y,Sr0.1La1.9Cu1O4-Y,"{La, Cu, O, Sr}",4,tier2_standard,True,False,False,cuprate,...,94.710204,12.0,1.857143,2.285714,1.7,0.0,False,0.787757,0.164756,0.142857
7,12,Sr0.15La1.85Cu1O4-Y,Sr0.15La1.85Cu1O4-Y,"{La, Cu, O, Sr}",4,tier2_standard,True,False,False,cuprate,...,94.963265,12.0,1.857143,2.285714,1.692857,0.0,False,0.787757,0.164994,0.142857
8,13,Sr0.2La1.8Cu1O4-Y,Sr0.2La1.8Cu1O4-Y,"{La, Cu, O, Sr}",4,tier2_standard,True,False,False,cuprate,...,95.216327,12.0,1.857143,2.285714,1.685714,0.0,False,0.787757,0.16523,0.142857
9,14,Sr0.3La1.7Cu1O4-Y,Sr0.3La1.7Cu1O4-Y,"{La, Cu, O, Sr}",4,tier2_standard,True,False,False,cuprate,...,95.722449,12.0,1.857143,2.285714,1.671429,0.0,False,0.787757,0.165702,0.142857


In [16]:
df.to_csv('fin_v1.csv', index=False)