# Statistical analysis of perovskite data

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
from scipy import stats
from collections import Counter
import networkx as nx
import warnings
import re
from plotly_theme import register_template, scatter_plot, set_defaults, SINGLE_COLUMN_PX, DOUBLE_COLUMN_PX
import plotly.express as px
import plotly.graph_objects as go
register_template()
set_defaults()

In [2]:
df = pd.read_parquet('pvdb.parquet')

In [3]:
list(df.columns)

['data.add.lay_back',
 'data.add.lay_back_deposition_aggregation_state_of_reactants',
 'data.add.lay_back_deposition_procedure',
 'data.add.lay_back_deposition_reaction_solutions_age',
 'data.add.lay_back_deposition_reaction_solutions_compounds_purity',
 'data.add.lay_back_deposition_reaction_solutions_compounds_supplier',
 'data.add.lay_back_deposition_reaction_solutions_temperature',
 'data.add.lay_back_deposition_reaction_solutions_volumes',
 'data.add.lay_back_deposition_solvents',
 'data.add.lay_back_deposition_solvents_purity',
 'data.add.lay_back_deposition_solvents_supplier',
 'data.add.lay_back_deposition_substrate_temperature',
 'data.add.lay_back_deposition_synthesis_atmosphere',
 'data.add.lay_back_deposition_thermal_annealing_atmosphere',
 'data.add.lay_back_deposition_thermal_annealing_temperature',
 'data.add.lay_back_deposition_thermal_annealing_time',
 'data.add.lay_back_function',
 'data.add.lay_back_stack_sequence',
 'data.add.lay_back_storage_atmosphere',
 'data.add

In [4]:
material_normalization = {
    # Spiro-OMeTAD variants
    'Spiro-MeOTAD': 'Spiro-OMeTAD',
    'Spiro-OMETAD': 'Spiro-OMeTAD',
    'Spiro-oMeTAD': 'Spiro-OMeTAD',
    'spiro-OMeTAD': 'Spiro-OMeTAD',
    'spiro-MeOTAD': 'Spiro-OMeTAD',
    
    # PCBM variants
    'PCBM-60': 'PCBM',
    'PCBM60': 'PCBM',
    'PC60BM': 'PCBM',
    'PC61BM': 'PCBM',
    
    # PEDOT:PSS variants
    'PEDOT-PSS': 'PEDOT:PSS',
    'PEDOT/PSS': 'PEDOT:PSS',
    'PEDOT PSS': 'PEDOT:PSS',
}

def normalize_materials(material_array, norm_dict):
    if not isinstance(material_array, np.ndarray):
        return material_array
    return np.array([norm_dict.get(m, m) for m in material_array])

print("Normalizing material names...")
for col in ['results.properties.optoelectronic.solar_cell.hole_transport_layer',
            'results.properties.optoelectronic.solar_cell.electron_transport_layer',
            'results.properties.optoelectronic.solar_cell.device_stack',
            'results.properties.optoelectronic.solar_cell.absorber']:
    df[col] = df[col].apply(lambda x: normalize_materials(x, material_normalization))

print("Normalization complete.")

Normalizing material names...
Normalization complete.


In [5]:
def filter_theory_papers(df, text_column='data.ref.free_text_comment'):
    theory_keywords = [
        r'\bDFT\b', r'\bSCAPS\b', r'\bSCAPS-1D\b', r'density functional', r'first.?principles', 
        r'ab.?initio', r'molecular dynamics', r'\bMD\b simulation', r'VASP', r'Gaussian', 
        r'Quantum ESPRESSO', r'CASTEP', r'SIESTA', r'computational study', r'theoretical study', 
        r'theoretical investigation', r'numerical simulation', r'device simulation', 
        r'theoretical analysis', r'computational analysis', r'theoretical modeling', 
        r'computational modeling', r'simulated', r'simulation of', r'wxAMPS', r'AMPS-1D', 
        r'PC1D', r'AFORS-HET', r'theoretical optimization', r'computational optimization',
        r'simulated performance', r'theoretical efficiency', r'predicted efficiency',
    ]
    pattern = '|'.join(theory_keywords)
    if text_column not in df.columns: return df, pd.Series(False, index=df.index)
    theory_mask = df[text_column].fillna('').str.contains(pattern, case=False, regex=True, na=False)
    return df[~theory_mask].copy(), theory_mask

In [6]:
def filter_non_solar_cell_papers(df, text_column='data.ref.free_text_comment'):
    if text_column not in df.columns: return df, pd.Series(False, index=df.index)
    def extract_title(text):
        if pd.isna(text) or 'Publication title:' not in str(text): return ''
        try: return text.split('Publication title:')[1].split(',')[0].strip().lower()
        except: return str(text).lower()
    titles = df[text_column].apply(extract_title)
    non_solar_keywords = {
        'LED': [r'\bLED\b', r'light.?emitting diode', r'electroluminescen'],
        'Battery': [r'\bbattery\b', r'energy storage', r'rechargeable'],
        'Photodetector': [r'photodetector', r'X.?ray detector'],
        'Catalyst': [r'catalys', r'water splitting', r'hydrogen evolution'],
        'Other': [r'sensor', r'transistor', r'laser', r'memory', r'thermoelectric', r'capacitor']
    }
    solar_cell_keywords = [r'solar cell', r'photovoltaic', r'\bPV\b', r'\bPSC\b', r'\bPCE\b']
    non_solar_mask = pd.Series(False, index=df.index)
    for patterns in non_solar_keywords.values():
        non_solar_mask |= titles.str.contains('|'.join(patterns), case=False, regex=True, na=False)
    is_solar_cell = titles.str.contains('|'.join(solar_cell_keywords), case=False, regex=True, na=False)
    final_mask = non_solar_mask & ~is_solar_cell
    return df[~final_mask].copy(), final_mask

In [7]:
def filter_review_articles(df, text_column='data.ref.free_text_comment'):
    if text_column not in df.columns: return df, pd.Series(False, index=df.index)
    def extract_title(text):
        if pd.isna(text) or 'Publication title:' not in str(text): return ''
        try: return text.split('Publication title:')[1].split(',')[0].strip()
        except: return ''
    titles = df[text_column].apply(extract_title)
    review_patterns = [
        r'^Review\b', r'^Perspective\b', r'^Overview\b', r'^Outlook\b', r'^Minireview\b',
        r'^Critical [Rr]eview\b', r': [Aa] [Rr]eview\b', r': [Aa] [Pp]erspective\b',
        r'\b[Rr]eview of\b', r'\b[Rr]eview on\b', r'^Progress in\b', r'^Recent [Aa]dvances\b',
        r'^Advances in\b', r'^State of the art\b', r'^Current status\b'
    ]
    pattern = '|'.join(review_patterns)
    review_mask = titles.str.contains(pattern, case=False, regex=True, na=False)
    return df[~review_mask].copy(), review_mask


In [8]:
def filter_experimental_solar_cells(df, text_column='data.ref.free_text_comment', verbose=True):
    df_step1, theory_mask = filter_theory_papers(df, text_column)
    df_step2, non_solar_mask = filter_non_solar_cell_papers(df_step1, text_column)
    df_step3, review_mask = filter_review_articles(df_step2, text_column)
    if verbose:
        print("="*70)
        print("FILTERING SUMMARY")
        print("="*70)
        print(f"Original entries:              {len(df):,}")
        print(f"Theory papers removed:         {theory_mask.sum():,}")
        print(f"Non-solar cell papers removed: {non_solar_mask.sum():,}")
        print(f"Review articles removed:       {review_mask.sum():,}")
        print(f"Total filtered out:            {theory_mask.sum() + non_solar_mask.sum() + review_mask.sum():,}")
        print(f"Experimental research papers:  {len(df_step3):,}")
        print("="*70)
    return df_step3

In [18]:
def cleanup_material_morphology(df, columns_to_clean):
    """
    Cleans material names by removing morphological/processing affixes.

    This function strips common prefixes (c-, mp-), suffixes (-c, -mp, -np),
    and descriptors like dopants (:F4-TCNQ), additives (w/ TBP), and
    formulations (QDs) to leave only the core chemical composition.

    Parameters:
    -----------
    df : pandas.DataFrame
        Input dataframe.
    columns_to_clean : list of str
        A list of column names containing material arrays to be cleaned.

    Returns:
    --------
    df_cleaned : pandas.DataFrame
        A new dataframe with the specified columns cleaned.
    """
    df_cleaned = df.copy()

    def clean_single_material(name):
        if not isinstance(name, str):
            return name
        
        # Remove common suffixes like -c, -mp, -np, -bl, etc.
        cleaned_name = re.sub(r'-[a-zA-Z]+$', '', name)
        # Remove common prefixes like c-, mp-
        cleaned_name = re.sub(r'^[a-z]+-', '', cleaned_name)
        # Remove dopants and additives specified with ":"
        cleaned_name = re.sub(r':.*', '', cleaned_name)
        # Remove additives specified with "w/"
        cleaned_name = re.sub(r'\s*w/.*', '', cleaned_name)
        # Remove anything in parentheses (e.g., treatments)
        cleaned_name = re.sub(r'\s*\(.*\)$', '', cleaned_name)
        # Remove "QDs" or "QD"
        cleaned_name = re.sub(r'\s*QDs?$', '', cleaned_name)
        # Remove "mesoporous layer" and similar descriptors
        cleaned_name = re.sub(r'\s*mesoporous.*', '', cleaned_name)
        
        return cleaned_name.strip()

    def clean_material_array(material_array):
        if not isinstance(material_array, np.ndarray):
            return material_array
        return np.array([clean_single_material(m) for m in material_array])

    print("Cleaning material names to core compositions...")
    for col in columns_to_clean:
        if col in df_cleaned.columns:
            df_cleaned[col] = df_cleaned[col].apply(clean_material_array)
            print(f"  - Column '{col}' cleaned.")
    
    return df_cleaned

columns_to_process = [
    'results.properties.optoelectronic.solar_cell.hole_transport_layer',
    'results.properties.optoelectronic.solar_cell.electron_transport_layer',
    'results.properties.optoelectronic.solar_cell.device_stack'
]
df_composition = cleanup_material_morphology(df, columns_to_process)

Cleaning material names to core compositions...
  - Column 'results.properties.optoelectronic.solar_cell.hole_transport_layer' cleaned.
  - Column 'results.properties.optoelectronic.solar_cell.electron_transport_layer' cleaned.
  - Column 'results.properties.optoelectronic.solar_cell.device_stack' cleaned.


In [24]:
df_composition['results.properties.optoelectronic.solar_cell.hole_transport_layer'].value_counts()

results.properties.optoelectronic.solar_cell.hole_transport_layer
[Spiro]              25197
[PEDOT]               7523
[PTAA]                2618
[none]                2599
[NiO]                 2343
                     ...  
[Spiro, MoOx]            1
[Graphene, Spiro]        1
[C60, BCP]               1
[NiOx, PTAA]             1
[TPDCN]                  1
Name: count, Length: 4015, dtype: int64

In [26]:
df['results.properties.optoelectronic.solar_cell.hole_transport_layer'].value_counts()

results.properties.optoelectronic.solar_cell.hole_transport_layer
[Spiro-OMeTAD]               25138
[PEDOT:PSS]                   7374
[none]                        2599
[PTAA]                        2564
[NiO-c]                       1661
                             ...  
[Pt-np]                          1
[CsPbI3-QD, Spiro-OMeTAD]        1
[PEDOT:PSS, PTAA]                1
[D18-Cl, Cl-2PACz]               1
[TPDCN]                          1
Name: count, Length: 4373, dtype: int64

In [27]:
df_clean = filter_experimental_solar_cells(df_composition)

# Final data preparation steps
df_clean['pub_year'] = pd.to_datetime(df_clean['data.ref.publication_date']).dt.year
df_clean['extraction_method'] = df_clean['data.ref.extraction_method'].fillna('Old')

def get_stack_length(stack):
    if not isinstance(stack, np.ndarray):
        return np.nan
    return len([s for s in stack if s != 'SLG'])

df_clean['stack_length'] = df_clean['results.properties.optoelectronic.solar_cell.device_stack'].apply(get_stack_length)


FILTERING SUMMARY
Original entries:              53,238
Theory papers removed:         517
Non-solar cell papers removed: 26
Review articles removed:       95
Total filtered out:            638
Experimental research papers:  52,600


In [28]:
print(len(df_clean[df_clean['extraction_method']=='Old']), "entries extracted with old method.")
print(len(df_clean[df_clean['extraction_method']=='LLM']), "entries extracted with LLM")

42777 entries extracted with old method.
9823 entries extracted with LLM


In [None]:
nip_data = df_clean[df_clean['results.properties.optoelectronic.solar_cell.device_architecture'] == 'nip']
pin_data = df_clean[df_clean['results.properties.optoelectronic.solar_cell.device_architecture'] == 'pin']

# Group by year and calculate statistics
nip_by_year = nip_data.groupby('pub_year')['stack_length'].agg(['mean', 'std', 'count']).reset_index()
pin_by_year = pin_data.groupby('pub_year')['stack_length'].agg(['mean', 'std', 'count']).reset_index()

# Apply rolling window smoothing (2-year window, centered)
window_size = 2
nip_by_year['mean_smooth'] = nip_by_year['mean'].rolling(window=window_size, center=True, min_periods=1).mean()
pin_by_year['mean_smooth'] = pin_by_year['mean'].rolling(window=window_size, center=True, min_periods=1).mean()

# Calculate regression lines
nip_slope, nip_intercept, nip_r, nip_p, _ = stats.linregress(nip_by_year['pub_year'], nip_by_year['mean'])
pin_slope, pin_intercept, pin_r, pin_p, _ = stats.linregress(pin_by_year['pub_year'], pin_by_year['mean'])

nip_by_year['regression'] = nip_slope * nip_by_year['pub_year'] + nip_intercept
pin_by_year['regression'] = pin_slope * pin_by_year['pub_year'] + pin_intercept

# Create plotly figure
fig = go.Figure()
fig.update_layout(
    autosize=False,
    width=SINGLE_COLUMN_PX,
    height=SINGLE_COLUMN_PX /3  ,
)

# Add NIP smoothed line
fig.add_trace(go.Scatter(
    x=nip_by_year['pub_year'],
    y=nip_by_year['mean_smooth'],
    mode='lines+markers',
    name='NIP (simplifying)',
    line=dict(color='#2E86AB', width=2),
    marker=dict(size=4),
    hovertemplate='<b>NIP</b><br>Year: %{x}<br>Avg Stack Length: %{y:.2f}<extra></extra>'
))

# Add raw data points (smaller, transparent)
fig.add_trace(go.Scatter(
    x=nip_by_year['pub_year'],
    y=nip_by_year['mean'],
    mode='markers',
    name='NIP (raw)',
    marker=dict(size=2, color='#2E86AB', opacity=0.3),
    showlegend=False,
    hovertemplate='<b>NIP (raw)</b><br>Year: %{x}<br>Avg: %{y:.2f}<extra></extra>'
))

# Add regression line
fig.add_trace(go.Scatter(
    x=nip_by_year['pub_year'],
    y=nip_by_year['regression'],
    mode='lines',
    name='NIP trend',      
    line=dict(color='#145374', width=1.5, dash='dash'),
    opacity=0.6,
    showlegend=False,
))

# Add PIN smoothed line
fig.add_trace(go.Scatter(
    x=pin_by_year['pub_year'],
    y=pin_by_year['mean_smooth'],
    mode='lines+markers',
    name='PIN (complexifying)',
    line=dict(color='#F26419', width=2),
    marker=dict(size=4),
    hovertemplate='<b>PIN</b><br>Year: %{x}<br>Avg Stack Length: %{y:.2f}<extra></extra>'
))

# Add raw data points (smaller, transparent)
fig.add_trace(go.Scatter(
    x=pin_by_year['pub_year'],
    y=pin_by_year['mean'],
    mode='markers',
    name='PIN (raw)',
    marker=dict(size=2, color='#F26419', opacity=0.3),
    showlegend=False,
    hovertemplate='<b>PIN (raw)</b><br>Year: %{x}<br>Avg: %{y:.2f}<extra></extra>'
))

# Add regression line
fig.add_trace(go.Scatter(
    x=pin_by_year['pub_year'],
    y=pin_by_year['regression'],
    mode='lines',
    name='PIN trend',      
    line=dict(color='#9C2A00', width=1.5, dash='dash'),
    opacity=0.6,
    showlegend=False,
))

# Update layout
fig.update_layout(
    xaxis_title='Publication Year',
    yaxis_title='Average Device Stack Length',
)

fig.show()

In [30]:
# plot fraction of NIP/PIN over year

fig = go.Figure()
fig.update_layout(
    autosize=False,
    width=SINGLE_COLUMN_PX,
    height=SINGLE_COLUMN_PX/3,
)

relative_nip = nip_by_year['count'] / (nip_by_year['count'] + pin_by_year['count'])

fig.add_trace(go.Scatter(
    x=nip_by_year['pub_year'],
    y=relative_nip,
    mode='lines+markers',
    name='NIP',
    line=dict(color='#2E86AB', width=3),
    marker=dict(size=8, symbol='circle'),
    hovertemplate='<b>Relative NIP Fraction</b><br>Year: %{x}<br>Fraction: %{y:.2f}<extra></extra>'
))

relative_pin= pin_by_year['count'] / (nip_by_year['count'] + pin_by_year['count'])
fig.add_trace(go.Scatter(
    x=pin_by_year['pub_year'],
    y=relative_pin,
    mode='lines+markers',
    name='PIN',
    line=dict(color='#F26419', width=3),
    marker=dict(size=8, symbol='square'),
    hovertemplate='<b>Relative PIN Fraction</b><br>Year: %{x}<br>Fraction: %{y:.2f}<extra></extra>'
))


fig.update_layout(
    xaxis_title='Publication Year',
    yaxis_title='Relative Fraction',
)

In [None]:
# plot relative ratios of ETL/HTL materials over time

