In [1]:
"""
Cross-tabs, Filters, and Quality Flags Analysis
================================================
Author: Ankita Biswas
Project: Public Dashboard for Superconductors
Date: December 2025

This script generates analyses 4 and 5 from the project proposal:
4. Cross-tabs and filters (element presence vs Tc bins, composition complexity)
5. Provenance & quality flags (data quality visualizations)

These outputs are specifically designed for dashboard integration.
"""



In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import warnings
warnings.filterwarnings('ignore')
import os
import ast

# Plot appearance: seaborn "whitegrid" theme plus 600-dpi defaults for both
# displayed and saved figures.
# Note: the plt.savefig calls in this notebook pass dpi=300 explicitly, which
# takes precedence over the 'savefig.dpi' default set here.
sns.set_style("whitegrid")
plt.rcParams.update({'figure.dpi': 600, 'savefig.dpi': 600})

  from pandas.core import (


### Configuration

In [None]:
# Input/output locations.
# The original absolute paths stay as defaults, so existing runs behave exactly
# as before. Set SUPERCON_INPUT_FILE / SUPERCON_OUTPUT_DIR in the environment
# to run the notebook on another machine without editing this cell.
INPUT_FILE = os.environ.get(
    'SUPERCON_INPUT_FILE',
    '/home/digifort/Documents/Data_Management_F25/supercon/feature_engineered_data/superconductors_with_features.csv',
)
OUTPUT_DIR = os.environ.get(
    'SUPERCON_OUTPUT_DIR',
    '/home/digifort/Documents/Data_Management_F25/supercon/analysis_results/',
)
FIGURES_DIR = os.path.join(OUTPUT_DIR, 'figures/')
TABLES_DIR = os.path.join(OUTPUT_DIR, 'tables/')

# Create the output tree up front so later savefig/to_csv calls cannot fail on
# a missing directory; exist_ok=True makes re-running this cell safe.
for directory in (OUTPUT_DIR, FIGURES_DIR, TABLES_DIR):
    os.makedirs(directory, exist_ok=True)

CROSS-TABS AND QUALITY FLAGS ANALYSIS


In [5]:
# Run banner.
print("=" * 80, "CROSS-TABS AND QUALITY FLAGS ANALYSIS", "=" * 80, sep="\n")

print("\nLoading data...")
df = pd.read_csv(INPUT_FILE)


def _parse_elements(cell):
    """Turn the CSV text form of an element collection back into a Python object.

    Non-string cells (for example NaN) are returned unchanged.
    """
    if isinstance(cell, str):
        return ast.literal_eval(cell)
    return cell


# The 'elements' column arrives as text; restore the Python literals.
if 'elements' in df.columns and df['elements'].dtype == 'object':
    df['elements'] = df['elements'].apply(_parse_elements)

print(f"Loaded {len(df):,} records")

print("\n" + "=" * 80, "ANALYSIS 4: CROSS-TABS AND FILTERS", "=" * 80, sep="\n")


print("\n[4.1] Element presence cross-tabulation...")

# Flatten every element of every row into one list. Only cells that are
# Python sets contribute; anything else is skipped.
all_elements = [
    element
    for elem_set in df['elements'].dropna()
    if isinstance(elem_set, set)
    for element in elem_set
]

# The 15 most frequent elements drive all presence columns and cross-tabs.
element_counts = Counter(all_elements)
top_elements = [name for name, _ in element_counts.most_common(15)]

print(f"  Using top 15 elements: {', '.join(top_elements)}")

# One 0/1 indicator column per top element (0 for rows whose cell is not a set).
for elem in top_elements:
    df[f'has_{elem}'] = df['elements'].apply(
        lambda members: int(isinstance(members, set) and elem in members)
    )

# Tc categories. pd.cut uses right-closed intervals by default, so the bins are
# (0, 10], (10, 30], (30, 77] and (77, max + 1]; a Tc of exactly 0 (or NaN)
# receives no category.
tc_upper_edge = df['tc_kelvin'].max() + 1
tc_bins = [0, 10, 30, 77, tc_upper_edge]
tc_labels = ['Very Low (<10K)', 'Low (10-30K)', 'Medium (30-77K)', 'High (>77K)']
df['tc_category'] = pd.cut(df['tc_kelvin'], bins=tc_bins, labels=tc_labels)

# Cross-tab: Tc category vs element presence
print("\nCreating cross-tabulation tables...")

# Table 1: number of materials in each Tc category that contain each element.
# Rows are Tc categories, columns are elements.
crosstab_count = pd.DataFrame()
for elem in top_elements:
    # Rows: Tc category; columns: the 0/1 values of the presence flag.
    counts = df.groupby(['tc_category', f'has_{elem}']).size().unstack(fill_value=0)
    if 1 in counts.columns:
        crosstab_count[elem] = counts[1]  # Only materials that contain the element
    else:
        # Explicit zero column aligned to the category index. Assigning a bare
        # scalar to a still-empty frame would create an empty column instead.
        crosstab_count[elem] = pd.Series(0, index=counts.index)

crosstab_count.to_csv(os.path.join(TABLES_DIR, 'element_tc_crosstab_counts.csv'))
print("Saved: element_tc_crosstab_counts.csv")

print("\n Element presence by Tc category (count of materials):")
print(crosstab_count)

# Table 2: percentage of materials in each Tc category that contain each
# element. Rows are elements, columns are Tc categories; categories with no
# materials are omitted.
pct_by_category = {}
for category in tc_labels:
    cat_df = df[df['tc_category'] == category]
    if len(cat_df) > 0:
        pct_by_category[category] = [
            100 * cat_df[f'has_{elem}'].sum() / len(cat_df)
            for elem in top_elements
        ]

# Building the frame with its index in one step also works when every category
# is empty; assigning the index afterwards raised a length-mismatch error then.
crosstab_pct = pd.DataFrame(pct_by_category, index=top_elements)
crosstab_pct.to_csv(os.path.join(TABLES_DIR, 'element_tc_crosstab_percentages.csv'))
print("Saved: element_tc_crosstab_percentages.csv")

# Heatmap of the percentage table: one row per element, one column per Tc
# category, each cell annotated with its percentage.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(
    crosstab_pct,
    annot=True,
    fmt='.1f',
    cmap='RdYlBu_r',
    cbar_kws={'label': '% of Materials'},
    ax=ax,
)

axis_label_style = {'fontsize': 12, 'fontweight': 'bold'}
ax.set_xlabel('Tc Category', **axis_label_style)
ax.set_ylabel('Element', **axis_label_style)
ax.set_title('Element Prevalence by Tc Category (%)', fontsize=14, fontweight='bold')

plt.tight_layout()
heatmap_path = os.path.join(FIGURES_DIR, '04a_element_tc_heatmap.png')
plt.savefig(heatmap_path, dpi=300, bbox_inches='tight')
plt.close()
print("Saved: 04a_element_tc_heatmap.png")

CROSS-TABS AND QUALITY FLAGS ANALYSIS

Loading data...
Loaded 15,845 records

ANALYSIS 4: CROSS-TABS AND FILTERS

[4.1] Element presence cross-tabulation...
  Using top 15 elements: Cu, O, Ba, Sr, Ca, Y, La, Fe, Bi, As, Nb, Pb, C, Ni, Pr

Creating cross-tabulation tables...
Saved: element_tc_crosstab_counts.csv

 Element presence by Tc category (count of materials):
                   Cu     O    Ba    Sr    Ca     Y   La   Fe   Bi   As   Nb  \
tc_category                                                                    
Very Low (<10K)   487   789   331   331   165   396  832  437  545  296  683   
Low (10-30K)     1473  1689   690   826   385   406  911  855  255  665  492   
Medium (30-77K)  2745  2591  1698  1276  1153  1045  769  533  365  275    7   
High (>77K)      2322  1926  1656   835  1249   909  192   66  534    1    1   

                  Pb    C   Ni   Pr  
tc_category                          
Very Low (<10K)  335  511  550  190  
Low (10-30K)     140  326  197  233 

In [6]:
print("\n[4.2] Composition complexity analysis...")

# Tc summary per element count, rounded to two decimals. Element counts with
# fewer than 5 non-null Tc values are left out of the table and both plots.
complexity_stats = (
    df.groupby('n_elements')['tc_kelvin']
    .agg(['count', 'mean', 'median', 'std', 'min', 'max'])
    .round(2)
)
complexity_stats = complexity_stats.loc[complexity_stats['count'] >= 5]
complexity_stats.to_csv(os.path.join(TABLES_DIR, 'tc_by_complexity.csv'))

print("  Tc statistics by number of elements:")
print(complexity_stats)

# Two panels: mean Tc (with std error bars) on the left, full distributions
# on the right.
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
mean_ax, violin_ax = axes

# Left panel: mean Tc per element count.
mean_ax.bar(
    complexity_stats.index,
    complexity_stats['mean'],
    yerr=complexity_stats['std'],
    capsize=5,
    color='steelblue',
    edgecolor='black',
    alpha=0.7,
)
mean_ax.set_ylabel('Mean Tc (K)', fontsize=11, fontweight='bold')
mean_ax.set_title('Tc vs Compositional Complexity', fontsize=13, fontweight='bold')

# Right panel: one violin per element count that survived the >= 5 filter.
valid_n_elements = complexity_stats.index.tolist()
data_to_plot = [
    df.loc[df['n_elements'] == n, 'tc_kelvin'].dropna()
    for n in valid_n_elements
]
parts = violin_ax.violinplot(
    data_to_plot,
    positions=valid_n_elements,
    widths=0.7,
    showmeans=True,
    showmedians=True,
)
violin_ax.set_ylabel('Tc (K)', fontsize=11, fontweight='bold')
violin_ax.set_title('Tc Distribution by Complexity', fontsize=13, fontweight='bold')

# Decorations shared by both panels: 77 K reference line, x label, legend, grid.
for panel in (mean_ax, violin_ax):
    panel.axhline(77, color='red', linestyle='--', alpha=0.5, label='LN₂ temp (77K)')
    panel.set_xlabel('Number of Elements', fontsize=11, fontweight='bold')
    panel.legend()
    panel.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.savefig(os.path.join(FIGURES_DIR, '04b_complexity_vs_tc.png'), dpi=300, bbox_inches='tight')
plt.close()
print("Saved: 04b_complexity_vs_tc.png")


[4.2] Composition complexity analysis...
  Tc statistics by number of elements:
            count   mean  median    std    min     max
n_elements                                            
1              60   4.72    2.92   6.99   0.02   47.83
2            2483   4.86    3.37  10.54   0.01  294.00
3            3483   8.46    5.15   9.80   0.01  132.00
4            2907  25.27   13.65  29.25   0.03  287.00
5            3968  45.99   38.00  31.95   0.01  135.80
6            1988  54.43   55.40  31.80   0.20  143.00
7             751  67.93   71.30  29.54   0.40  133.00
8             192  79.08   85.48  30.49   6.00  136.00
9              13  82.48   79.80  26.38  45.30  123.50
Saved: 04b_complexity_vs_tc.png


In [7]:
print("\n[4.3] Composition keyword analysis...")

# Named element combinations to report on. Each value lists the requirements a
# material must meet: a plain string is a required element, while a nested
# list means "at least one of these elements".
element_combinations = {
    'Cu+O (Cuprates)': ['Cu', 'O'],
    'Fe+As/P (Pnictides)': ['Fe', ['As', 'P']],
    'Hg+Cu+O': ['Hg', 'Cu', 'O'],
    'Nb-based': ['Nb'],
    'Mg+B (MgB2-type)': ['Mg', 'B'],
    'La+Cu+O': ['La', 'Cu', 'O'],
    'Y+Ba+Cu+O': ['Y', 'Ba', 'Cu', 'O'],
    'Bi+Sr+Ca+Cu+O': ['Bi', 'Sr', 'Ca', 'Cu', 'O'],
    'Tl+Ba+Ca+Cu+O': ['Tl', 'Ba', 'Ca', 'Cu', 'O']
}

def has_element_combination(elements, combo):
    """Return True when `elements` satisfies every requirement in `combo`.

    elements: the material's elements; anything that is not a `set`
        (e.g. NaN or a list) yields False.
    combo: iterable of requirements. A list entry is satisfied by any one of
        its members; any other entry must itself be in `elements`.
    """
    if not isinstance(elements, set):
        return False

    return all(
        any(option in elements for option in requirement)
        if isinstance(requirement, list)
        else requirement in elements
        for requirement in combo
    )

# Per-combination Tc statistics. Combinations that match no material are
# left out of the table.
combo_stats = []
for combo_name, combo_elements in element_combinations.items():
    mask = df['elements'].apply(lambda x: has_element_combination(x, combo_elements))
    subset = df[mask]

    if len(subset) > 0:
        combo_stats.append({
            'combination': combo_name,
            'count': len(subset),
            'mean_tc': subset['tc_kelvin'].mean(),
            'median_tc': subset['tc_kelvin'].median(),
            'max_tc': subset['tc_kelvin'].max(),
            # Share of matching materials with Tc above 77 K.
            'high_tc_fraction': (subset['tc_kelvin'] > 77).sum() / len(subset)
        })

# Explicit column names keep sort_values working even when nothing matched
# (an empty record list would otherwise give a frame without 'mean_tc').
combo_columns = ['combination', 'count', 'mean_tc', 'median_tc', 'max_tc', 'high_tc_fraction']
combo_df = pd.DataFrame(combo_stats, columns=combo_columns).sort_values('mean_tc', ascending=False)
combo_df.to_csv(os.path.join(TABLES_DIR, 'keyword_combinations.csv'), index=False)

print("  Composition keyword statistics:")
print(combo_df.to_string(index=False))

# Plot
fig, ax = plt.subplots(figsize=(12, 7))
x_pos = np.arange(len(combo_df))
bars = ax.barh(x_pos, combo_df['mean_tc'], color='forestgreen', edgecolor='black', alpha=0.7)

# Recolour each bar by its mean Tc, min-max scaled onto the colormap. When the
# range is zero (one combination, or all means equal) the scaling would divide
# by zero and produce NaN colours, so fall back to the colormap midpoint.
mean_tc_range = combo_df['mean_tc'].max() - combo_df['mean_tc'].min()
if mean_tc_range > 0:
    scaled_tc = (combo_df['mean_tc'] - combo_df['mean_tc'].min()) / mean_tc_range
else:
    scaled_tc = pd.Series(0.5, index=combo_df.index)
colors = plt.cm.RdYlBu_r(scaled_tc.to_numpy())
for bar, color in zip(bars, colors):
    bar.set_facecolor(color)

ax.set_yticks(x_pos)
ax.set_yticklabels(combo_df['combination'], fontsize=10)
ax.set_xlabel('Mean Tc (K)', fontsize=11, fontweight='bold')
ax.set_title('Mean Tc by Composition Keywords', fontsize=13, fontweight='bold')
ax.axvline(77, color='red', linestyle='--', alpha=0.5, label='LN₂ temp (77K)')
ax.legend()
ax.grid(True, alpha=0.3, axis='x')
plt.tight_layout()
plt.savefig(os.path.join(FIGURES_DIR, '04c_keyword_combinations.png'), dpi=300, bbox_inches='tight')
plt.close()
print(" Saved: 04c_keyword_combinations.png")


[4.3] Composition keyword analysis...
  Composition keyword statistics:
        combination  count   mean_tc  median_tc  max_tc  high_tc_fraction
      Tl+Ba+Ca+Cu+O    124 99.031113     108.00   125.5          0.854839
            Hg+Cu+O    237 90.149958      91.60   143.0          0.700422
      Bi+Sr+Ca+Cu+O    616 79.873164      83.90   136.0          0.626623
          Y+Ba+Cu+O   1609 66.090862      73.00   127.0          0.436917
    Cu+O (Cuprates)   5916 56.543981      58.00   275.0          0.324882
            La+Cu+O   1591 37.754881      30.50    99.0          0.106223
   Mg+B (MgB2-type)    192 30.191968      34.00    41.4          0.000000
Fe+As/P (Pnictides)   1150 21.867391      19.85   287.0          0.000870
           Nb-based   1183 10.106912       8.87    77.3          0.000845
 Saved: 04c_keyword_combinations.png


In [9]:
print("\n[4.4] Temporal trends analysis...")

if 'publication_year' in df.columns:
    # Keep rows whose year lies in 1900-2025 inclusive (NaN years drop out).
    year_in_range = (df['publication_year'] >= 1900) & (df['publication_year'] <= 2025)
    valid_years = df[year_in_range]

    # One row per year: Tc count/mean/median/max plus the number of rows
    # flagged is_high_tc.
    year_stats = (
        valid_years
        .groupby('publication_year')
        .agg(
            count=('tc_kelvin', 'count'),
            mean_tc=('tc_kelvin', 'mean'),
            median_tc=('tc_kelvin', 'median'),
            max_tc=('tc_kelvin', 'max'),
            high_tc_count=('is_high_tc', 'sum'),
        )
        .reset_index()
        .rename(columns={'publication_year': 'year'})
    )
    year_stats['high_tc_fraction'] = year_stats['high_tc_count'] / year_stats['count']

    # Years with fewer than 5 records are excluded from the table and plots.
    year_stats = year_stats[year_stats['count'] >= 5]
    year_stats.to_csv(os.path.join(TABLES_DIR, 'temporal_trends.csv'), index=False)

    # Three stacked panels sharing the same year axis values.
    fig, axes = plt.subplots(3, 1, figsize=(14, 12))
    count_ax, tc_ax, fraction_ax = axes
    years = year_stats['year']

    # Panel 1: number of records per year.
    count_ax.bar(years, year_stats['count'],
                 color='steelblue', edgecolor='black', alpha=0.7)
    count_ax.set_ylabel('Number of Materials', fontsize=11, fontweight='bold')
    count_ax.set_title('Superconductor Discoveries Over Time', fontsize=13, fontweight='bold')
    count_ax.grid(True, alpha=0.3, axis='y')

    # Panel 2: yearly maximum and mean Tc against the 77 K reference line.
    tc_ax.plot(years, year_stats['max_tc'],
               'o-', linewidth=2, markersize=6, color='red', label='Max Tc')
    tc_ax.plot(years, year_stats['mean_tc'],
               's-', linewidth=2, markersize=5, color='blue', alpha=0.7, label='Mean Tc')
    tc_ax.axhline(77, color='green', linestyle='--', alpha=0.5, label='LN₂ temp (77K)')
    tc_ax.set_ylabel('Tc (K)', fontsize=11, fontweight='bold')
    tc_ax.set_title('Evolution of Tc Records', fontsize=13, fontweight='bold')
    tc_ax.legend()
    tc_ax.grid(True, alpha=0.3)

    # Panel 3: percentage of each year's records flagged as high-Tc.
    high_tc_pct = year_stats['high_tc_fraction'] * 100
    fraction_ax.fill_between(years, high_tc_pct, alpha=0.5, color='purple')
    fraction_ax.plot(years, high_tc_pct, linewidth=2, color='purple')
    fraction_ax.set_xlabel('Publication Year', fontsize=11, fontweight='bold')
    fraction_ax.set_ylabel('High-Tc Materials (%)', fontsize=11, fontweight='bold')
    fraction_ax.set_title('Fraction of High-Tc Materials (>77K)', fontsize=13, fontweight='bold')
    fraction_ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.savefig(os.path.join(FIGURES_DIR, '04d_temporal_trends.png'), dpi=300, bbox_inches='tight')
    plt.close()
    print("Saved: 04d_temporal_trends.png")


[4.4] Temporal trends analysis...
Saved: 04d_temporal_trends.png


In [11]:
print("\n[5.1] Quality tier analysis...")

if 'quality_tier' in df.columns:
    # Tc summary per quality tier, rounded to two decimals.
    tier_stats = (
        df.groupby('quality_tier')['tc_kelvin']
        .agg(['count', 'mean', 'median', 'std', 'min', 'max'])
        .round(2)
    )
    tier_stats.columns = ['count', 'mean_tc', 'median_tc', 'std_tc', 'min_tc', 'max_tc']
    tier_stats.to_csv(os.path.join(TABLES_DIR, 'quality_tier_stats.csv'))

    print("  Quality tier statistics:")
    print(tier_stats)

    # Left: share of records per tier. Right: mean Tc per tier with std bars.
    fig, axes = plt.subplots(1, 2, figsize=(14, 6))
    pie_ax, bar_ax = axes

    # Pie slices follow value_counts order (most frequent tier first).
    tier_counts = df['quality_tier'].value_counts()
    slice_colors = plt.cm.Set3(range(len(tier_counts)))
    pie_ax.pie(
        tier_counts.values,
        labels=tier_counts.index,
        autopct='%1.1f%%',
        startangle=90,
        colors=slice_colors,
    )
    pie_ax.set_title('Distribution of Quality Tiers', fontsize=13, fontweight='bold')

    # Bars follow the groupby order of tier_stats, placed at integer positions.
    tier_positions = range(len(tier_stats))
    bar_ax.bar(
        tier_positions,
        tier_stats['mean_tc'],
        yerr=tier_stats['std_tc'],
        capsize=5,
        color='steelblue',
        edgecolor='black',
        alpha=0.7,
    )
    bar_ax.set_xticks(tier_positions)
    bar_ax.set_xticklabels(tier_stats.index, rotation=45, ha='right')
    bar_ax.set_ylabel('Mean Tc (K)', fontsize=11, fontweight='bold')
    bar_ax.set_title('Mean Tc by Quality Tier', fontsize=13, fontweight='bold')
    bar_ax.grid(True, alpha=0.3, axis='y')

    plt.tight_layout()
    plt.savefig(os.path.join(FIGURES_DIR, '05a_quality_tiers.png'), dpi=300, bbox_inches='tight')
    plt.close()
    print("Saved: 05a_quality_tiers.png")


[5.1] Quality tier analysis...
  Quality tier statistics:
                count  mean_tc  median_tc  std_tc  min_tc  max_tc
quality_tier                                                     
tier1_strict    14531    29.77       14.5   32.66    0.01   294.0
tier2_standard   1314    30.95       13.3   34.54    0.06   287.0
Saved: 05a_quality_tiers.png


In [12]:
print("\n[5.2] Data quality flags summary...")

# Each flag is either the name of an existing flag column (counted via its
# sum) or a (source column, predicate) pair whose predicate is applied to that
# column and then counted. Both forms go through the same column-existence
# check below, so a missing source column gives a count of 0 instead of a
# KeyError.
# NOTE(review): 'Low Uncertainty' counts every row with tc_std < 1, while the
# [5.3] cell applies the same threshold only to rows with n_measurements > 1
# (14,696 vs 1,726 in the recorded outputs) - confirm which is intended.
quality_flags = {
    'Oxygen Variability': 'has_oxygen_var',
    'Duplicate Formula': 'is_duplicate_formula',
    'High-Tc (>77K)': 'is_high_tc',
    'Multiple Measurements': ('n_measurements', lambda col: col > 1),
    'Low Uncertainty': ('tc_std', lambda col: col < 1),
    'Recent Discovery (>2000)': ('publication_year', lambda col: col > 2000)
}

flag_summary = []
for flag_name, flag_spec in quality_flags.items():
    if isinstance(flag_spec, tuple):
        source_col, predicate = flag_spec
    else:
        source_col, predicate = flag_spec, None

    if source_col not in df.columns:
        count = 0
    elif predicate is None:
        count = df[source_col].sum()
    else:
        count = predicate(df[source_col]).sum()

    # Guard the percentage against an empty frame.
    pct = 100 * count / len(df) if len(df) else 0.0
    flag_summary.append({
        'flag': flag_name,
        'count': count,
        'percentage': pct
    })

flag_df = pd.DataFrame(flag_summary)
flag_df.to_csv(os.path.join(TABLES_DIR, 'quality_flags_summary.csv'), index=False)

print("  Quality flags summary:")
print(flag_df.to_string(index=False))

# Visualization: one horizontal bar per flag, length = percentage of records.
fig, ax = plt.subplots(figsize=(12, 7))
colors = plt.cm.Set3(range(len(flag_df)))
bars = ax.barh(range(len(flag_df)), flag_df['percentage'], color=colors, edgecolor='black')

# Annotate each bar with its raw count. The column is read directly rather
# than through itertuples(), where a field named 'count' shadows tuple.count.
for bar, flag_count in zip(bars, flag_df['count']):
    ax.text(bar.get_width() + 1, bar.get_y() + bar.get_height() / 2,
            f'{flag_count:,}', ha='left', va='center', fontweight='bold')

ax.set_yticks(range(len(flag_df)))
ax.set_yticklabels(flag_df['flag'], fontsize=11)
ax.set_xlabel('Percentage of Materials (%)', fontsize=11, fontweight='bold')
ax.set_title('Data Quality Flags Distribution', fontsize=13, fontweight='bold')
ax.grid(True, alpha=0.3, axis='x')
plt.tight_layout()
plt.savefig(os.path.join(FIGURES_DIR, '05b_quality_flags.png'), dpi=300, bbox_inches='tight')
plt.close()
print("Saved: 05b_quality_flags.png")


[5.2] Data quality flags summary...
  Quality flags summary:
                    flag  count  percentage
      Oxygen Variability    447    2.821079
       Duplicate Formula   2881   18.182392
          High-Tc (>77K)   2346   14.805932
   Multiple Measurements   2875   18.144525
         Low Uncertainty  14696   92.748501
Recent Discovery (>2000)   4693   29.618176
Saved: 05b_quality_flags.png


In [13]:
print("\n[5.3] Measurement reliability analysis...")

if 'n_measurements' in df.columns and 'tc_std' in df.columns:
    # Materials with multiple measurements
    multi_meas = df[df['n_measurements'] > 1]

    reliability_stats = {
        'Total materials': len(df),
        'With multiple measurements': len(multi_meas),
        'Percentage': 100 * len(multi_meas) / len(df),
        'Mean measurements': df['n_measurements'].mean(),
        'Max measurements': df['n_measurements'].max(),
        'Mean Tc std (multi-meas)': multi_meas['tc_std'].mean(),
        'Materials with Tc std < 1K': (multi_meas['tc_std'] < 1).sum()
    }

    print("  Measurement reliability:")
    for key, value in reliability_stats.items():
        # Choose the format from the value's type, not from the key text.
        # Matching on the substring 'std' printed the integer count
        # 'Materials with Tc std < 1K' as a float ("1726.00").
        if isinstance(value, (int, np.integer)):
            print(f"    {key}: {value:,}")
        else:
            print(f"    {key}: {value:.2f}")

    # Plot: Tc std vs n_measurements
    fig, axes = plt.subplots(1, 2, figsize=(14, 6))

    # Scatter plot: one point per multi-measurement material.
    axes[0].scatter(multi_meas['n_measurements'], multi_meas['tc_std'],
                   alpha=0.3, s=30, edgecolors='k', linewidth=0.5)
    axes[0].set_xlabel('Number of Measurements', fontsize=11, fontweight='bold')
    axes[0].set_ylabel('Tc Standard Deviation (K)', fontsize=11, fontweight='bold')
    axes[0].set_title('Measurement Variability', fontsize=13, fontweight='bold')
    axes[0].grid(True, alpha=0.3)

    # Distribution of Tc std. Missing values are dropped first because the
    # histogram cannot derive a bin range from data containing NaN.
    axes[1].hist(multi_meas['tc_std'].dropna(), bins=50, edgecolor='black', alpha=0.7, color='coral')
    axes[1].axvline(1, color='red', linestyle='--', linewidth=2, label='1K threshold')
    axes[1].set_xlabel('Tc Standard Deviation (K)', fontsize=11, fontweight='bold')
    axes[1].set_ylabel('Frequency', fontsize=11, fontweight='bold')
    axes[1].set_title('Distribution of Measurement Uncertainty', fontsize=13, fontweight='bold')
    axes[1].legend()
    axes[1].grid(True, alpha=0.3)

    plt.tight_layout()
    plt.savefig(os.path.join(FIGURES_DIR, '05c_measurement_reliability.png'), dpi=300, bbox_inches='tight')
    plt.close()
    print("Saved: 05c_measurement_reliability.png")


[5.3] Measurement reliability analysis...
  Measurement reliability:
    Total materials: 15,845
    With multiple measurements: 2,875
    Percentage: 18.14
    Mean measurements: 1.49
    Max measurements: 151
    Mean Tc std (multi-meas): 2.77
    Materials with Tc std < 1K: 1726.00
Saved: 05c_measurement_reliability.png


In [15]:
print("\n[5.4] Creating data quality badge system...")

def assign_quality_badge(row):
    """Score one record's data quality and list the badges it earned.

    Parameters
    ----------
    row : mapping-like
        Anything exposing ``.get(key, default)``, e.g. a DataFrame row or a
        plain dict. Keys read: quality_tier, has_oxygen_var, n_measurements,
        tc_std, is_high_tc, publication_year. A missing key never earns the
        corresponding badge.

    Returns
    -------
    tuple
        ``(score, badges)``: ``score`` is the integer sum of the points below
        and ``badges`` is the badge names joined with ', ', or the string
        'None' when no badge applies.

    Points: Tier1 +3 or Tier2 +2, StableFormula +1, ReliableMeasurement +2,
    HighTc +1, Modern +1.
    """
    score = 0
    badges = []

    # Quality tier: strict tier outranks standard tier.
    tier = row.get('quality_tier')
    if tier == 'tier1_strict':
        score += 3
        badges.append('Tier1')
    elif tier == 'tier2_standard':
        score += 2
        badges.append('Tier2')

    # No oxygen variability. A missing key defaults to True and NaN is truthy,
    # so an unknown value does not earn the badge.
    if not row.get('has_oxygen_var', True):
        score += 1
        badges.append('StableFormula')

    # Multiple measurements with low uncertainty (std below 2).
    if row.get('n_measurements', 1) > 1 and row.get('tc_std', 999) < 2:
        score += 2
        badges.append('ReliableMeasurement')

    # High-Tc material. NaN is truthy in Python, so a missing flag has to be
    # excluded explicitly; otherwise it would be awarded the badge.
    high_tc = row.get('is_high_tc', False)
    if pd.notna(high_tc) and high_tc:
        score += 1
        badges.append('HighTc')

    # Recent discovery (comparisons against NaN are False, so no badge).
    if row.get('publication_year', 0) > 2000:
        score += 1
        badges.append('Modern')

    return score, (', '.join(badges) if badges else 'None')

# Score every row (assign_quality_badge runs once per row) and split the
# resulting (score, badges) pairs into two new columns.
row_scores, row_badges = zip(*df.apply(assign_quality_badge, axis=1))
df['quality_score'] = row_scores
df['quality_badges'] = row_badges

# Number of records at each score, ordered by score.
badge_dist = df['quality_score'].value_counts().sort_index()
print("  Quality score distribution:")
print(badge_dist)

# Export the per-record badge table.
export_columns = ['data_number', 'chemical_formula', 'tc_kelvin',
                  'quality_tier', 'quality_score', 'quality_badges']
badge_export = df[export_columns].copy()
badge_export.to_csv(os.path.join(TABLES_DIR, 'quality_badge_system.csv'), index=False)
print("Saved: quality_badge_system.csv")

# Left: how many records per score. Right: mean Tc per score with std bars.
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
dist_ax, tc_ax = axes

dist_ax.bar(badge_dist.index, badge_dist.values,
            color='steelblue', edgecolor='black', alpha=0.7)
dist_ax.set_ylabel('Number of Materials', fontsize=11, fontweight='bold')
dist_ax.set_title('Quality Score Distribution', fontsize=13, fontweight='bold')

score_tc = df.groupby('quality_score')['tc_kelvin'].agg(['mean', 'std', 'count'])
tc_ax.bar(score_tc.index, score_tc['mean'], yerr=score_tc['std'],
          capsize=5, color='coral', edgecolor='black', alpha=0.7)
tc_ax.set_ylabel('Mean Tc (K)', fontsize=11, fontweight='bold')
tc_ax.set_title('Mean Tc by Quality Score', fontsize=13, fontweight='bold')

# Settings common to both panels.
for panel in (dist_ax, tc_ax):
    panel.set_xlabel('Quality Score', fontsize=11, fontweight='bold')
    panel.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.savefig(os.path.join(FIGURES_DIR, '05d_quality_badges.png'), dpi=300, bbox_inches='tight')
plt.close()
print("Saved: 05d_quality_badges.png")


[5.4] Creating data quality badge system...
  Quality score distribution:
quality_score
2     280
3     165
4    7631
5    6098
6    1102
7     561
8       8
Name: count, dtype: int64
Saved: quality_badge_system.csv
Saved: 05d_quality_badges.png


In [16]:
# NOTE(review): this cell repeats the [5.3] measurement-reliability cell that
# appears earlier in the notebook and rewrites the same
# 05c_measurement_reliability.png figure; consider deleting one of the copies.
print("\n[5.3] Measurement reliability analysis...")

if 'n_measurements' in df.columns and 'tc_std' in df.columns:
    # Materials with multiple measurements
    multi_meas = df[df['n_measurements'] > 1]

    reliability_stats = {
        'Total materials': len(df),
        'With multiple measurements': len(multi_meas),
        'Percentage': 100 * len(multi_meas) / len(df),
        'Mean measurements': df['n_measurements'].mean(),
        'Max measurements': df['n_measurements'].max(),
        'Mean Tc std (multi-meas)': multi_meas['tc_std'].mean(),
        'Materials with Tc std < 1K': (multi_meas['tc_std'] < 1).sum()
    }

    print("  Measurement reliability:")
    for key, value in reliability_stats.items():
        # Choose the format from the value's type, not from the key text.
        # Matching on the substring 'std' printed the integer count
        # 'Materials with Tc std < 1K' as a float ("1726.00").
        if isinstance(value, (int, np.integer)):
            print(f"    {key}: {value:,}")
        else:
            print(f"    {key}: {value:.2f}")

    # Plot: Tc std vs n_measurements
    fig, axes = plt.subplots(1, 2, figsize=(14, 6))

    # Scatter plot: one point per multi-measurement material.
    axes[0].scatter(multi_meas['n_measurements'], multi_meas['tc_std'],
                   alpha=0.3, s=30, edgecolors='k', linewidth=0.5)
    axes[0].set_xlabel('Number of Measurements', fontsize=11, fontweight='bold')
    axes[0].set_ylabel('Tc Standard Deviation (K)', fontsize=11, fontweight='bold')
    axes[0].set_title('Measurement Variability', fontsize=13, fontweight='bold')
    axes[0].grid(True, alpha=0.3)

    # Distribution of Tc std. Missing values are dropped first because the
    # histogram cannot derive a bin range from data containing NaN.
    axes[1].hist(multi_meas['tc_std'].dropna(), bins=50, edgecolor='black', alpha=0.7, color='coral')
    axes[1].axvline(1, color='red', linestyle='--', linewidth=2, label='1K threshold')
    axes[1].set_xlabel('Tc Standard Deviation (K)', fontsize=11, fontweight='bold')
    axes[1].set_ylabel('Frequency', fontsize=11, fontweight='bold')
    axes[1].set_title('Distribution of Measurement Uncertainty', fontsize=13, fontweight='bold')
    axes[1].legend()
    axes[1].grid(True, alpha=0.3)

    plt.tight_layout()
    plt.savefig(os.path.join(FIGURES_DIR, '05c_measurement_reliability.png'), dpi=300, bbox_inches='tight')
    plt.close()
    print("Saved: 05c_measurement_reliability.png")


[5.3] Measurement reliability analysis...
  Measurement reliability:
    Total materials: 15,845
    With multiple measurements: 2,875
    Percentage: 18.14
    Mean measurements: 1.49
    Max measurements: 151
    Mean Tc std (multi-meas): 2.77
    Materials with Tc std < 1K: 1726.00
Saved: 05c_measurement_reliability.png


In [17]:
# NOTE(review): this cell redefines assign_quality_badge, which is already
# defined in an earlier [5.4] cell of this notebook; the later definition
# silently replaces the earlier one. Consider keeping a single copy.
print("\n[5.4] Creating data quality badge system...")

def assign_quality_badge(row):
    """Score one record's data quality and list the badges it earned.

    Parameters
    ----------
    row : mapping-like
        Anything exposing ``.get(key, default)``, e.g. a DataFrame row or a
        plain dict. Keys read: quality_tier, has_oxygen_var, n_measurements,
        tc_std, is_high_tc, publication_year. A missing key never earns the
        corresponding badge.

    Returns
    -------
    tuple
        ``(score, badges)``: ``score`` is the integer sum of the points below
        and ``badges`` is the badge names joined with ', ', or the string
        'None' when no badge applies.

    Points: Tier1 +3 or Tier2 +2, StableFormula +1, ReliableMeasurement +2,
    HighTc +1, Modern +1.
    """
    score = 0
    badges = []

    # Quality tier: strict tier outranks standard tier.
    tier = row.get('quality_tier')
    if tier == 'tier1_strict':
        score += 3
        badges.append('Tier1')
    elif tier == 'tier2_standard':
        score += 2
        badges.append('Tier2')

    # No oxygen variability. A missing key defaults to True and NaN is truthy,
    # so an unknown value does not earn the badge.
    if not row.get('has_oxygen_var', True):
        score += 1
        badges.append('StableFormula')

    # Multiple measurements with low uncertainty (std below 2).
    if row.get('n_measurements', 1) > 1 and row.get('tc_std', 999) < 2:
        score += 2
        badges.append('ReliableMeasurement')

    # High-Tc material. NaN is truthy in Python, so a missing flag has to be
    # excluded explicitly; otherwise it would be awarded the badge.
    high_tc = row.get('is_high_tc', False)
    if pd.notna(high_tc) and high_tc:
        score += 1
        badges.append('HighTc')

    # Recent discovery (comparisons against NaN are False, so no badge).
    if row.get('publication_year', 0) > 2000:
        score += 1
        badges.append('Modern')

    return score, (', '.join(badges) if badges else 'None')

# Score every row (assign_quality_badge runs once per row) and split the
# resulting (score, badges) pairs into two new columns.
row_scores, row_badges = zip(*df.apply(assign_quality_badge, axis=1))
df['quality_score'] = row_scores
df['quality_badges'] = row_badges

# Number of records at each score, ordered by score.
badge_dist = df['quality_score'].value_counts().sort_index()
print("Quality score distribution:")
print(badge_dist)

# Export the per-record badge table.
export_columns = ['data_number', 'chemical_formula', 'tc_kelvin',
                  'quality_tier', 'quality_score', 'quality_badges']
badge_export = df[export_columns].copy()
badge_export.to_csv(os.path.join(TABLES_DIR, 'quality_badge_system.csv'), index=False)
print("Saved: quality_badge_system.csv")

# Left: how many records per score. Right: mean Tc per score with std bars.
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
dist_ax, tc_ax = axes

dist_ax.bar(badge_dist.index, badge_dist.values,
            color='steelblue', edgecolor='black', alpha=0.7)
dist_ax.set_ylabel('Number of Materials', fontsize=11, fontweight='bold')
dist_ax.set_title('Quality Score Distribution', fontsize=13, fontweight='bold')

score_tc = df.groupby('quality_score')['tc_kelvin'].agg(['mean', 'std', 'count'])
tc_ax.bar(score_tc.index, score_tc['mean'], yerr=score_tc['std'],
          capsize=5, color='coral', edgecolor='black', alpha=0.7)
tc_ax.set_ylabel('Mean Tc (K)', fontsize=11, fontweight='bold')
tc_ax.set_title('Mean Tc by Quality Score', fontsize=13, fontweight='bold')

# Settings common to both panels.
for panel in (dist_ax, tc_ax):
    panel.set_xlabel('Quality Score', fontsize=11, fontweight='bold')
    panel.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.savefig(os.path.join(FIGURES_DIR, '05d_quality_badges.png'), dpi=300, bbox_inches='tight')
plt.close()
print("Saved: 05d_quality_badges.png")


[5.4] Creating data quality badge system...
Quality score distribution:
quality_score
2     280
3     165
4    7631
5    6098
6    1102
7     561
8       8
Name: count, dtype: int64
Saved: quality_badge_system.csv
Saved: 05d_quality_badges.png
