<a href="https://colab.research.google.com/github/Ankur-gitbit/PhD_Codes/blob/main/AFM_forcespec_data_extractor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# %% [markdown]
# # AFM Force Spectroscopy Data Processor - Fresh Start
#
# This notebook processes multiple TSV files from AFM force spectroscopy experiments on hydrogels and combines them into an organized Excel file.
# **NOTE: This notebook will CLEAR all previously generated files before starting, including every `.xlsx`, `.png` and `.pdf` file in the current working directory.**
#

# %% [markdown]
# ## 0. CLEAR ALL PREVIOUS FILES (START FRESH)

# %%
import os
import glob
import shutil
from google.colab import files
import pandas as pd
import numpy as np
import zipfile
import io
import warnings
from tqdm import tqdm
import matplotlib.pyplot as plt
from datetime import datetime
import seaborn as sns

warnings.filterwarnings('ignore')

def clear_previous_files():
    """Remove previously generated files and directories from the working directory.

    Everything in the current working directory that matches one of the
    patterns below is deleted: spreadsheets, images, PDFs, prefixed result
    files and the upload/output directories. Removal is best-effort: a path
    that cannot be deleted does not abort the cleanup, but it is listed in the
    final summary instead of being silently ignored.
    """
    print("üî¥ CLEARING ALL PREVIOUS FILES...")

    file_patterns = ['*.xlsx', '*.png', '*.pdf', 'AFM_*', 'collagen_*', 'hydrogel_*', 'results_*']
    # 'uploaded_files*' also covers the per-project 'uploaded_files_<name>'
    # directories, which a fixed list of names would miss.
    directory_patterns = ['uploaded_files*', 'processed_data', 'output']

    matched_paths = []
    for pattern in file_patterns + directory_patterns:
        matched_paths.extend(glob.glob(pattern))

    files_removed = 0
    directories_removed = 0
    failures = []

    # dict.fromkeys de-duplicates paths matched by several patterns, keeping order
    for path in dict.fromkeys(matched_paths):
        if not os.path.lexists(path):
            continue

        # A pattern such as 'results_*' can match a directory as well as a file,
        # so pick the removal call from what is actually on disk.
        is_directory = os.path.isdir(path) and not os.path.islink(path)
        try:
            if is_directory:
                shutil.rmtree(path)
            else:
                os.remove(path)
        except OSError as e:
            failures.append((path, e))
            continue

        if is_directory:
            print(f"  Removed directory: {path}")
            directories_removed += 1
        else:
            print(f"  Removed file: {path}")
            files_removed += 1

    # Clear the cell output when running inside a notebook; skip otherwise
    try:
        from IPython.display import clear_output
    except ImportError:
        pass
    else:
        clear_output(wait=True)

    print(f"\n‚úÖ Cleanup complete!")
    print(f"   Removed {files_removed} files and {directories_removed} directories")
    # Reported after clear_output so the warnings are not wiped with the progress lines
    for path, error in failures:
        print(f"   Could not remove {path}: {error}")
    print("=" * 60)
    print("You can now upload NEW hydrogel sample data.")
    print("=" * 60)
    print("\n")

# Run the cleanup immediately so this cell leaves the working directory free of
# earlier outputs before any new data is uploaded.
clear_previous_files()

# %% [markdown]
# ## 1. Install Required Libraries

# %%
# Install the third-party packages used by this notebook (-q keeps pip quiet).
# NOTE(review): tqdm and seaborn are already imported in the first code cell, so
# that cell only works if they are preinstalled — consider running this step first.
!pip install pandas openpyxl tqdm seaborn -q

# %% [markdown]
# ## 2. Get Project/Sample Name from User

# %%
def get_project_name():
    """Prompt for a project/sample name and return a filename-safe version of it.

    The entered text is stripped and reduced to alphanumeric characters,
    underscores and hyphens. If nothing is entered, or nothing usable remains
    after sanitizing (e.g. the input was only punctuation), the default name
    is used so output files never get an empty prefix.

    Returns:
        str: The sanitized, non-empty project name.
    """
    default_name = "hydrogel_AFM"

    print("üìù ENTER PROJECT/SAMPLE NAME")
    print("-" * 40)
    print("This name will be used as a prefix for all output files.")
    print("Examples: 'collagen', 'PEG_10percent', 'alginate_UVcrosslinked'")
    print("-" * 40)

    project_name = input("Enter project name: ").strip()

    if not project_name:
        project_name = default_name
        print(f"‚ö†Ô∏è  No name entered. Using default: '{project_name}'")

    # Keep only characters that are safe in file names
    project_name_clean = "".join(c for c in project_name if c.isalnum() or c in ('_', '-'))

    if not project_name_clean:
        # Every character was filtered out; an empty prefix would produce
        # output files named "_AFM_Results_...".
        project_name_clean = default_name
        print(f"WARNING: '{project_name}' contains no usable characters. Using default: '{project_name_clean}'")

    print(f"\n‚úÖ Project name set to: '{project_name_clean}'")
    print(f"   All output files will start with: {project_name_clean}_*")

    return project_name_clean

# Ask for the project name once; later cells read this module-level value to
# name the upload directory and the output files.
project_name = get_project_name()

# %% [markdown]
# ## 3. Upload Your TSV Files (NEW SAMPLES)

# %%
def upload_files():
    """Upload data files through the Colab file picker and store them on disk.

    Zip archives are extracted into the upload directory and every other
    uploaded file is saved there as-is, so a mixed upload (archives plus loose
    files) keeps all of its data. The upload directory is named after the
    module-level ``project_name`` and is recreated empty on each call.

    Returns:
        str | None: Path of the upload directory, or None when nothing was
        uploaded.
    """
    print(f"\nüì§ UPLOAD TSV FILES FOR '{project_name}' SAMPLES")
    print("-" * 50)
    print("Please upload your TSV files for the new hydrogel samples.")
    print("You can select multiple files at once.")
    print("If your files are in a zip archive, please upload the zip file.")
    print("\n‚ö†Ô∏è  Note: All previously uploaded data has been cleared.")

    uploaded = files.upload()

    if not uploaded:
        print("\n‚ùå No files uploaded. Please run this cell again to upload files.")
        return None

    # Create fresh directory with project name
    upload_dir = f'./uploaded_files_{project_name}'
    if os.path.exists(upload_dir):
        shutil.rmtree(upload_dir)
    os.makedirs(upload_dir, exist_ok=True)

    # Case-insensitive so that e.g. "DATA.ZIP" is recognized as an archive too
    zip_files = [name for name in uploaded if name.lower().endswith('.zip')]
    plain_files = [name for name in uploaded if not name.lower().endswith('.zip')]

    if zip_files:
        print(f"\nüì¶ Extracting {len(zip_files)} zip file(s)...")
        for zip_file in zip_files:
            try:
                with zipfile.ZipFile(io.BytesIO(uploaded[zip_file]), 'r') as z:
                    z.extractall(upload_dir)
            except zipfile.BadZipFile:
                # One corrupt archive should not discard the rest of the upload
                print(f"WARNING: Skipped {zip_file}: not a valid zip archive")
        print("‚úÖ Extraction complete!")

    if plain_files:
        for filename in plain_files:
            # basename keeps the file inside upload_dir even if the key carries a path
            target_path = os.path.join(upload_dir, os.path.basename(filename))
            with open(target_path, 'wb') as f:
                f.write(uploaded[filename])
        print(f"\n‚úÖ Saved {len(plain_files)} file(s) to {upload_dir}/")

    return upload_dir

# Run the upload; upload_dir is None when nothing was uploaded, and the
# following cells check for that before using it.
upload_dir = upload_files()

# %% [markdown]
# ## 4. Set Output Directory (Same as Input Directory)

# %%
# Results are written next to the uploaded inputs; without an upload directory
# the current working directory is used instead.
if not upload_dir:
    output_dir = "."
    print("‚ö†Ô∏è  Using current directory as output directory")
else:
    output_dir = upload_dir
    print(f"\nüìÅ OUTPUT DIRECTORY SET TO INPUT DIRECTORY:")
    print(f"   {output_dir}")
    print("=" * 60)

# %% [markdown]
# ## 5. Find and List TSV Files (NEW DATA)

# %%
def find_tsv_files(directory):
    """Return the sorted paths of all ``.tsv`` and ``.txt`` files in *directory*.

    Only the top level of *directory* is searched first. If that finds nothing,
    a notice is printed and the search is repeated recursively through the
    subdirectories.
    """
    extensions = ('*.tsv', '*.txt')

    matches = []
    for extension in extensions:
        matches.extend(glob.glob(os.path.join(directory, extension)))

    if matches:
        return sorted(matches)

    print(f"‚ö†Ô∏è  No TSV or TXT files found in {directory}")
    print("üîç Checking subdirectories...")
    for extension in extensions:
        matches.extend(glob.glob(os.path.join(directory, '**', extension), recursive=True))

    return sorted(matches)

# Locate the uploaded data files and print a short overview. tsv_files is only
# defined when an upload directory exists; later cells check for it by name.
if upload_dir:
    tsv_files = find_tsv_files(upload_dir)

    if tsv_files:
        print(f"‚úÖ Found {len(tsv_files)} TSV/TXT files for '{project_name}':")
        print("-" * 60)
        # List at most the first 15 files with their size in KB
        for i, filepath in enumerate(tsv_files[:15], 1):
            filename = os.path.basename(filepath)
            size_kb = os.path.getsize(filepath) / 1024
            print(f"{i:3d}. {filename:<50} ({size_kb:.1f} KB)")

        if len(tsv_files) > 15:
            print(f"... and {len(tsv_files) - 15} more files")

        print("-" * 60)

        # Guess sample names from the first five file names
        print("\nüìã FILE DETAILS:")
        sample_names = set()
        for filepath in tsv_files[:5]:  # Only the first 5 files are inspected
            with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
                # The first line is read only to skip files that start with an empty line
                first_line = f.readline().strip()
                if first_line:
                    # Sample name = text before the first underscore, or the whole stem when there is none
                    sample_names.add(os.path.basename(filepath).split('_')[0] if '_' in os.path.basename(filepath) else os.path.splitext(os.path.basename(filepath))[0])

        if sample_names:
            print(f"Sample names detected: {', '.join(list(sample_names)[:5])}")
            # NOTE(review): at most five names are collected above, so this branch cannot trigger
            if len(sample_names) > 5:
                print(f"... and {len(sample_names) - 5} more")
    else:
        print("‚ùå No TSV files found. Please check your upload.")
        print("   Make sure files have .tsv or .txt extensions.")

# %% [markdown]
# ## 6. Data Loading and Processing Functions

# %%
def load_tsv_file(filepath, sep='\t'):
    """Load a delimited text file into a DataFrame, trying several encodings and separators.

    The file is first read with *sep*. If that yields a single column, comma,
    semicolon and whitespace separators are tried in turn. A separator that
    makes the parser fail is skipped rather than aborting the load, so a later
    separator can still succeed. If no separator yields more than one column,
    the next encoding is tried.

    Args:
        filepath: Path of the file to read.
        sep: Primary column separator (tab by default).

    Returns:
        pandas.DataFrame | None: The parsed data with whitespace-stripped
        column names, or None when the file could not be read as a
        multi-column table.
    """
    basename = os.path.basename(filepath)
    encodings_to_try = ['utf-8', 'latin-1', 'cp1252']
    # (separator, label used in messages, extra read_csv arguments)
    fallback_separators = [
        (',', 'comma', {}),
        (';', 'semicolon', {}),
        (r'\s+', 'whitespace', {'engine': 'python'}),
    ]

    for encoding in encodings_to_try:
        try:
            df = pd.read_csv(filepath, sep=sep, encoding=encoding)
        except UnicodeDecodeError:
            continue  # Try next encoding
        except Exception as e:
            print(f"‚ùå Non-decoding error reading {basename} with {encoding}: {e}")
            break

        # A single column usually means the separator was wrong
        if df.shape[1] == 1:
            print(f"‚ö†Ô∏è  File {basename} has only 1 column with {encoding} encoding. Trying different separators...")
            for alt_sep, label, extra_kwargs in fallback_separators:
                try:
                    candidate = pd.read_csv(filepath, sep=alt_sep, encoding=encoding, **extra_kwargs)
                except ValueError:
                    # Covers ParserError / EmptyDataError / UnicodeDecodeError:
                    # this separator does not fit the file, so move on to the next one.
                    continue
                if candidate.shape[1] > 1:
                    print(f"  ‚úÖ Successfully read with {label} separator using {encoding}")
                    df = candidate
                    break
            else:
                # No separator produced more than one column with this encoding
                continue

        df.columns = df.columns.str.strip()  # Clean column names
        return df

    print(f"‚ùå Failed to read {basename} with any attempted encoding or separator.")
    return None

def extract_sample_info(filename):
    """Extract sample metadata from a data file name using regular expressions.

    The extension is stripped and the remaining name is searched for a hydrogel
    type, concentration, crosslinking method, date, replicate label and a
    measurement condition (pH or temperature). Within each category the
    patterns are tried in order and the first match wins. Fields that are not
    found stay as empty strings.

    Args:
        filename: File name (without directory) to analyse.

    Returns:
        dict: Keys 'filename', 'sample_name', 'date', 'condition',
        'replicate', 'hydrogel_type', 'concentration' and 'crosslinking'.
    """
    import re

    # Remove extension
    name = os.path.splitext(filename)[0]

    info = {
        'filename': filename,
        'sample_name': name,
        'date': '',
        'condition': '',
        'replicate': '',
        'hydrogel_type': '',
        'concentration': '',
        'crosslinking': ''
    }

    def search_any(patterns, flags=re.IGNORECASE):
        """Return the first successful search of *patterns* (in order) in the name, or None."""
        for pattern in patterns:
            match = re.search(pattern, name, flags)
            if match:
                return match
        return None

    # Hydrogel type
    hydrogel_match = search_any([
        r'(PEG[-\s_]?\w*)', r'(alginate)', r'(collagen)', r'(gelatin)',
        r'(HA|hyaluronic)', r'(chitosan)', r'(agarose)', r'(PAAm|polyacrylamide)',
        r'(PVA)', r'(PNIPAM)', r'(PHEMA)'
    ])
    if hydrogel_match:
        info['hydrogel_type'] = hydrogel_match.group(1)

    # Concentration (numeric part only)
    conc_match = search_any([
        r'(\d+[\.]?\d*)[\s_]?%', r'(\d+[\.]?\d*)[\s_]?mg/ml',
        r'(\d+[\.]?\d*)[\s_]?mM', r'(\d+[\.]?\d*)[\s_]?wt'
    ])
    if conc_match:
        info['concentration'] = conc_match.group(1)

    # Crosslinking method
    xlink_match = search_any([
        r'(UV)', r'(thermal)', r'(chemical)', r'(enzymatic)',
        r'(CaCl2|calcium)', r'(EDC)', r'(NHS)', r'(genipin)'
    ])
    if xlink_match:
        info['crosslinking'] = xlink_match.group(1)

    # Date: the whole match is kept, in whichever format was found
    date_match = search_any([
        r'\d{4}-\d{2}-\d{2}',
        r'\d{8}',
        r'\d{2}-\d{2}-\d{4}'
    ], flags=0)
    if date_match:
        info['date'] = date_match.group()

    # Replicate label
    rep_match = search_any([
        r'_rep(\d+)', r'_r(\d+)', r'_(\d+)$', r'[_-]([a-zA-Z])$',
        r'replicate[\s_-]?(\d+)', r'sample[\s_-]?(\d+)'
    ])
    if rep_match:
        info['replicate'] = rep_match.group(1)

    # Condition: an explicit pH value wins. Otherwise fall through to temperature,
    # so that names that merely contain "ph" (PHEMA, alpha, phosphate, ...) do
    # not block temperature detection.
    ph_match = re.search(r'[pP][hH][_-]?(\d+\.?\d*)', name)
    if ph_match:
        info['condition'] = f"pH {ph_match.group(1)}"
    elif 'temp' in name.lower() or '\u00b0C' in name:
        # \u00b0 is the degree sign, written as an escape so it survives re-encoding of this file
        temp_match = re.search(r'(\d+)[\s_]?\u00b0?C', name, re.IGNORECASE)
        if temp_match:
            info['condition'] = f"{temp_match.group(1)}\u00b0C"

    return info

# %% [markdown]
# ## 7. Process All Files and Create Fresh Excel Workbook (Saved in Input Directory)

# %%
def _column_stat(df, column, stat):
    """Return the 'mean' or 'std' of *column*, coerced to numeric, or NaN if the column is absent."""
    if column not in df.columns:
        return np.nan
    # Coerce so that stray text in a measurement column becomes NaN instead of
    # making the aggregation raise.
    values = pd.to_numeric(df[column], errors='coerce')
    return getattr(values, stat)()


def _unique_sheet_name(raw_name, position, used_names):
    """Return an Excel-safe sheet name not yet in *used_names* (compared case-insensitively)."""
    sheet_name = str(raw_name)[:31]  # Excel limit for sheet names
    for char in '[]:*?/\\':
        sheet_name = sheet_name.replace(char, '_')
    if not sheet_name:
        sheet_name = f"Data_{position}"

    candidate = sheet_name
    attempt = 0
    while candidate.lower() in used_names:
        suffix = f"_{position}" if attempt == 0 else f"_{position}_{attempt}"
        candidate = f"{sheet_name[:31 - len(suffix)]}{suffix}"
        attempt += 1

    used_names.add(candidate.lower())
    return candidate


def process_all_files(tsv_files, project_name, output_directory):
    """Load every data file and write the results into a new Excel workbook.

    The workbook contains one sheet per readable input file plus the sheets
    'Summary_Statistics' (per-file statistics), 'Overall_Stats',
    'All_Data_Combined' and, when the file names yield more than one distinct
    hydrogel type, concentration or condition, 'Group_Analysis'.

    Args:
        tsv_files: Paths of the input files.
        project_name: Prefix of the workbook file name; also recorded in
            'Overall_Stats'.
        output_directory: Directory the workbook is written to.

    Returns:
        str | None: Path of the written workbook, or None when there were no
        input files or none of them could be read.
    """
    if not tsv_files:
        print("‚ùå No files to process!")
        return None

    print(f"\nüîÑ PROCESSING {len(tsv_files)} FILES FOR '{project_name}'...")
    print("=" * 60)

    youngs_column = 'Young\'s Modulus [Pa]'

    summary_data = []
    all_data_combined_dfs = []
    sample_names = []  # Parallel to all_data_combined_dfs; used for sheet names
    error_count = 0

    for filepath in tqdm(tsv_files, desc="Processing", unit="file"):
        filename = os.path.basename(filepath)
        sample_info = extract_sample_info(filename)
        df = load_tsv_file(filepath)

        if df is None or df.empty:
            error_count += 1
            print(f"‚ö†Ô∏è  Skipped {filename} - could not read or empty file")
            continue

        # Tag every row with its origin so the combined sheet stays traceable
        df['Source_Filename'] = filename
        for key, value in sample_info.items():
            if key != 'filename':
                df[f'Sample_{key}'] = value

        youngs_mean_pa = _column_stat(df, youngs_column, 'mean')
        summary_stats = {
            'Filename': filename,
            'Sample_Name': sample_info['sample_name'],
            'Hydrogel_Type': sample_info['hydrogel_type'],
            'Concentration': sample_info['concentration'],
            'Crosslinking': sample_info['crosslinking'],
            'Date': sample_info['date'],
            'Condition': sample_info['condition'],
            'Replicate': sample_info['replicate'],
            'Total_Curves': len(df),
            'Youngs_Modulus_Mean_Pa': youngs_mean_pa,
            'Youngs_Modulus_Std_Pa': _column_stat(df, youngs_column, 'std'),
            'Youngs_Modulus_Mean_kPa': youngs_mean_pa / 1000,
            'Adhesion_Mean_N': _column_stat(df, 'Adhesion', 'mean'),
            'Adhesion_Std_N': _column_stat(df, 'Adhesion', 'std'),
            'Slope_Mean_N/m': _column_stat(df, 'Slope [N/m]', 'mean'),
            'Height_Mean_m': _column_stat(df, 'Interpolated Height [m]', 'mean'),
        }

        if 'Contact Point [m]' in df.columns:
            summary_stats['Contact_Point_Mean_m'] = _column_stat(df, 'Contact Point [m]', 'mean')

        summary_data.append(summary_stats)
        all_data_combined_dfs.append(df)
        sample_names.append(sample_info['sample_name'])

    processed_count = len(all_data_combined_dfs)

    print("\n" + "=" * 60)
    print(f"‚úÖ PROCESSING COMPLETE FOR '{project_name}'!")
    print(f"   Processed: {processed_count} files successfully")
    print(f"   Failed: {error_count} files")
    print("=" * 60)

    # Only create the Excel writer if there is data to write
    if processed_count == 0:
        print("‚ùå No files were successfully processed. No Excel file will be generated.")
        return None

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_filename = f"{project_name}_AFM_Results_{timestamp}.xlsx"
    output_path = os.path.join(output_directory, output_filename)

    if os.path.exists(output_path):
        os.remove(output_path)
        print(f"üóëÔ∏è  Removed previous output file: {output_path}")

    # Sheet names must be unique. Writing twice to the same name would put the
    # second file's cells into the first file's sheet, so names that collide
    # after truncation, or with the fixed report sheets, get a numeric suffix.
    used_sheet_names = {'summary_statistics', 'overall_stats', 'all_data_combined', 'group_analysis'}

    with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
        # One sheet per processed file
        for position, (sample_name, df_to_write) in enumerate(zip(sample_names, all_data_combined_dfs), 1):
            sheet_name = _unique_sheet_name(sample_name, position, used_sheet_names)
            df_to_write.to_excel(writer, sheet_name=sheet_name, index=False)

        summary_df = pd.DataFrame(summary_data)
        summary_df.to_excel(writer, sheet_name='Summary_Statistics', index=False)

        overall_stats = {
            'Statistic': ['Project Name', 'Total Files', 'Processed Files', 'Failed Files', 'Total Curves',
                        'Avg Youngs Modulus (kPa)', 'Avg Adhesion (nN)', 'Output Directory'],
            'Value': [
                project_name,
                len(tsv_files),
                processed_count,
                error_count,
                summary_df['Total_Curves'].sum(),
                summary_df['Youngs_Modulus_Mean_kPa'].mean(),
                summary_df['Adhesion_Mean_N'].mean() * 1e9,  # Convert to nN
                output_directory
            ]
        }
        overall_df = pd.DataFrame(overall_stats)
        overall_df.to_excel(writer, sheet_name='Overall_Stats', index=False)

        combined_df = pd.concat(all_data_combined_dfs, ignore_index=True)
        combined_df.to_excel(writer, sheet_name='All_Data_Combined', index=False)

        # Group analysis over metadata fields that have more than one distinct value
        analysis_df = summary_df.copy()
        grouping_fields = []
        for field in ('Hydrogel_Type', 'Concentration', 'Condition'):
            # Metadata that was not found is stored as ''; mask it to NaN so it
            # neither counts as a distinct value nor forms a group of its own.
            analysis_df[field] = analysis_df[field].mask(analysis_df[field] == '')
            if analysis_df[field].nunique() > 1:
                grouping_fields.append(field)

        analysis_stats = []
        for field in grouping_fields:
            # groupby drops NaN keys by default, so files without this field are skipped
            for group_value, group in analysis_df.groupby(field):
                has_adhesion = group['Adhesion_Mean_N'].notna().any()
                analysis_stats.append({
                    'Grouping_Field': field,
                    'Group_Value': group_value,
                    'Number_of_Samples': len(group),
                    'Number_of_Curves': group['Total_Curves'].sum(),
                    'Youngs_Modulus_Mean_kPa': group['Youngs_Modulus_Mean_kPa'].mean(),
                    'Youngs_Modulus_SEM_kPa': group['Youngs_Modulus_Mean_kPa'].sem(),
                    'Adhesion_Mean_nN': (group['Adhesion_Mean_N'].mean() * 1e9) if has_adhesion else np.nan,
                    'Adhesion_SEM_nN': (group['Adhesion_Mean_N'].sem() * 1e9) if has_adhesion else np.nan
                })

        if analysis_stats:
            analysis_df_grouped = pd.DataFrame(analysis_stats)
            analysis_df_grouped.to_excel(writer, sheet_name='Group_Analysis', index=False)

    print(f"   Output file: {output_path}")

    # Show file size
    if os.path.exists(output_path):
        file_size_mb = os.path.getsize(output_path) / (1024 * 1024)
        print(f"   File size: {file_size_mb:.2f} MB")

    return output_path

# %% [markdown]
# ## 8. Execute Processing (NEW DATA)

# %%
# Run the processing only when an earlier cell found input files. tsv_files is
# not defined at all if the upload step returned nothing, hence the name check.
if 'tsv_files' in locals() and tsv_files:
    output_file = process_all_files(tsv_files, project_name, output_dir)

    if output_file:
        print(f"\nüìä EXCEL FILE CONTENTS:")
        print("-" * 40)
        print("1. Individual sheets for each TSV file")
        print("2. 'Summary_Statistics' sheet with per-file statistics")
        print("3. 'Overall_Stats' sheet with summary metrics")
        print("4. 'All_Data_Combined' sheet with all data merged")
        print("5. 'Group_Analysis' sheet (if groups identified)")
        print("-" * 40)

        # Download the file
        print(f"\nüì• DOWNLOADING RESULTS FILE...")
        files.download(output_file)
else:
    # Files are uploaded in section 3 of this notebook (section 2 only asks for the project name)
    print("‚ùå No files to process. Please upload files in section 3.")

# %% [markdown]
# ## 9. Preview New Data

# %%
def preview_data(filepath, n_rows=5):
    """Print an overview of one data file and return its contents.

    Shows the file name, the table shape, the column names grouped by what
    their name suggests they hold (force, modulus, height/position, slope,
    other) and the first rows. A column can appear in more than one group
    when its name matches several keywords.

    Args:
        filepath: Path of the file to preview.
        n_rows: Number of leading rows to print.

    Returns:
        pandas.DataFrame | None: The loaded data, or None if the file could
        not be read.
    """
    df = load_tsv_file(filepath)
    if df is None:
        return None

    print(f"üìÑ File: {os.path.basename(filepath)}")
    print(f"üìê Shape: {df.shape} (rows √ó columns)")
    print(f"üìã Columns ({len(df.columns)}):")

    # Group columns by keywords or unit tags in their names
    force_cols = [col for col in df.columns if 'force' in col.lower() or 'adhesion' in col.lower() or '[N]' in col]
    mod_cols = [col for col in df.columns if 'modulus' in col.lower() or '[Pa]' in col]
    height_cols = [col for col in df.columns if 'height' in col.lower() or '[m]' in col]
    slope_cols = [col for col in df.columns if 'slope' in col.lower()]
    categorized = set(force_cols) | set(mod_cols) | set(height_cols) | set(slope_cols)
    other_cols = [col for col in df.columns if col not in categorized]

    if force_cols:
        print(f"   Force-related: {', '.join(force_cols[:3])}")
        if len(force_cols) > 3:
            print(f"                 ... and {len(force_cols)-3} more")
    if mod_cols:
        print(f"   Modulus: {', '.join(mod_cols)}")
    if height_cols:
        print(f"   Height/Position: {', '.join(height_cols[:2])}")
        if len(height_cols) > 2:
            print(f"                   ... and {len(height_cols)-2} more")
    # Slope and uncategorized columns are listed too, so the overview accounts
    # for every column counted in the header line above.
    if slope_cols:
        print(f"   Slope: {', '.join(slope_cols)}")
    if other_cols:
        print(f"   Other: {', '.join(other_cols[:5])}")
        if len(other_cols) > 5:
            print(f"         ... and {len(other_cols)-5} more")

    print(f"\nüìà First {n_rows} rows:")
    print(df.head(n_rows))
    return df

# Optional: preview the first input file. The name check guards against
# tsv_files never having been defined (no upload directory in the earlier cell).
if 'tsv_files' in locals() and tsv_files:
    print("\n" + "=" * 60)
    print("üëÅÔ∏è  PREVIEW OF FIRST FILE:")
    print("=" * 60)
    # preview_data returns None when the file cannot be read
    preview_df = preview_data(tsv_files[0])

# %% [markdown]
# ## 10. Create INDIVIDUAL Visualization Plots (Saved in Input Directory)

# %%
def _save_and_show(fig, output_directory, filename):
    """Save *fig* as a 300-dpi PNG in *output_directory*, display it, return the path."""
    plot_path = os.path.join(output_directory, filename)
    fig.savefig(plot_path, dpi=300, bbox_inches='tight')
    plt.show()
    # Close explicitly so repeated runs do not accumulate open figures when
    # the active backend does not close them on show().
    plt.close(fig)
    print(f"   ‚úÖ Saved as: {plot_path}")
    return plot_path


def _plot_distribution(values, quantity, unit, color, decimals, project_name):
    """Build a 30-bin histogram of *values* with mean and +/-1 SD marker lines."""
    mean_val = values.mean()
    std_val = values.std()

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.hist(values, bins=30, alpha=0.7, color=color,
            edgecolor='black', linewidth=0.5)

    ax.axvline(mean_val, color='red', linestyle='--', linewidth=2,
               label=f'Mean: {mean_val:.{decimals}f} {unit}')
    ax.axvline(mean_val + std_val, color='orange', linestyle=':', linewidth=1.5)
    ax.axvline(mean_val - std_val, color='orange', linestyle=':', linewidth=1.5,
               label=f'\u00b11 SD: {std_val:.{decimals}f} {unit}')

    ax.set_xlabel(f'{quantity} ({unit})', fontsize=12)
    ax.set_ylabel('Frequency', fontsize=12)
    ax.set_title(f'{project_name} - {quantity} Distribution\n'
                 f'N = {len(values)} curves | Mean \u00b1 SD: '
                 f'{mean_val:.{decimals}f} \u00b1 {std_val:.{decimals}f} {unit}',
                 fontsize=14, fontweight='bold')
    ax.grid(True, alpha=0.3)
    ax.legend()
    fig.tight_layout()
    return fig


def _plot_modulus_vs_adhesion(modulus_kpa, adhesion_nN, project_name):
    """Build the modulus/adhesion scatter plot; both inputs must be row-aligned."""
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.scatter(modulus_kpa, adhesion_nN,
               alpha=0.6, s=50, c='purple', edgecolor='black', linewidth=0.5)

    # A straight-line fit is only defined for at least two distinct x values.
    if modulus_kpa.nunique() > 1:
        coeffs = np.polyfit(modulus_kpa, adhesion_nN, 1)
        ax.plot(modulus_kpa, np.polyval(coeffs, modulus_kpa),
                "r--", alpha=0.8, linewidth=2,
                label=f'Linear fit: y = {coeffs[0]:.3f}x + {coeffs[1]:.1f}')
        # Only draw a legend when there is a labelled artist to show.
        ax.legend()

    ax.set_xlabel('Young\'s Modulus (kPa)', fontsize=12)
    ax.set_ylabel('Adhesion Force (nN)', fontsize=12)
    ax.set_title(f'{project_name} - Modulus vs Adhesion Correlation\n'
                 f'N = {len(modulus_kpa)} data points',
                 fontsize=14, fontweight='bold')
    ax.grid(True, alpha=0.3)
    fig.tight_layout()
    return fig


def _plot_modulus_by_type(groups, labels, project_name):
    """Build a coloured box plot with one box per entry of *groups*."""
    fig, ax = plt.subplots(figsize=(12, 7))
    box = ax.boxplot(groups, patch_artist=True, showmeans=True,
                     meanline=True, showfliers=True,
                     medianprops=dict(color='black', linewidth=2),
                     meanprops=dict(color='red', linestyle='--', linewidth=2))

    palette = plt.cm.Set3(np.linspace(0, 1, len(groups)))
    for patch, color in zip(box['boxes'], palette):
        patch.set_facecolor(color)

    ax.set_xticklabels(labels, rotation=45, ha='right', fontsize=11)
    ax.set_ylabel('Young\'s Modulus (kPa)', fontsize=12)
    ax.set_title(f'{project_name} - Young\'s Modulus by Hydrogel Type\n'
                 f'Red dashed line = mean | Black line = median',
                 fontsize=14, fontweight='bold')
    ax.grid(True, alpha=0.3, axis='y')
    fig.tight_layout()
    return fig


def _plot_overview(panels, project_name, n_curves):
    """Build the 2x2 summary figure; a panel whose values are None stays empty."""
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    for ax, (values, xlabel, title, color) in zip(axes.flat, panels):
        if values is None:
            continue
        ax.hist(values, bins=25, alpha=0.7, color=color, edgecolor='black')
        ax.set_xlabel(xlabel)
        ax.set_ylabel('Frequency')
        ax.set_title(title)
        ax.grid(True, alpha=0.3)

    fig.suptitle(f'{project_name} - AFM Force Spectroscopy Summary\n'
                 f'Total Curves: {n_curves}',
                 fontsize=16, fontweight='bold')
    fig.tight_layout()
    return fig


def create_individual_plots(output_file, project_name, output_directory):
    """Create, save and download the characterisation plots for one project.

    Reads the 'All_Data_Combined' sheet of ``output_file`` and produces up to
    seven PNG files in ``output_directory``: six individual plots (each only
    when its source column is present) and a combined 2x2 overview that is
    always generated. File names follow
    ``{project_name}_{plot name}_{timestamp}.png``.

    Args:
        output_file: Path of the Excel workbook holding the combined data.
        project_name: Used in plot titles and as file-name prefix.
        output_directory: Directory the PNG files are written to.

    Returns:
        List of the saved plot paths, or ``None`` if any step raised; the
        error and its traceback are printed in that case.
    """
    modulus_col = 'Young\'s Modulus [Pa]'
    adhesion_col = 'Adhesion'
    slope_col = 'Slope [N/m]'
    type_col = 'Sample_hydrogel_type'

    try:
        combined_df = pd.read_excel(output_file, sheet_name='All_Data_Combined')
        columns = combined_df.columns

        plt.style.use('seaborn-v0_8-darkgrid')

        # One timestamp shared by every file written during this call
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

        print(f"\nüìä CREATING INDIVIDUAL PLOTS FOR '{project_name}'...")
        print("=" * 60)

        plot_files = []

        def save(fig, stem):
            filename = f"{project_name}_{stem}_{timestamp}.png"
            plot_files.append(_save_and_show(fig, output_directory, filename))

        # Series are filled in as each section runs and reused by the
        # overview; None means "source column not present".
        modulus_kpa = adhesion_nN = contact_um = slope_values = None

        # Plot 1: Young's Modulus distribution (values divided by 1000 -> kPa)
        if modulus_col in columns:
            print("1. Creating Young's Modulus distribution plot...")
            modulus_kpa = combined_df[modulus_col].dropna() / 1000
            save(_plot_distribution(modulus_kpa, 'Young\'s Modulus', 'kPa',
                                    'steelblue', 1, project_name),
                 'Youngs_Modulus_Distribution')

        # Plot 2: Adhesion distribution (values multiplied by 1e9 -> nN)
        if adhesion_col in columns:
            print("\n2. Creating Adhesion force distribution plot...")
            adhesion_nN = combined_df[adhesion_col].dropna() * 1e9
            save(_plot_distribution(adhesion_nN, 'Adhesion Force', 'nN',
                                    'forestgreen', 1, project_name),
                 'Adhesion_Distribution')

        # Plot 3: Young's Modulus vs Adhesion scatter plot
        if modulus_col in columns and adhesion_col in columns:
            print("\n3. Creating Modulus vs Adhesion correlation plot...")
            # Drop rows where EITHER value is missing so every point pairs
            # the modulus and adhesion of the same row. Dropping NaNs per
            # column and truncating would pair values from different rows.
            paired = combined_df[[modulus_col, adhesion_col]].dropna()
            if not paired.empty:
                save(_plot_modulus_vs_adhesion(paired[modulus_col] / 1000,
                                               paired[adhesion_col] * 1e9,
                                               project_name),
                     'Modulus_vs_Adhesion')

        # Plot 4: Box plot of Young's Modulus by hydrogel type
        if type_col in columns and modulus_col in columns:
            print("\n4. Creating Modulus by Hydrogel Type box plot...")
            groups = []
            labels = []
            # Limit to 8 types for clarity
            for h_type in combined_df[type_col].dropna().unique()[:8]:
                in_group = combined_df[type_col] == h_type
                group_kpa = combined_df.loc[in_group, modulus_col].dropna() / 1000
                if len(group_kpa) > 0:
                    groups.append(group_kpa.to_numpy())
                    # str() keeps numeric type labels from raising here
                    labels.append(str(h_type)[:15])

            if groups:
                save(_plot_modulus_by_type(groups, labels, project_name),
                     'Modulus_by_Type')

        # Plot 5: Contact point distribution (first matching column, x 1e6 -> micrometres)
        print("\n5. Creating Contact Point distribution plot...")
        contact_cols = [col for col in columns
                        if 'contact' in col.lower()
                        and ('[m]' in col or 'point' in col.lower())]
        if contact_cols:
            contact_um = combined_df[contact_cols[0]].dropna() * 1e6
            save(_plot_distribution(contact_um, 'Contact Point', '\u00b5m',
                                    'darkorange', 1, project_name),
                 'Contact_Point_Distribution')
        else:
            print("   ‚ö†Ô∏è  Contact point data not found in the dataset")

        # Plot 6: Slope distribution
        print("\n6. Creating Slope distribution plot...")
        if slope_col in columns:
            slope_values = combined_df[slope_col].dropna()
            save(_plot_distribution(slope_values, 'Slope', 'N/m',
                                    'crimson', 3, project_name),
                 'Slope_Distribution')
        else:
            print("   ‚ö†Ô∏è  Slope data not found in the dataset")

        # Plot 7: Combined overview plot (bonus)
        print("\n7. Creating Combined Overview plot...")
        overview_panels = [
            (modulus_kpa, 'Young\'s Modulus (kPa)', 'Young\'s Modulus', 'steelblue'),
            (adhesion_nN, 'Adhesion (nN)', 'Adhesion Force', 'forestgreen'),
            (contact_um, 'Contact Point (\u00b5m)', 'Contact Point', 'darkorange'),
            (slope_values, 'Slope (N/m)', 'Slope', 'crimson'),
        ]
        save(_plot_overview(overview_panels, project_name, len(combined_df)),
             'Combined_Overview')

        print("\n" + "=" * 60)
        print(f"‚úÖ CREATED {len(plot_files)} PLOTS FOR '{project_name}'")
        print("=" * 60)

        # Download all plot files
        print("\nüì• DOWNLOADING ALL PLOT FILES...")
        for plot_file in plot_files:
            if os.path.exists(plot_file):
                files.download(plot_file)
                print(f"   Downloaded: {plot_file}")

        return plot_files

    except Exception as e:
        # Notebook boundary: report the failure and let the caller continue.
        print(f"‚ùå Could not create plots: {e}")
        import traceback
        traceback.print_exc()
        return None

# Generate the plots only when output_file is defined and exists on disk
if 'output_file' in locals() and output_file and os.path.exists(output_file):
    print("\n" + "=" * 60, "üìä CREATING VISUALIZATIONS...", "=" * 60, sep="\n")
    plot_files = create_individual_plots(output_file, project_name, output_dir)
    if plot_files:
        print("\nüéâ All plots have been created and downloaded!")
        print(f"   Look for files starting with: {project_name}_*.png")

# %% [markdown]
# ## 11. Optional: Create a ZIP file with all results (Saved in Input Directory)

# %%
def create_results_zip(project_name, output_directory):
    """Bundle the project's Excel and PNG outputs into one ZIP and download it.

    Collects ``{project_name}_AFM_Results_*.xlsx`` and ``{project_name}_*.png``
    from ``output_directory`` and writes them, stored under their base names,
    to ``{project_name}_AFM_Complete_Results_{timestamp}.zip`` in the same
    directory.

    Args:
        project_name: File-name prefix identifying the project's outputs.
        output_directory: Directory that is searched and receives the ZIP.

    Returns:
        Path of the ZIP file, or ``None`` when no matching files exist or any
        step (including the download) raised; the error is printed.
    """
    try:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        zip_filename = f"{project_name}_AFM_Complete_Results_{timestamp}.zip"
        zip_path = os.path.join(output_directory, zip_filename)

        # Escape glob metacharacters so a project or directory name such as
        # "gel[1]" is matched literally instead of as a character class.
        prefix = os.path.join(glob.escape(output_directory),
                              glob.escape(project_name))
        candidates = (sorted(glob.glob(f"{prefix}_AFM_Results_*.xlsx"))
                      + sorted(glob.glob(f"{prefix}_*.png")))
        all_files = [path for path in candidates if os.path.isfile(path)]

        if not all_files:
            print(f"‚ö†Ô∏è  No output files found for project '{project_name}' in {output_directory}")
            return None

        # Create ZIP file
        with zipfile.ZipFile(zip_path, 'w') as zipf:
            for path in all_files:
                # Store files with their basename (without directory) in the zip
                zipf.write(path, os.path.basename(path))

        print(f"\nüì¶ Created ZIP file: {zip_path}")
        print(f"   Contains {len(all_files)} files:")
        for path in all_files:
            print(f"   ‚Ä¢ {os.path.basename(path)}")

        # Download the ZIP file
        files.download(zip_path)

        return zip_path

    except Exception as e:
        print(f"‚ùå Error creating ZIP file: {e}")
        return None

# Optional: ask whether all results should be bundled into one ZIP archive
if 'project_name' in locals() and project_name:
    print("\n" + "=" * 60, "üì¶ OPTIONAL: CREATE COMPLETE RESULTS PACKAGE", "=" * 60, sep="\n")

    create_zip = input("Create a ZIP file with all results? (yes/no): ").strip().lower()

    # Only the exact answer "yes" (after trimming and lower-casing) proceeds
    if create_zip != 'yes':
        print("Skipping ZIP creation.")
    else:
        zip_file = create_results_zip(project_name, output_dir)
        if zip_file:
            print("\n‚úÖ Complete results package ready!")

# %% [markdown]
# ## 12. Show Directory Contents

# %%
def show_directory_contents(directory):
    """Print a listing of ``directory``: sub-directories first, then files.

    Each file line carries an icon chosen by file-name suffix and a
    human-readable size. A placeholder line is printed instead when the path
    does not exist or the directory is empty.

    Args:
        directory: Path of the directory to list.
    """
    print(f"\nüìÅ CONTENTS OF DIRECTORY: {directory}")
    print("=" * 60)

    if not os.path.exists(directory):
        print("   (Directory does not exist)")
    else:
        entries = os.listdir(directory)
        if not entries:
            print("   (Empty directory)")
        else:
            # Anything that is not a regular file is listed as a directory
            file_paths = []
            subdir_paths = []
            for entry in entries:
                entry_path = os.path.join(directory, entry)
                bucket = file_paths if os.path.isfile(entry_path) else subdir_paths
                bucket.append(entry_path)

            if subdir_paths:
                print("üìÇ DIRECTORIES:")
                for number, subdir in enumerate(sorted(subdir_paths), 1):
                    print(f"  {number:3d}. üìÅ {os.path.basename(subdir)}/")

            if file_paths:
                # First matching suffix wins; everything else gets the
                # generic document icon.
                suffix_icons = (
                    ('.xlsx', "üìä"),
                    ('.png', "üñºÔ∏è"),
                    ('.zip', "üì¶"),
                )
                size_units = ((1, "KB"), (2, "MB"), (3, "GB"))

                print("\nüìÑ FILES:")
                for number, path in enumerate(sorted(file_paths), 1):
                    filename = os.path.basename(path)
                    size_bytes = os.path.getsize(path)

                    if size_bytes < 1024:
                        size_str = f"{size_bytes} B"
                    else:
                        # Use the largest unit whose threshold the size
                        # reaches, capped at GB.
                        for power, unit in size_units:
                            if unit == "GB" or size_bytes < 1024 ** (power + 1):
                                size_str = f"{size_bytes / 1024 ** power:.1f} {unit}"
                                break

                    icon = next(
                        (symbol for suffix, symbol in suffix_icons
                         if filename.endswith(suffix)),
                        "üìÑ",
                    )

                    print(f"  {number:3d}. {icon} {filename:<50} ({size_str})")

    print("=" * 60)

# List the output directory, but only once output_dir has been defined
if 'output_dir' in globals():
    show_directory_contents(output_dir)

# %% [markdown]
# ## 13. CLEANUP OPTION (Manual)

# %%
def manual_cleanup():
    """Interactively delete generated files and upload directories.

    Candidates are looked up relative to the current working directory:
    files matching ``*.xlsx``, ``*.png``, ``*.pdf``, ``AFM_*``,
    ``{project_name}_*`` and ``Hydrogel_*``, plus the directories
    ``uploaded_files_{project_name}`` and ``uploaded_files``. The
    project-specific entries are skipped when the notebook-level
    ``project_name`` variable is unset or empty.

    Nothing is deleted unless the user answers "yes" to the confirmation
    prompt; this covers the directories as well as the files.
    """
    print("üîÑ MANUAL CLEANUP OPTION")
    print("-" * 40)
    print("This will remove all generated files and start fresh.")
    print("Files to be removed:")

    # project_name is defined by an earlier notebook cell; tolerate its absence.
    project = globals().get('project_name')

    patterns = ['*.xlsx', '*.png', '*.pdf', 'AFM_*']
    dir_candidates = []
    if project:
        # Escape glob metacharacters so the name is matched literally.
        patterns.append(f'{glob.escape(project)}_*')
        dir_candidates.append(f'uploaded_files_{project}')
    patterns.append('Hydrogel_*')
    dir_candidates.append('uploaded_files')

    # The patterns overlap (e.g. "AFM_x.xlsx"), so de-duplicate while keeping
    # first-seen order; keep only regular files because os.remove cannot
    # delete directories.
    files_to_remove = list(dict.fromkeys(
        match
        for pattern in patterns
        for match in glob.glob(pattern)
        if os.path.isfile(match)
    ))
    dirs_to_remove = [path for path in dir_candidates if os.path.isdir(path)]

    if not files_to_remove and not dirs_to_remove:
        print("‚úÖ No files to clean up.")
        return

    for filepath in files_to_remove:
        print(f"  ‚Ä¢ {os.path.basename(filepath)}")
    for dir_path in dirs_to_remove:
        print(f"  ‚Ä¢ {dir_path}/")

    confirm = input("\nAre you sure? (yes/no): ")
    if confirm.strip().lower() != 'yes':
        print("‚ùå Cleanup cancelled.")
        return

    for filepath in files_to_remove:
        try:
            os.remove(filepath)
        except OSError as exc:
            # Best effort: report the failure and keep going.
            print(f"  Could not remove {filepath}: {exc}")

    for dir_path in dirs_to_remove:
        try:
            shutil.rmtree(dir_path)
            print(f"‚úÖ Removed directory: {dir_path}")
        except OSError as exc:
            print(f"  Could not remove directory {dir_path}: {exc}")

    print("‚úÖ Cleanup complete!")

# Tell the user how to trigger the manual cleanup defined in this cell
print("\n" + "=" * 60)
print("To manually clean up files and start over, run:")
print("manual_cleanup()")
print("=" * 60)

# Final summary. project_name / output_dir come from earlier cells; guard them
# like the other cells do so a run without those cells neither raises
# NameError nor claims that results were produced.
print("\n" + "=" * 60)
if 'project_name' in locals() and 'output_dir' in locals():
    print("üéâ ANALYSIS COMPLETE!")
    print(f"All results for '{project_name}' have been generated and saved to:")
    print(f"  {output_dir}")
else:
    print("No analysis results to report - run the earlier sections first.")
print("=" * 60)