In [8]:
import pandas as pd
import numpy as np
import os
import glob
from pathlib import Path

# Define the source directories
den_graphs_dir = "/Users/rezadoobary/Documents/MLCORRELATORS/ML-correlator/Tree_classifier_for_graphs/new_stuff/features/den_graphs"
motif_features_dir = "/Users/rezadoobary/Documents/MLCORRELATORS/ML-correlator/Tree_classifier_for_graphs/new_stuff/features/motif_features_den"
output_dir = "/Users/rezadoobary/Documents/MLCORRELATORS/ML-correlator/Tree_classifier_for_graphs/new_stuff/features/merged/new2_merged"

print("Source directories:")
print(f"den_graphs: {den_graphs_dir}")
print(f"motif_features: {motif_features_dir}")  
print(f"output: {output_dir}")

# List files in both directories
den_files = os.listdir(den_graphs_dir)
motif_files = os.listdir(motif_features_dir)

print(f"\nFiles in den_graphs: {den_files}")
print(f"Files in motif_features: {motif_files}")


Source directories:
den_graphs: /Users/rezadoobary/Documents/MLCORRELATORS/ML-correlator/Tree_classifier_for_graphs/new_stuff/features/den_graphs
motif_features: /Users/rezadoobary/Documents/MLCORRELATORS/ML-correlator/Tree_classifier_for_graphs/new_stuff/features/motif_features_den
output: /Users/rezadoobary/Documents/MLCORRELATORS/ML-correlator/Tree_classifier_for_graphs/new_stuff/features/merged/new2_merged

Files in den_graphs: ['6loopfeats_enhanced.csv', '8loopfeats_enhanced.csv', '11loopfeats_enhanced.csv', '5loopfeats_enhanced.csv', '7loopfeats_enhanced.csv', '9loopfeats_enhanced.csv', '10loopfeats_enhanced.csv']
Files in motif_features: ['9loops.csv', '8loops.csv', '11loops.csv', '10loops.csv', '9loops_manifest.json', '5loops.csv', '7loops_manifest.json', '6loops_manifest.json', '8loops_manifest.json', '5loops_manifest.json', '10loops_manifest.json', '7loops.csv', '6loops.csv']


In [9]:
# Analyze file naming patterns and identify matching files
def extract_loop_number(filename):
    """Extract loop number from filename"""
    if '5loop' in filename:
        return 5
    elif '6loop' in filename:
        return 6
    elif '7loop' in filename:
        return 7
    elif '8loop' in filename:
        return 8
    elif '9loop' in filename:
        return 9
    elif '10loop' in filename:
        return 10
    elif '11loop' in filename:
        return 11
    return None

# Create mapping of loop numbers to files
den_file_map = {}
motif_file_map = {}

for file in den_files:
    if file.endswith('.csv'):
        loop_num = extract_loop_number(file)
        if loop_num:
            den_file_map[loop_num] = file

for file in motif_files:
    if file.endswith('.csv'):
        loop_num = extract_loop_number(file)
        if loop_num:
            motif_file_map[loop_num] = file

print("File mapping:")
print(f"den_graphs files: {den_file_map}")
print(f"motif_features files: {motif_file_map}")

# Find common loop numbers
common_loops = set(den_file_map.keys()) & set(motif_file_map.keys())
print(f"\nCommon loop numbers to process: {sorted(common_loops)}")


File mapping:
den_graphs files: {6: '6loopfeats_enhanced.csv', 8: '8loopfeats_enhanced.csv', 11: '11loopfeats_enhanced.csv', 5: '5loopfeats_enhanced.csv', 7: '7loopfeats_enhanced.csv', 9: '9loopfeats_enhanced.csv', 10: '10loopfeats_enhanced.csv'}
motif_features files: {9: '9loops.csv', 8: '8loops.csv', 11: '11loops.csv', 10: '10loops.csv', 5: '5loops.csv', 7: '7loops.csv', 6: '6loops.csv'}

Common loop numbers to process: [5, 6, 7, 8, 9, 10, 11]


In [10]:
# First, scan all files to determine the complete set of columns to ensure consistency
def determine_column_sets(den_graphs_dir, motif_features_dir, den_file_map, motif_file_map, common_loops):
    """
    Scan all files to determine the complete set of columns that should be in all merged files.
    Returns the union of all columns from all files (keeps all columns, even with nulls).
    """
    print("=" * 60)
    print("SCANNING ALL FILES TO DETERMINE COMPLETE COLUMN STRUCTURE")
    print("=" * 60)
    
    all_den_columns = {}
    all_motif_columns = {}
    
    # Collect all columns from all files
    for loop_num in sorted(common_loops):
        den_file = os.path.join(den_graphs_dir, den_file_map[loop_num])
        motif_file = os.path.join(motif_features_dir, motif_file_map[loop_num])
        
        den_df = pd.read_csv(den_file)
        motif_df = pd.read_csv(motif_file)
        
        all_den_columns[loop_num] = set(den_df.columns)
        all_motif_columns[loop_num] = set(motif_df.columns)
    
    # Get union of all columns (columns that appear in ANY file)
    den_cols_union = set.union(*all_den_columns.values())
    motif_cols_union = set.union(*all_motif_columns.values())
    
    # Get intersection (columns that appear in ALL files)
    den_cols_intersection = set.intersection(*all_den_columns.values())
    motif_cols_intersection = set.intersection(*all_motif_columns.values())
    
    print(f"\nDen graphs columns:")
    print(f"  Columns in ALL files: {len(den_cols_intersection)}")
    print(f"  Total unique columns (union): {len(den_cols_union)}")
    print(f"  Columns missing in some files: {len(den_cols_union - den_cols_intersection)}")
    
    print(f"\nMotif features columns:")
    print(f"  Columns in ALL files: {len(motif_cols_intersection)}")
    print(f"  Total unique columns (union): {len(motif_cols_union)}")
    print(f"  Columns missing in some files: {len(motif_cols_union - motif_cols_intersection)}")
    
    # Report which files have which columns
    print("\nPer-file column count:")
    for loop_num in sorted(common_loops):
        print(f"  Loop {loop_num}: den={len(all_den_columns[loop_num])} cols, motif={len(all_motif_columns[loop_num])} cols")
    
    return den_cols_union, motif_cols_union, all_den_columns, all_motif_columns

# Function to concatenate CSV files and ensure consistent column structure
def merge_csv_files(den_file_path, motif_file_path, output_path, den_cols_union=None, motif_cols_union=None):
    """
    Concatenate two CSV files and ensure all output files have identical column structure.
    Missing columns are added with NaN values to ensure consistency.
    
    Args:
        den_file_path: Path to den_graphs CSV file
        motif_file_path: Path to motif_features CSV file
        output_path: Path to save merged CSV file
        den_cols_union: Set of all columns that should be in den_graphs (from all files)
        motif_cols_union: Set of all columns that should be in motif_features (from all files)
    """
    print(f"\nProcessing: {os.path.basename(den_file_path)} + {os.path.basename(motif_file_path)}")
    
    # Read the CSV files
    den_df = pd.read_csv(den_file_path)
    motif_df = pd.read_csv(motif_file_path)
    
    print(f"den_graphs shape: {den_df.shape}")
    print(f"motif_features shape: {motif_df.shape}")
    
    # Ensure all files have the same columns in the same order
    if den_cols_union is not None:
        missing_den_cols = den_cols_union - set(den_df.columns)
        if missing_den_cols:
            print(f"Adding {len(missing_den_cols)} missing columns to den_graphs (filled with NaN)")
            for col in missing_den_cols:
                den_df[col] = np.nan
        # Reorder to match sorted union for consistency
        den_df = den_df[sorted(den_cols_union)]
    
    # Ensure all files have the same columns in the same order
    if motif_cols_union is not None:
        missing_motif_cols = motif_cols_union - set(motif_df.columns)
        if missing_motif_cols:
            print(f"Adding {len(missing_motif_cols)} missing columns to motif_features (filled with NaN)")
            for col in missing_motif_cols:
                motif_df[col] = np.nan
        # Reorder to match sorted union for consistency
        motif_df = motif_df[sorted(motif_cols_union)]
    
    print(f"After ensuring complete columns - den_graphs shape: {den_df.shape}")
    print(f"After ensuring complete columns - motif_features shape: {motif_df.shape}")
    
    # Find common columns
    common_cols = set(den_df.columns) & set(motif_df.columns)
    print(f"Common columns: {len(common_cols)}")
    
    # Remove duplicated columns from motif_df (keep den_df version)
    motif_df_unique = motif_df.drop(columns=list(common_cols))
    print(f"After removing duplicated columns - motif_features shape: {motif_df_unique.shape}")
    
    # Concatenate the dataframes
    merged_df = pd.concat([den_df, motif_df_unique], axis=1)
    
    print(f"Concatenated shape: {merged_df.shape}")
    print(f"Total columns: {len(merged_df.columns)}")
    
    # Save the merged file
    merged_df.to_csv(output_path, index=False)
    print(f"Saved merged file to: {output_path}")
    
    return merged_df.columns.tolist()  # Return column list for verification

# First, determine the complete set of columns from all files
den_cols_union, motif_cols_union, all_den_columns, all_motif_columns = determine_column_sets(
    den_graphs_dir, motif_features_dir, den_file_map, motif_file_map, common_loops
)

# Test with one file pair first
if common_loops:
    test_loop = min(common_loops)
    den_file = os.path.join(den_graphs_dir, den_file_map[test_loop])
    motif_file = os.path.join(motif_features_dir, motif_file_map[test_loop])
    output_file = os.path.join(output_dir, f"{test_loop}loops_merged.csv")
    
    print(f"\nTesting merge with loop {test_loop}:")
    test_columns = merge_csv_files(den_file, motif_file, output_file, den_cols_union, motif_cols_union)
    
    if test_columns:
        print(f"Test merge successful! Total columns: {len(test_columns)}")
    else:
        print("Test merge failed!")


SCANNING ALL FILES TO DETERMINE COMPLETE COLUMN STRUCTURE

Den graphs columns:
  Columns in ALL files: 178
  Total unique columns (union): 179
  Columns missing in some files: 1

Motif features columns:
  Columns in ALL files: 101
  Total unique columns (union): 101
  Columns missing in some files: 0

Per-file column count:
  Loop 5: den=178 cols, motif=101 cols
  Loop 6: den=178 cols, motif=101 cols
  Loop 7: den=179 cols, motif=101 cols
  Loop 8: den=179 cols, motif=101 cols
  Loop 9: den=179 cols, motif=101 cols
  Loop 10: den=179 cols, motif=101 cols
  Loop 11: den=179 cols, motif=101 cols

Testing merge with loop 5:

Processing: 5loopfeats_enhanced.csv + 5loops.csv
den_graphs shape: (7, 178)
motif_features shape: (7, 101)
Adding 1 missing columns to den_graphs (filled with NaN)
After ensuring complete columns - den_graphs shape: (7, 179)
After ensuring complete columns - motif_features shape: (7, 101)
Common columns: 35
After removing duplicated columns - motif_features shape: (7,

In [11]:
df = pd.read_csv('/Users/rezadoobary/Documents/MLCORRELATORS/ML-correlator/Tree_classifier_for_graphs/new_stuff/features/merged/new2_merged/5loops_merged.csv')

In [13]:
# Process all loop files and merge them
print("=" * 60)
print("PROCESSING ALL LOOP FILES")
print("=" * 60)

successful_merges = []
failed_merges = []
all_merged_columns = {}  # Track columns in each merged file

for loop_num in sorted(common_loops):
    print(f"\n{'='*40}")
    print(f"Processing {loop_num}-loop files")
    print(f"{'='*40}")
    
    den_file = os.path.join(den_graphs_dir, den_file_map[loop_num])
    motif_file = os.path.join(motif_features_dir, motif_file_map[loop_num])
    output_file = os.path.join(output_dir, f"{loop_num}loops_merged.csv")
    
    try:
        merged_columns = merge_csv_files(den_file, motif_file, output_file, den_cols_union, motif_cols_union)
        if merged_columns:
            successful_merges.append(loop_num)
            all_merged_columns[loop_num] = set(merged_columns)
            print(f"✅ Successfully merged {loop_num}-loop files ({len(merged_columns)} columns)")
        else:
            failed_merges.append(loop_num)
            print(f"❌ Failed to merge {loop_num}-loop files")
    except Exception as e:
        failed_merges.append(loop_num)
        print(f"❌ Error processing {loop_num}-loop files: {str(e)}")

# Verify column consistency
print(f"\n{'='*60}")
print("COLUMN CONSISTENCY VERIFICATION")
print(f"{'='*60}")
if all_merged_columns:
    # Get union of all columns
    all_cols_union = set.union(*all_merged_columns.values())
    # Get intersection of all columns (columns present in all files)
    all_cols_intersection = set.intersection(*all_merged_columns.values())
    
    print(f"Columns present in ALL files: {len(all_cols_intersection)}")
    print(f"Total unique columns across all files: {len(all_cols_union)}")
    
    # Check for columns missing in some files
    inconsistent_cols = {}
    for loop_num, cols in all_merged_columns.items():
        missing = all_cols_union - cols
        if missing:
            inconsistent_cols[loop_num] = missing
    
    if inconsistent_cols:
        print(f"\n⚠️  WARNING: Found inconsistent columns across files:")
        for loop_num, missing_cols in inconsistent_cols.items():
            print(f"  {loop_num}-loop file missing {len(missing_cols)} columns: {sorted(list(missing_cols))[:5]}")
            if len(missing_cols) > 5:
                print(f"    ... and {len(missing_cols) - 5} more")
    else:
        print("\n✅ All files have identical column structures!")

print(f"\n{'='*60}")
print("SUMMARY")
print(f"{'='*60}")
print(f"Successfully merged: {successful_merges}")
print(f"Failed merges: {failed_merges}")
print(f"Total processed: {len(common_loops)}")
print(f"Success rate: {len(successful_merges)}/{len(common_loops)} ({len(successful_merges)/len(common_loops)*100:.1f}%)")


PROCESSING ALL LOOP FILES

Processing 5-loop files

Processing: 5loopfeats_enhanced.csv + 5loops.csv
den_graphs shape: (7, 178)
motif_features shape: (7, 101)
Adding 1 missing columns to den_graphs (filled with NaN)
After ensuring complete columns - den_graphs shape: (7, 179)
After ensuring complete columns - motif_features shape: (7, 101)
Common columns: 35
After removing duplicated columns - motif_features shape: (7, 66)
Concatenated shape: (7, 245)
Total columns: 245
Saved merged file to: /Users/rezadoobary/Documents/MLCORRELATORS/ML-correlator/Tree_classifier_for_graphs/new_stuff/features/merged/new2_merged/5loops_merged.csv
✅ Successfully merged 5-loop files (245 columns)

Processing 6-loop files

Processing: 6loopfeats_enhanced.csv + 6loops.csv
den_graphs shape: (31, 178)
motif_features shape: (31, 101)
Adding 1 missing columns to den_graphs (filled with NaN)
After ensuring complete columns - den_graphs shape: (31, 179)
After ensuring complete columns - motif_features shape: (31,

In [15]:
# Verify the merged files were created
print("=" * 60)
print("VERIFICATION")
print("=" * 60)

# List files in the output directory
output_files = os.listdir(output_dir)
csv_files = [f for f in output_files if f.endswith('.csv')]

print(f"Files created in {output_dir}:")
for file in sorted(csv_files):
    file_path = os.path.join(output_dir, file)
    file_size = os.path.getsize(file_path)
    print(f"  - {file} ({file_size:,} bytes)")

print(f"\nTotal merged files created: {len(csv_files)}")

# Show a sample of one merged file
if csv_files:
    sample_file = os.path.join(output_dir, csv_files[0])
    sample_df = pd.read_csv(sample_file)
    print(f"\nSample of {csv_files[0]}:")
    print(f"Shape: {sample_df.shape}")
    print(f"Columns: {len(sample_df.columns)}")
    print(f"First few column names: {list(sample_df.columns[:10])}")
    print(f"First few rows:")
    print(sample_df.head(2))


VERIFICATION
Files created in /Users/rezadoobary/Documents/MLCORRELATORS/ML-correlator/Tree_classifier_for_graphs/new_stuff/features/merged/new2_merged:
  - 10loops_merged.csv (445,720,585 bytes)
  - 11loops_merged.csv (4,936,597,498 bytes)
  - 5loops_merged.csv (23,910 bytes)
  - 6loops_merged.csv (85,329 bytes)
  - 7loops_merged.csv (446,806 bytes)
  - 8loops_merged.csv (4,037,792 bytes)
  - 9loops_merged.csv (40,296,068 bytes)

Total merged files created: 7

Sample of 5loops_merged.csv:
Shape: (7, 245)
Columns: 245
First few column names: ['Adjacency_energy', 'Adjacency_energy_over_fro', 'Adjacency_energy_per_node', 'Adjacency_estrada_index', 'Adjacency_estrada_per_node', 'Adjacency_moment_2', 'Adjacency_moment_2_over_avgdeg', 'Adjacency_moment_3', 'Adjacency_moment_3_over_avgdeg3', 'Adjacency_moment_4']
First few rows:
   Adjacency_energy  Adjacency_energy_over_fro  Adjacency_energy_per_node  \
0         17.059978                   2.632412                   1.895553   
1         1