In [1]:
import pandas as pd
import os
import glob
from pathlib import Path

# Define the source directories
den_graphs_dir = "/Users/rezadoobary/Documents/MLCORRELATORS/ML-correlator/Tree_classifier_for_graphs/new_stuff/features/den_graphs"
motif_features_dir = "/Users/rezadoobary/Documents/MLCORRELATORS/ML-correlator/Tree_classifier_for_graphs/new_stuff/features/motif_features_den"
output_dir = "/Users/rezadoobary/Documents/MLCORRELATORS/ML-correlator/Tree_classifier_for_graphs/new_stuff/features/merged/new_merged"

print("Source directories:")
print(f"den_graphs: {den_graphs_dir}")
print(f"motif_features: {motif_features_dir}")
print(f"output: {output_dir}")

# List files in both directories
den_files = os.listdir(den_graphs_dir)
motif_files = os.listdir(motif_features_dir)

print(f"\nFiles in den_graphs: {den_files}")
print(f"Files in motif_features: {motif_files}")


Source directories:
den_graphs: /Users/rezadoobary/Documents/MLCORRELATORS/ML-correlator/Tree_classifier_for_graphs/new_stuff/features/den_graphs
motif_features: /Users/rezadoobary/Documents/MLCORRELATORS/ML-correlator/Tree_classifier_for_graphs/new_stuff/features/motif_features_den
output: /Users/rezadoobary/Documents/MLCORRELATORS/ML-correlator/Tree_classifier_for_graphs/new_stuff/features/merged/new_merged

Files in den_graphs: ['6loopfeats_enhanced.csv', '8loopfeats_enhanced.csv', '11loopfeats_enhanced.csv', '5loopfeats_enhanced.csv', '7loopfeats_enhanced.csv', '9loopfeats_enhanced.csv', '10loopfeats_enhanced.csv']
Files in motif_features: ['9loops.csv', '8loops.csv', '11loops.csv', '10loops.csv', '9loops_manifest.json', '5loops.csv', '7loops_manifest.json', '6loops_manifest.json', '8loops_manifest.json', '5loops_manifest.json', '10loops_manifest.json', '7loops.csv', '6loops.csv']


In [2]:
# Analyze file naming patterns and identify matching files
def extract_loop_number(filename):
    """Extract loop number from filename"""
    if '5loop' in filename:
        return 5
    elif '6loop' in filename:
        return 6
    elif '7loop' in filename:
        return 7
    elif '8loop' in filename:
        return 8
    elif '9loop' in filename:
        return 9
    elif '10loop' in filename:
        return 10
    elif '11loop' in filename:
        return 11
    return None

# Create mapping of loop numbers to files
den_file_map = {}
motif_file_map = {}

for file in den_files:
    if file.endswith('.csv'):
        loop_num = extract_loop_number(file)
        if loop_num:
            den_file_map[loop_num] = file

for file in motif_files:
    if file.endswith('.csv'):
        loop_num = extract_loop_number(file)
        if loop_num:
            motif_file_map[loop_num] = file

print("File mapping:")
print(f"den_graphs files: {den_file_map}")
print(f"motif_features files: {motif_file_map}")

# Find common loop numbers
common_loops = set(den_file_map.keys()) & set(motif_file_map.keys())
print(f"\nCommon loop numbers to process: {sorted(common_loops)}")


File mapping:
den_graphs files: {6: '6loopfeats_enhanced.csv', 8: '8loopfeats_enhanced.csv', 11: '11loopfeats_enhanced.csv', 5: '5loopfeats_enhanced.csv', 7: '7loopfeats_enhanced.csv', 9: '9loopfeats_enhanced.csv', 10: '10loopfeats_enhanced.csv'}
motif_features files: {9: '9loops.csv', 8: '8loops.csv', 11: '11loops.csv', 10: '10loops.csv', 5: '5loops.csv', 7: '7loops.csv', 6: '6loops.csv'}

Common loop numbers to process: [5, 6, 7, 8, 9, 10, 11]


In [7]:
# Function to concatenate CSV files and remove duplicated columns
def merge_csv_files(den_file_path, motif_file_path, output_path):
    """
    Concatenate two CSV files and remove duplicated columns.
    Columns with null/nan values will be ignored.
    """
    print(f"\nProcessing: {os.path.basename(den_file_path)} + {os.path.basename(motif_file_path)}")
    
    # Read the CSV files
    den_df = pd.read_csv(den_file_path)
    motif_df = pd.read_csv(motif_file_path)
    
    print(f"den_graphs shape: {den_df.shape}")
    print(f"motif_features shape: {motif_df.shape}")
    
    # Remove columns with any null/nan values from both dataframes
    print("Checking for columns with null/nan values...")
    
    # For den_df
    den_cols_with_nulls = den_df.columns[den_df.isnull().any()].tolist()
    if den_cols_with_nulls:
        print(f"Removing {len(den_cols_with_nulls)} columns with nulls from den_graphs: {den_cols_with_nulls}")
        den_df = den_df.drop(columns=den_cols_with_nulls)
    
    # For motif_df
    motif_cols_with_nulls = motif_df.columns[motif_df.isnull().any()].tolist()
    if motif_cols_with_nulls:
        print(f"Removing {len(motif_cols_with_nulls)} columns with nulls from motif_features: {motif_cols_with_nulls}")
        motif_df = motif_df.drop(columns=motif_cols_with_nulls)
    
    print(f"After removing null columns - den_graphs shape: {den_df.shape}")
    print(f"After removing null columns - motif_features shape: {motif_df.shape}")
    
    # Find common columns
    common_cols = set(den_df.columns) & set(motif_df.columns)
    print(f"Common columns: {len(common_cols)}")
    print(f"Common column names: {sorted(list(common_cols))}")
    
    # Remove duplicated columns from motif_df (keep den_df version)
    motif_df_unique = motif_df.drop(columns=list(common_cols))
    print(f"After removing duplicated columns - motif_features shape: {motif_df_unique.shape}")
    
    # Concatenate the dataframes
    merged_df = pd.concat([den_df, motif_df_unique], axis=1)
    
    print(f"Concatenated shape: {merged_df.shape}")
    
    # Save the merged file
    merged_df.to_csv(output_path, index=False)
    print(f"Saved merged file to: {output_path}")
    
    return True

# Test with one file pair first
if common_loops:
    test_loop = min(common_loops)
    den_file = os.path.join(den_graphs_dir, den_file_map[test_loop])
    motif_file = os.path.join(motif_features_dir, motif_file_map[test_loop])
    output_file = os.path.join(output_dir, f"{test_loop}loops_merged.csv")
    
    print(f"\nTesting merge with loop {test_loop}:")
    success = merge_csv_files(den_file, motif_file, output_file)
    
    if success:
        print("Test merge successful!")
    else:
        print("Test merge failed!")



Testing merge with loop 5:

Processing: 5loopfeats_enhanced.csv + 5loops.csv
den_graphs shape: (7, 178)
motif_features shape: (7, 101)
Checking for columns with null/nan values...
Removing 5 columns with nulls from den_graphs: ['Basic_degree_skew', 'Assortativity_degree', 'Centrality_closeness_skew', 'Centrality_eigenvector_skew', 'Spectral_lap_eig_9']
Removing 1 columns with nulls from motif_features: ['Spectral_lap_eig_9']
After removing null columns - den_graphs shape: (7, 173)
After removing null columns - motif_features shape: (7, 100)
Common columns: 34
Common column names: ['COEFFICIENTS', 'Motif_4_cliques', 'Motif_4_cliques_per_Cn4', 'Motif_4_cycles', 'Motif_4_cycles_per_Cn4', 'Motif_induced_connected_per_4set', 'Motif_square_clustering_proxy', 'Motif_triangle_edge_frac_ge2', 'Motif_triangle_edge_frac_zero', 'Motif_triangle_edge_incidence_mean', 'Motif_triangle_edge_incidence_median', 'Motif_triangle_edge_incidence_q90', 'Motif_triangle_edge_incidence_std', 'Motif_triangles', 

In [8]:
# Process all loop files and merge them
print("=" * 60)
print("PROCESSING ALL LOOP FILES")
print("=" * 60)

successful_merges = []
failed_merges = []

for loop_num in sorted(common_loops):
    print(f"\n{'='*40}")
    print(f"Processing {loop_num}-loop files")
    print(f"{'='*40}")
    
    den_file = os.path.join(den_graphs_dir, den_file_map[loop_num])
    motif_file = os.path.join(motif_features_dir, motif_file_map[loop_num])
    output_file = os.path.join(output_dir, f"{loop_num}loops_merged.csv")
    
    try:
        success = merge_csv_files(den_file, motif_file, output_file)
        if success:
            successful_merges.append(loop_num)
            print(f"✅ Successfully merged {loop_num}-loop files")
        else:
            failed_merges.append(loop_num)
            print(f"❌ Failed to merge {loop_num}-loop files")
    except Exception as e:
        failed_merges.append(loop_num)
        print(f"❌ Error processing {loop_num}-loop files: {str(e)}")

print(f"\n{'='*60}")
print("SUMMARY")
print(f"{'='*60}")
print(f"Successfully merged: {successful_merges}")
print(f"Failed merges: {failed_merges}")
print(f"Total processed: {len(common_loops)}")
print(f"Success rate: {len(successful_merges)}/{len(common_loops)} ({len(successful_merges)/len(common_loops)*100:.1f}%)")


PROCESSING ALL LOOP FILES

Processing 5-loop files

Processing: 5loopfeats_enhanced.csv + 5loops.csv
den_graphs shape: (7, 178)
motif_features shape: (7, 101)
Checking for columns with null/nan values...
Removing 5 columns with nulls from den_graphs: ['Basic_degree_skew', 'Assortativity_degree', 'Centrality_closeness_skew', 'Centrality_eigenvector_skew', 'Spectral_lap_eig_9']
Removing 1 columns with nulls from motif_features: ['Spectral_lap_eig_9']
After removing null columns - den_graphs shape: (7, 173)
After removing null columns - motif_features shape: (7, 100)
Common columns: 34
Common column names: ['COEFFICIENTS', 'Motif_4_cliques', 'Motif_4_cliques_per_Cn4', 'Motif_4_cycles', 'Motif_4_cycles_per_Cn4', 'Motif_induced_connected_per_4set', 'Motif_square_clustering_proxy', 'Motif_triangle_edge_frac_ge2', 'Motif_triangle_edge_frac_zero', 'Motif_triangle_edge_incidence_mean', 'Motif_triangle_edge_incidence_median', 'Motif_triangle_edge_incidence_q90', 'Motif_triangle_edge_incidence_st

In [9]:
# Verify the merged files were created
print("=" * 60)
print("VERIFICATION")
print("=" * 60)

# List files in the output directory
output_files = os.listdir(output_dir)
csv_files = [f for f in output_files if f.endswith('.csv')]

print(f"Files created in {output_dir}:")
for file in sorted(csv_files):
    file_path = os.path.join(output_dir, file)
    file_size = os.path.getsize(file_path)
    print(f"  - {file} ({file_size:,} bytes)")

print(f"\nTotal merged files created: {len(csv_files)}")

# Show a sample of one merged file
if csv_files:
    sample_file = os.path.join(output_dir, csv_files[0])
    sample_df = pd.read_csv(sample_file)
    print(f"\nSample of {csv_files[0]}:")
    print(f"Shape: {sample_df.shape}")
    print(f"Columns: {len(sample_df.columns)}")
    print(f"First few column names: {list(sample_df.columns[:10])}")
    print(f"First few rows:")
    print(sample_df.head(2))


VERIFICATION
Files created in /Users/rezadoobary/Documents/MLCORRELATORS/ML-correlator/Tree_classifier_for_graphs/new_stuff/features/merged/new_merged:
  - 10loops_merged.csv (434,097,040 bytes)
  - 11loops_merged.csv (4,839,179,828 bytes)
  - 5loops_merged.csv (23,330 bytes)
  - 6loops_merged.csv (82,500 bytes)
  - 7loops_merged.csv (440,700 bytes)
  - 8loops_merged.csv (3,933,396 bytes)
  - 9loops_merged.csv (39,497,055 bytes)

Total merged files created: 7

Sample of 5loops_merged.csv:
Shape: (7, 239)
Columns: 239
First few column names: ['COEFFICIENTS', 'Basic_num_nodes', 'Basic_num_edges', 'Basic_min_degree', 'Basic_max_degree', 'Basic_avg_degree', 'Basic_degree_std', 'Basic_density', 'Basic_edge_to_node_ratio', 'Basic_degree_entropy']
First few rows:
   COEFFICIENTS  Basic_num_nodes  Basic_num_edges  Basic_min_degree  \
0             1                9               21                 4   
1             1                9               20                 4   

   Basic_max_degree