In [6]:
import pandas as pd

def segment_by_time_gap(coordinates, timestamps, gap_hours=2):
    """
    Segment coordinates based on time gaps greater than specified hours.
    
    Parameters:
    - coordinates: list of tuples (X, Y) or array of coordinates
    - timestamps: list of datetime objects or timestamps
    - gap_hours: time gap threshold in hours (default: 2)
    
    Returns:
    - List of segments, where each segment is a list of (coordinate, timestamp) tuples
    """
    if len(coordinates) == 0 or len(timestamps) == 0:
        return []
    
    # Convert timestamps to pandas datetime if not already
    timestamps = pd.to_datetime(timestamps)
    
    # Calculate time differences between consecutive points
    time_diffs = timestamps[1:] - timestamps[:-1]
    
    # Find indices where gap is greater than threshold
    gap_threshold = pd.Timedelta(hours=gap_hours)
    split_indices = [0]  # Start with first index
    
    for i, diff in enumerate(time_diffs):
        if diff > gap_threshold:
            split_indices.append(i + 1)
    
    split_indices.append(len(coordinates))  # Add end index
    
    # Create segments
    segments = []
    for i in range(len(split_indices) - 1):
        start_idx = split_indices[i]
        end_idx = split_indices[i + 1]
        segment = list(zip(coordinates[start_idx:end_idx], timestamps[start_idx:end_idx]))
        segments.append(segment)
    
    return segments


def add_segment_column_to_csv(file_path, gap_hours=2):
    """
    Add a 'segment' column to CSV data based on time gaps.
    
    Parameters:
    - file_path: path to the CSV file
    - gap_hours: time gap threshold in hours (default: 2)
    
    Returns:
    - DataFrame with added 'segment' column
    """
    # Read CSV - first row has headers
    data = pd.read_csv(file_path)
    
    # Convert Timestamp to datetime
    data['Timestamp'] = pd.to_datetime(data['Timestamp'])
    
    # Segment by time gap
    coordinates = list(zip(data['X'], data['Y']))
    timestamps = data['Timestamp'].tolist()
    
    segments = segment_by_time_gap(coordinates, timestamps, gap_hours)
    
    # Create a segment column
    segment_col = []
    for seg_idx, segment in enumerate(segments):
        segment_col.extend([seg_idx] * len(segment))
    
    data['segment'] = segment_col
    
    return data


# Example usage with the file from context
example_file = 'data/TERBINAFINE+/coordinates_highestspeed_20240827_11_3_with_time_speed.csv'
segmented_data = add_segment_column_to_csv(example_file)

print(f"Total rows: {len(segmented_data)}")
print(f"Number of segments: {segmented_data['segment'].nunique()}")
print(f"\nSegment sizes:")
print(segmented_data.groupby('segment').size())
print("\nFirst few rows:")
print(segmented_data.tail(10))

Total rows: 54000
Number of segments: 60

Segment sizes:
segment
0     901
1     900
2     900
3     900
4     900
5     900
6     900
7     900
8     900
9     900
10    900
11    900
12    900
13    900
14    900
15    900
16    900
17    900
18    900
19    900
20    900
21    900
22    900
23    900
24    900
25    900
26    900
27    900
28    900
29    900
30    900
31    900
32    900
33    900
34    900
35    900
36    900
37    900
38    900
39    900
40    900
41    900
42    900
43    900
44    900
45    900
46    900
47    900
48    900
49    900
50    900
51    900
52    900
53    900
54    900
55    900
56    900
57    900
58    900
59    899
dtype: int64

First few rows:
       GlobalFrame                  Timestamp     Speed      Fragment  \
53990        53991 2024-09-11 10:12:40.003827  0.159871  fragment_4_3   
53991        53992 2024-09-11 10:12:42.003887  0.043081  fragment_4_3   
53992        53993 2024-09-11 10:12:44.004427  0.345708  fragment_4_3   
53993        

In [7]:
import os
from pathlib import Path

def process_all_coordinate_files(input_dir='data', output_dir='data/partly_processed', gap_hours=2):
    """
    Process all coordinate CSV files by adding segment information.
    
    Parameters:
    - input_dir: root directory containing TERBINAFINE folders (default: 'data')
    - output_dir: directory to save processed files (default: 'data/partly_processed')
    - gap_hours: time gap threshold in hours for segmentation (default: 2)
    
    Returns:
    - Dictionary with processing statistics
    """
    # Create output directory if it doesn't exist
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)
    
    # Process both TERBINAFINE- and TERBINAFINE+ folders
    folders = ['TERBINAFINE- (control)', 'TERBINAFINE+']
    stats = {
        'total_files': 0,
        'processed_files': 0,
        'failed_files': [],
        'files_by_folder': {}
    }
    
    for folder in folders:
        folder_path = Path(input_dir) / folder
        
        if not folder_path.exists():
            print(f"Folder not found: {folder_path}")
            continue
        
        # Create corresponding output subfolder
        output_subfolder = output_path / folder
        output_subfolder.mkdir(parents=True, exist_ok=True)
        
        # Find all coordinate CSV files
        csv_files = list(folder_path.glob('coordinates_*.csv'))
        stats['files_by_folder'][folder] = len(csv_files)
        
        print(f"\nProcessing {len(csv_files)} files from {folder}...")
        
        for csv_file in csv_files:
            stats['total_files'] += 1
            
            try:
                # Process the file
                segmented_data = add_segment_column_to_csv(str(csv_file), gap_hours)
                
                # Save to output directory with same filename
                output_file = output_subfolder / csv_file.name
                segmented_data.to_csv(output_file, index=False)
                
                stats['processed_files'] += 1
                
                # Print progress every 10 files
                if stats['processed_files'] % 10 == 0:
                    print(f"  Processed {stats['processed_files']}/{stats['total_files']} files...")
                
            except Exception as e:
                print(f"  Error processing {csv_file.name}: {str(e)}")
                stats['failed_files'].append(str(csv_file))
    
    print(f"\n{'='*60}")
    print(f"Processing complete!")
    print(f"Total files: {stats['total_files']}")
    print(f"Successfully processed: {stats['processed_files']}")
    print(f"Failed: {len(stats['failed_files'])}")
    
    for folder, count in stats['files_by_folder'].items():
        print(f"  {folder}: {count} files")
    
    if stats['failed_files']:
        print(f"\nFailed files:")
        for f in stats['failed_files']:
            print(f"  - {f}")
    
    return stats


# Run the processing
processing_stats = process_all_coordinate_files(gap_hours=2)


Processing 52 files from TERBINAFINE- (control)...
  Processed 10/10 files...
  Processed 10/10 files...
  Processed 20/20 files...
  Processed 20/20 files...
  Processed 30/30 files...
  Processed 30/30 files...
  Processed 40/40 files...
  Processed 40/40 files...
  Processed 50/50 files...
  Processed 50/50 files...

Processing 52 files from TERBINAFINE+...

Processing 52 files from TERBINAFINE+...
  Processed 60/60 files...
  Processed 60/60 files...
  Processed 70/70 files...
  Processed 70/70 files...
  Processed 80/80 files...
  Processed 80/80 files...
  Processed 90/90 files...
  Processed 90/90 files...
  Processed 100/100 files...
  Processed 100/100 files...

Processing complete!
Total files: 104
Successfully processed: 104
Failed: 0
  TERBINAFINE- (control): 52 files
  TERBINAFINE+: 52 files

Processing complete!
Total files: 104
Successfully processed: 104
Failed: 0
  TERBINAFINE- (control): 52 files
  TERBINAFINE+: 52 files


In [None]:
# Now rename the files by their wormid directly and save them all in the same processed folder

import os
from pathlib import Path

input_dir = 'data/partly_processed'
output_dir = 'data/processed'

Path(output_dir).mkdir(parents=True, exist_ok=True)

processed_count = 0
for folder in ['TERBINAFINE- (control)', 'TERBINAFINE+']:
    folder_path = Path(input_dir) / folder
    csv_files = list(folder_path.glob('coordinates_*.csv'))
    
    print(f"Processing {len(csv_files)} files from {folder}...")
    
    for csv_file in csv_files:
        # Extract wormid from filename
        # Format: coordinates_highestspeed_YYYYMMDD_XX_Y_with_time_speed.csv
        # We want: YYYYMMDD_XX_Y
        filename_parts = csv_file.stem.split('_')  # Use stem to remove .csv
        # Parts: ['coordinates', 'highestspeed', 'YYYYMMDD', 'XX', 'Y', 'with', 'time', 'speed']
        date = filename_parts[2]
        hour = filename_parts[3]
        worm_num = filename_parts[4]
        wormid = f"{date}_{hour}_{worm_num}"
        
        # Define new output file path
        output_file = Path(output_dir) / f'{wormid}.csv'
        
        # Copy the file to the new location with the new name
        segmented_data = pd.read_csv(csv_file)
        segmented_data.to_csv(output_file, index=False)
        processed_count += 1

print(f"\nProcessing complete! Renamed and saved {processed_count} files to {output_dir}")
