In [3]:
# row_to_json.py
import pandas as pd
import json
from datetime import datetime

def extract_row_as_json(csv_path, timestamp_str, output_file=None):
    """
    Extract a specific row from the raw dataset and convert it to JSON format.

    The row whose ``date_time`` equals ``timestamp_str`` is used. If there is
    no exact match, the row with the closest ``date_time`` is used instead,
    provided it lies within 5 minutes of the requested timestamp.

    Args:
        csv_path: Path to the CSV file (must contain a "date_time" column)
        timestamp_str: Timestamp string to find (e.g., "2014-11-15 06:30:00")
        output_file: Optional path to save JSON file

    Returns:
        dict: Row data as dictionary/JSON. Timestamp values are formatted as
        "%Y-%m-%d %H:%M:%S" strings and missing values are replaced by None.

    Raises:
        ValueError: If no row lies within 5 minutes of ``timestamp_str``
            (including when the dataset has no usable ``date_time`` values).
    """
    # Load the raw dataset
    df = pd.read_csv(csv_path, parse_dates=["date_time"])

    # Find the specific row
    target_ts = pd.to_datetime(timestamp_str)
    row = df.loc[df["date_time"] == target_ts]

    if row.empty:
        # Try to find nearest row within 5 minutes.
        # Rows with a missing date_time are dropped before searching: they can
        # never be the nearest row, and they would otherwise be able to
        # corrupt a sort-based lookup of the minimum.
        deltas = (df["date_time"] - target_ts).abs().dropna()
        if deltas.empty:
            # Empty dataset or no parseable timestamps at all.
            raise ValueError(f"No row found within 5 minutes of {timestamp_str}")
        # idxmin finds the closest row in a single pass (no full sort needed).
        row = df.loc[[deltas.idxmin()]]
        actual_ts = row.iloc[0]["date_time"]
        time_diff = abs(actual_ts - target_ts)
        if time_diff > pd.Timedelta(minutes=5):
            raise ValueError(f"No row found within 5 minutes of {timestamp_str}")
        print(f"Using nearest row at {actual_ts} (diff: {time_diff})")

    # Convert row to dictionary, making every value JSON-serializable
    row_dict = {}
    for key, value in row.iloc[0].to_dict().items():
        if isinstance(value, pd.Timestamp):
            # Timestamps are not JSON-serializable; store them as strings
            row_dict[key] = value.strftime("%Y-%m-%d %H:%M:%S")
        elif pd.isna(value):
            # NaN/NaT would be written as invalid JSON; use null instead
            row_dict[key] = None
        else:
            row_dict[key] = value

    # Save to file if requested
    if output_file:
        # Explicit encoding so the result does not depend on the locale
        with open(output_file, 'w', encoding="utf-8") as f:
            json.dump(row_dict, f, indent=2)
        print(f"Row data saved to {output_file}")

    return row_dict

def show_sample_rows(csv_path, n=5):
    """Print a short overview of the dataset to help select a timestamp.

    Prints the dataset shape, the min/max of ``date_time``, the first ``n``
    rows (``date_time`` plus the first five other columns) and the full list
    of column names.
    """
    frame = pd.read_csv(csv_path, parse_dates=["date_time"])
    stamps = frame["date_time"]

    # Preview only date_time and up to five of the remaining columns
    other_columns = [name for name in frame.columns if name != 'date_time']
    preview_columns = ['date_time', *other_columns[:5]]

    print(f"Dataset shape: {frame.shape}")
    print(f"Date range: {stamps.min()} to {stamps.max()}")
    print(f"\nFirst {n} rows:")
    print(frame[preview_columns].head(n))
    print(f"\nColumn names: {list(frame.columns)}")

# Example usage
if __name__ == "__main__":
    # Update with your CSV path
    dataset_path = "wt84_with_alarms.csv"

    # Print an overview first so a valid timestamp can be picked
    show_sample_rows(dataset_path)

    # Pull out one row (update timestamp as needed) and echo it as JSON.
    # Any failure is reported as a message instead of a traceback.
    try:
        extracted = extract_row_as_json(
            dataset_path,
            "2012-04-06 01:05:00",
            "sample_row.json",
        )
        print("\nExtracted row JSON:")
        print(json.dumps(extracted, indent=2))
    except Exception as exc:
        print(f"Error: {exc}")

Dataset shape: (210698, 319)
Date range: 2012-01-01 01:05:00 to 2014-12-08 06:20:00

First 5 rows:
            date_time  turbine_id  wgdc_avg_TriGri_PhV_phsA  \
0 2012-01-01 01:05:00          84                   392.239   
1 2012-01-01 01:10:00          84                   392.714   
2 2012-01-01 01:15:00          84                   393.002   
3 2012-01-01 01:20:00          84                   393.793   
4 2012-01-01 01:25:00          84                   394.844   

   wgdc_avg_TriGri_PhV_phsB  wgdc_avg_TriGri_PhV_phsC  wgdc_avg_TriGri_PhV  
0                   390.227                   392.632              391.241  
1                   390.730                   393.000              391.737  
2                   391.001                   393.061              392.002  
3                   391.750                   393.778              392.797  
4                   392.852                   394.829              393.850  

Column names: ['turbine_id', 'date_time', 'wgdc_avg_TriGri_