In [1]:
import csv
import sys
import os

In [2]:
# File names: raw daily quotes in, trimmed USDX series out
input_filename = 'DX-Y.NYB_15year_daily_data.csv'
output_filename = 'Final USDX_processed_data.csv'

# Zero-based positions of the columns kept from each input record:
#   Date   -> 1st element
#   Close  -> 5th element
#   Volume -> 6th element
date_index, close_index, volume_index = 0, 4, 5

print(f"Processing {input_filename} directly to {output_filename}...")

# Only existence is verified here (not readability): report the size when the
# input is present, otherwise stop before any processing is attempted.
if os.path.exists(input_filename):
    print(f"Input file '{input_filename}' found.")
    file_size = os.path.getsize(input_filename)
    print(f"Input file size: {file_size} bytes")
else:
    print(f"ERROR: Input file '{input_filename}' does not exist!")
    sys.exit(1)

Processing DX-Y.NYB_15year_daily_data.csv directly to Final USDX_processed_data.csv...
Input file 'DX-Y.NYB_15year_daily_data.csv' found.
Input file size: 359366 bytes


In [3]:
# Extract the Date / Close / Volume columns from the input CSV into a new CSV
# with the header "Date, USDX Close, USDX Volume". The time-of-day part of the
# date is dropped. Relies on input_filename, output_filename, date_index,
# close_index and volume_index from the configuration cell.


def _split_record(raw_line):
    """Return the whitespace-stripped fields of one input record.

    A record wrapped in a single pair of parentheses, e.g. ``(a, b, c)``, is
    unwrapped first; a plain CSV record is parsed as-is. Parsing goes through
    the ``csv`` module so that a quoted field containing a comma stays one
    field instead of being split in two.

    Args:
        raw_line: One non-empty input line, already stripped of its newline.

    Returns:
        list[str]: The record's fields with surrounding whitespace removed.

    Raises:
        csv.Error: If the csv module rejects the record.
    """
    content = raw_line
    if content.startswith('(') and content.endswith(')'):
        content = content[1:-1].strip()
    # skipinitialspace lets `a, "b,c"` be read as two fields.
    parsed = next(csv.reader([content], skipinitialspace=True), [])
    return [field.strip() for field in parsed]


try:
    # Show the first few raw lines so the input layout is visible in the output.
    # 'utf-8-sig' decodes exactly like 'utf-8' but drops a leading BOM, which
    # would otherwise make the header comparison below fail spuriously.
    print("Previewing first 5 lines of input file:")
    with open(input_filename, 'r', encoding='utf-8-sig') as preview:
        for i, line in enumerate(preview):
            if i >= 5:  # Just show first 5 lines
                break
            print(f"Line {i+1}: {line.strip()}")

    expected_header = "Date,Open,High,Low,Close,Volume"
    # A record must reach the highest requested column index to be usable.
    min_fields = max(date_index, close_index, volume_index) + 1

    rows_processed = 0  # every input line read, header and blanks included
    rows_written = 0    # data rows emitted to the output file

    with open(input_filename, 'r', encoding='utf-8-sig') as infile, \
         open(output_filename, 'w', newline='', encoding='utf-8') as outfile:
        writer = csv.writer(outfile)
        # Header row for the output file with the new column names
        writer.writerow(['Date', 'USDX Close', 'USDX Volume'])
        print("Header row written to output file with columns: Date, USDX Close, USDX Volume")

        for line_number, raw_line in enumerate(infile, start=1):
            rows_processed += 1
            line = raw_line.strip()

            # The first line is the header: validate it, never copy it.
            if line_number == 1:
                if line != expected_header:
                    print(f"Warning: Unexpected header found on line 1: '{line}'")
                continue

            if not line:
                print(f"Skipping empty line at line {line_number}")
                continue

            # Progress trace: the first few lines, then every 1000th line.
            if line_number <= 5 or line_number % 1000 == 0:
                print(f"Processing line {line_number}: {line[:50]}...")

            # Only parsing is guarded per line. Write failures propagate to the
            # outer handler, which removes the incomplete output file instead
            # of reporting a truncated file as a successful run.
            try:
                fields = _split_record(line)
            except csv.Error as parse_error:
                print(f"Error processing line {line_number}: '{line}'. Error: {parse_error}")
                continue

            if len(fields) < min_fields:
                print(f"Warning: Line {line_number} has fewer fields than expected ({len(fields)}): '{line}'")
                continue

            # Keep only the part of the date before the first space (drops the time).
            cleaned_date = fields[date_index].split(' ')[0]
            close_val = fields[close_index]
            volume_val = fields[volume_index]

            writer.writerow([cleaned_date, close_val, volume_val])
            rows_written += 1

            # Progress trace: the first row written, then every 1000th row.
            if rows_written == 1 or rows_written % 1000 == 0:
                print(f"Wrote row {rows_written}: {cleaned_date}, {close_val}, {volume_val}")

    print(f"Processing complete. Total rows processed: {rows_processed}, rows written: {rows_written}")
    print(f"Output saved to {output_filename}")

    # Sanity check: the output file should exist and be non-empty.
    if os.path.exists(output_filename):
        output_size = os.path.getsize(output_filename)
        print(f"Output file size: {output_size} bytes")
        if output_size == 0:
            print("ERROR: Output file is empty!")
        else:
            print("Output file contains data.")

except FileNotFoundError:
    print(f"Error: Input file '{input_filename}' not found.")
    sys.exit(1)
except Exception as e:
    print(f"An error occurred: {e}")
    import traceback
    traceback.print_exc()  # Print detailed error information
    # Attempt to remove the potentially incomplete output file if error occurs
    if os.path.exists(output_filename):
        try:
            os.remove(output_filename)
            print(f"Removed incomplete output file: {output_filename}")
        except OSError as remove_error:
            print(f"Error removing incomplete output file {output_filename}: {remove_error}")
    sys.exit(1)

Previewing first 5 lines of input file:
Line 1: Date,Open,High,Low,Close,Volume
Line 2: 2010-05-12 12:00:00,84.70999908447266,84.93000030517578,84.20999908447266,84.83000183105469,0.0
Line 3: 2010-05-13 12:00:00,84.80000305175781,85.45999908447266,84.54000091552734,85.20999908447266,0.0
Line 4: 2010-05-14 12:00:00,85.36000061035156,86.29000091552734,85.19999694824219,86.0999984741211,0.0
Line 5: 2010-05-17 12:00:00,86.20999908447266,87.05999755859375,86.05000305175781,86.20999908447266,0.0
Header row written to output file with columns: Date, USDX Close, USDX Volume
Processing line 2: 2010-05-12 12:00:00,84.70999908447266,84.930000305...
Wrote row 1: 2010-05-12, 84.83000183105469, 0.0
Processing line 3: 2010-05-13 12:00:00,84.80000305175781,85.459999084...
Processing line 4: 2010-05-14 12:00:00,85.36000061035156,86.290000915...
Processing line 5: 2010-05-17 12:00:00,86.20999908447266,87.059997558...
Processing line 1000: 2014-04-30 12:00:00,79.80999755859375,79.919998168...
Wrote row 1