In [2]:
import pandas as pd
import csv
import io
import os

In [None]:
# Raise the csv module's maximum allowed field size to 1,000,000,000
# (presumably because individual claim texts can be very long — confirm).
csv.field_size_limit(1_000_000_000)

# Column names, in file order, that are applied to the claims CSV when it is read
expected_columns = [
    'patent_id',
    'claim_sequence',
    'claim_text',
    'dependent',
    'claim_number',
    'exemplary',
]

def _first_line_is_header(first_line, columns):
    """Return True when any whole field of ``first_line`` equals one of ``columns``.

    The line is split with the csv module (using the same backslash escape
    character as the main read) and compared field by field, ignoring case and
    surrounding whitespace. A column name that merely occurs inside a longer
    field (e.g. the word "dependent" inside a claim text) does not count.
    """
    try:
        fields = next(csv.reader([first_line], escapechar='\\'), [])
    except csv.Error:
        # A line the csv module cannot split is not a row of plain column names
        return False
    known = {name.lower() for name in columns}
    return any(field.strip().lower() in known for field in fields)


def process_csv_safely(filename, output_file='processed_patent_claims.csv', chunk_size=5000):
    """Stream a claims CSV through pandas in chunks and write a cleaned copy.

    Every column is read as text using the names in the module-level
    ``expected_columns``. Per chunk, missing text values become empty strings
    and ``claim_sequence`` / ``claim_number`` are converted to integers, with
    missing or non-numeric values becoming 0. The first chunk overwrites
    ``output_file`` and writes the header; later chunks are appended.

    Args:
        filename: Path of the input CSV (read as UTF-8).
        output_file: Path of the CSV to write.
        chunk_size: Number of rows read and written per chunk.

    Returns:
        True when the input was read to the end, False when reading aborted
        with an error. A chunk that fails during cleaning or writing is
        reported and skipped (best effort) and does not change the return
        value; the number of skipped chunks is printed in the summary.

    Raises:
        OSError: If the size of ``filename`` cannot be determined (for example
            the file does not exist); this check runs outside the error handler.
    """
    # Check file size
    file_size = os.path.getsize(filename) / (1024 * 1024)  # Size in MB
    print(f"Input file size: {file_size:.2f} MB")

    first_chunk = True
    lines_processed = 0
    failed_chunks = 0

    try:
        # Decide whether the first line is a header by comparing whole fields,
        # so a data row that merely mentions a column name is not skipped.
        with open(filename, 'r', encoding='utf-8') as f:
            has_header = _first_line_is_header(f.readline(), expected_columns)

        # Read CSV in chunks; the context manager closes the underlying file
        # handle even if iteration stops early.
        with pd.read_csv(
            filename,
            chunksize=chunk_size,
            encoding='utf-8',
            on_bad_lines='warn',
            names=expected_columns,
            dtype=dict.fromkeys(expected_columns, str),  # read everything as text
            quoting=csv.QUOTE_ALL,
            escapechar='\\',
            engine='python',
            skiprows=1 if has_header else 0
        ) as reader:
            # Process each chunk and append to output file
            for i, chunk in enumerate(reader):
                try:
                    # Text columns: missing values become empty strings
                    for col in ('patent_id', 'claim_text', 'dependent', 'exemplary'):
                        chunk[col] = chunk[col].fillna('').astype(str)

                    # Numeric columns: missing or non-numeric values become 0
                    for col in ('claim_sequence', 'claim_number'):
                        chunk[col] = pd.to_numeric(chunk[col],
                                                   errors='coerce').fillna(0).astype(int)

                    # first_chunk stays True until a chunk is actually written,
                    # so the header is emitted even if earlier chunks failed.
                    mode = 'w' if first_chunk else 'a'
                    chunk.to_csv(output_file, mode=mode, header=first_chunk, index=False)
                    first_chunk = False

                    lines_processed += len(chunk)
                    print(f"Processed chunk {i + 1}: {lines_processed} lines total")

                except Exception as e:
                    # Best effort: report the chunk, remember it, and keep going
                    failed_chunks += 1
                    print(f"Error processing chunk {i + 1}: {e}")

        if failed_chunks:
            print(f"Warning: {failed_chunks} chunk(s) failed and were skipped")
        print(f"Successfully processed {lines_processed} lines total")
        print(f"Output saved to {output_file}")
        return True

    except Exception as e:
        print(f"Critical error processing file: {e}")
        return False

# Convert the raw claims file, reading 50000 rows per chunk
# (this overrides the function's default chunk_size of 5000).
success = process_csv_safely('patent_claims.csv', chunk_size=50000)

# When the conversion completed, load the first rows of the output as a sanity check
if success:
    df_sample = pd.read_csv('processed_patent_claims.csv', nrows=5)
    print("\nFirst few rows of processed data:", df_sample, sep="\n")

Input file size: 12165.25 MB
Processed chunk 1: 5000 lines total
Processed chunk 2: 10000 lines total
Processed chunk 3: 15000 lines total
Processed chunk 4: 20000 lines total
Processed chunk 5: 25000 lines total
Processed chunk 6: 30000 lines total
Processed chunk 7: 35000 lines total
Processed chunk 8: 40000 lines total
Processed chunk 9: 45000 lines total
Processed chunk 10: 50000 lines total
Processed chunk 11: 55000 lines total
Processed chunk 12: 60000 lines total
Processed chunk 13: 65000 lines total
Processed chunk 14: 70000 lines total
Processed chunk 15: 75000 lines total
Processed chunk 16: 80000 lines total
Processed chunk 17: 85000 lines total
Processed chunk 18: 90000 lines total
Processed chunk 19: 95000 lines total
Processed chunk 20: 100000 lines total
Processed chunk 21: 105000 lines total
Processed chunk 22: 110000 lines total
Processed chunk 23: 115000 lines total
Processed chunk 24: 120000 lines total
Processed chunk 25: 125000 lines total
Processed chunk 26: 13000

In [5]:
# Load only the first 1000 rows of the processed output and show the top five
patents = pd.read_csv(
    'processed_patent_claims.csv',
    nrows=1000,
)
print(patents.iloc[:5])

  patent_id  claim_sequence  \
0   4094757              11   
1   4074648               1   
2   RE29720               9   
3   4077678               7   
4   4113836               0   

                                          claim_text dependent  claim_number  \
0  12. In a process for the curing of Chlorobutyl...       NaN            12   
1  2. The adaptive autopilot of claim 1 further i...   claim 1             2   
2  10. An electronic timepiece as recited in clai...   claim 1            10   
3  8. The apparatus of claim 1, wherein an inner ...   claim 1             8   
4  1. Apparatus for continuously calcining finely...       NaN             1   

   exemplary  
0          0  
1          0  
2          0  
3          0  
4          0  


In [10]:
# Re-read the processed claims file in chunks and write each chunk to a second CSV.
# NOTE(review): no transformation is applied between read and write, so the
# output is a chunk-by-chunk copy of the input — confirm whether a cleaning
# step is still to be added here.
chunk_size = 50000
output_file = 'cleaned_patent_claims.csv'
first_chunk = True

# patent_id is pinned to str: without it pandas infers the dtype separately for
# every chunk, so a chunk containing only numeric-looking ids would be parsed as
# integers and rewritten in normalised form, while chunks with ids such as
# 'RE29720' stay text. The context manager closes the reader's file handle.
with pd.read_csv('processed_patent_claims.csv', chunksize=chunk_size,
                 dtype={'patent_id': str}) as reader:
    for chunk in reader:
        print(f"Chunk with {len(chunk)} rows:")
        print(chunk.head())

        # Save to file: overwrite and write the header for the first chunk,
        # append without a header for every later chunk
        mode = 'w' if first_chunk else 'a'
        header = first_chunk
        chunk.to_csv(output_file, mode=mode, header=header, index=False)

        first_chunk = False
        # break  # Uncomment to process only one chunk for testing

print(f"All chunks processed and saved to {output_file}")

Chunk with 50000 rows:
  patent_id  claim_sequence  \
0   4094757              11   
1   4074648               1   
2   RE29720               9   
3   4077678               7   
4   4113836               0   

                                          claim_text dependent  claim_number  \
0  12. In a process for the curing of Chlorobutyl...       NaN            12   
1  2. The adaptive autopilot of claim 1 further i...   claim 1             2   
2  10. An electronic timepiece as recited in clai...   claim 1            10   
3  8. The apparatus of claim 1, wherein an inner ...   claim 1             8   
4  1. Apparatus for continuously calcining finely...       NaN             1   

   exemplary  
0          0  
1          0  
2          0  
3          0  
4          0  
Chunk with 50000 rows:
      patent_id  claim_sequence  \
50000   4106336               1   
50001   4091446               3   
50002   4092841               3   
50003   4074832               2   
50004   4082961      