In [8]:
import os

import pandas as pd

In [13]:
def split_csv(file_path, output_folder, chunk_size=5000, sep=','):
    """
    Splits a large CSV file into multiple smaller CSV files.

    The input is read in chunks of at most ``chunk_size`` rows and each chunk
    is written to ``output_folder`` as ``chunk_1.csv``, ``chunk_2.csv``, ...
    Rows whose field count does not match the header are skipped (pandas
    reports each skipped line). Output files are written without the index
    column and with pandas' default comma separator, regardless of ``sep``.

    Parameters:
    file_path (str): Path to the input CSV file.
    output_folder (str): Folder where the smaller CSV files will be saved.
        It is created if it does not exist yet.
    chunk_size (int): Maximum number of rows per smaller CSV file.
    sep (str): Field delimiter of the input file.
    """
    # Create the destination up front so the first to_csv() cannot fail on a
    # missing directory. An empty string means "current directory".
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)

    read_options = dict(chunksize=chunk_size, sep=sep)
    try:
        # `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0;
        # `on_bad_lines='warn'` keeps the same skip-and-report behaviour.
        chunk_iter = pd.read_csv(file_path, on_bad_lines='warn', **read_options)
    except TypeError:
        # pandas < 1.3 does not know `on_bad_lines`; use the legacy keyword.
        chunk_iter = pd.read_csv(file_path, error_bad_lines=False, **read_options)

    try:
        for i, chunk in enumerate(chunk_iter, start=1):
            output_file = os.path.join(output_folder, f"chunk_{i}.csv")
            chunk.to_csv(output_file, index=False)
            print(f"Saved {output_file}")
    finally:
        # Release the input file handle even if writing a chunk fails.
        chunk_iter.close()

# Input file and destination folder; the chunks land next to the original file.
output_folder = 'Books/Books Dataset'
file_path = 'Books/Books Dataset/books.csv'

# The books file is read with ';' as the field delimiter.
split_csv(file_path=file_path, output_folder=output_folder, sep=';')


Saved Books/Books Dataset/chunk_1.csv
Saved Books/Books Dataset/chunk_2.csv
Saved Books/Books Dataset/chunk_3.csv
Saved Books/Books Dataset/chunk_4.csv


b'Skipping line 6452: expected 8 fields, saw 9\n'


Saved Books/Books Dataset/chunk_5.csv
Saved Books/Books Dataset/chunk_6.csv
Saved Books/Books Dataset/chunk_7.csv
Saved Books/Books Dataset/chunk_8.csv


b'Skipping line 43667: expected 8 fields, saw 10\n'
b'Skipping line 51751: expected 8 fields, saw 9\n'


Saved Books/Books Dataset/chunk_9.csv
Saved Books/Books Dataset/chunk_10.csv
Saved Books/Books Dataset/chunk_11.csv
Saved Books/Books Dataset/chunk_12.csv
Saved Books/Books Dataset/chunk_13.csv
Saved Books/Books Dataset/chunk_14.csv
Saved Books/Books Dataset/chunk_15.csv
Saved Books/Books Dataset/chunk_16.csv
Saved Books/Books Dataset/chunk_17.csv
Saved Books/Books Dataset/chunk_18.csv
Saved Books/Books Dataset/chunk_19.csv
Saved Books/Books Dataset/chunk_20.csv
Saved Books/Books Dataset/chunk_21.csv


b'Skipping line 92038: expected 8 fields, saw 9\n'
b'Skipping line 104319: expected 8 fields, saw 9\n'


Saved Books/Books Dataset/chunk_22.csv
Saved Books/Books Dataset/chunk_23.csv
Saved Books/Books Dataset/chunk_24.csv
Saved Books/Books Dataset/chunk_25.csv


b'Skipping line 121768: expected 8 fields, saw 9\n'


Saved Books/Books Dataset/chunk_26.csv
Saved Books/Books Dataset/chunk_27.csv
Saved Books/Books Dataset/chunk_28.csv
Saved Books/Books Dataset/chunk_29.csv


b'Skipping line 144058: expected 8 fields, saw 9\n'
b'Skipping line 150789: expected 8 fields, saw 9\n'
b'Skipping line 157128: expected 8 fields, saw 9\n'


Saved Books/Books Dataset/chunk_30.csv
Saved Books/Books Dataset/chunk_31.csv
Saved Books/Books Dataset/chunk_32.csv
Saved Books/Books Dataset/chunk_33.csv
Saved Books/Books Dataset/chunk_34.csv
Saved Books/Books Dataset/chunk_35.csv
Saved Books/Books Dataset/chunk_36.csv
Saved Books/Books Dataset/chunk_37.csv


b'Skipping line 180189: expected 8 fields, saw 9\n'
b'Skipping line 185738: expected 8 fields, saw 9\n'


Saved Books/Books Dataset/chunk_38.csv
Saved Books/Books Dataset/chunk_39.csv
Saved Books/Books Dataset/chunk_40.csv
Saved Books/Books Dataset/chunk_41.csv


b'Skipping line 209388: expected 8 fields, saw 9\n'
b'Skipping line 220626: expected 8 fields, saw 9\n'


Saved Books/Books Dataset/chunk_42.csv
Saved Books/Books Dataset/chunk_43.csv
Saved Books/Books Dataset/chunk_44.csv
Saved Books/Books Dataset/chunk_45.csv


b'Skipping line 227933: expected 8 fields, saw 11\nSkipping line 228957: expected 8 fields, saw 10\n'


Saved Books/Books Dataset/chunk_46.csv
Saved Books/Books Dataset/chunk_47.csv
Saved Books/Books Dataset/chunk_48.csv
Saved Books/Books Dataset/chunk_49.csv


b'Skipping line 245933: expected 8 fields, saw 9\n'
b'Skipping line 251296: expected 8 fields, saw 9\n'
b'Skipping line 259941: expected 8 fields, saw 9\n'
b'Skipping line 261529: expected 8 fields, saw 9\n'


Saved Books/Books Dataset/chunk_50.csv
Saved Books/Books Dataset/chunk_51.csv
Saved Books/Books Dataset/chunk_52.csv
Saved Books/Books Dataset/chunk_53.csv
Saved Books/Books Dataset/chunk_54.csv
Saved Books/Books Dataset/chunk_55.csv


In [2]:
def max_string_length(file_path, sep=','):
    """
    Calculate the maximum string length for each column in a CSV file.

    Every field is read as raw text, so the lengths describe what is actually
    stored in the file: missing values count as length 0 (not as the 3
    characters of ``'nan'``), numbers are not re-rendered as floats, and
    leading zeros are kept.

    Parameters:
    file_path (str): Path to the CSV file.
    sep (str): Field delimiter of the file. Defaults to ','.

    Returns:
    dict: A dictionary with column names as keys and maximum string lengths
        (plain ``int``) as values. Columns of a header-only file map to 0.
    """
    # dtype=str + keep_default_na=False disables type inference and NA
    # detection, so each cell keeps its exact text from the file.
    df = pd.read_csv(file_path, sep=sep, dtype=str, keep_default_na=False)

    # No data rows: max() over an empty column would be NaN, so report 0.
    if df.empty:
        return {col: 0 for col in df.columns}

    # fillna("") covers short rows, whose trailing fields pandas pads with NaN.
    return {col: int(df[col].fillna("").str.len().max()) for col in df.columns}

In [7]:
# Longest value per column of cu.csv; shown as the cell's result.
max_string_length(file_path='Books/Books Dataset/cu.csv')

{'userID': 6, 'Location': 103, 'Age': 2}