In [1]:
import dask.dataframe as dd
import pandas as pd
import numpy as np
import re

In [2]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# Used Cars Data Cleaner

Assuming that the raw data has been downloaded, this notebook takes the following class to clean it. 

1. Projec

In [4]:
df = pd.read_csv('/home/jwc/Data/UsedCars/used_cars_data.csv', nrows=1)
df

Unnamed: 0,vin,back_legroom,bed,bed_height,bed_length,body_type,cabin,city,city_fuel_economy,combine_fuel_economy,daysonmarket,dealer_zip,description,engine_cylinders,engine_displacement,engine_type,exterior_color,fleet,frame_damaged,franchise_dealer,franchise_make,front_legroom,fuel_tank_volume,fuel_type,has_accidents,height,highway_fuel_economy,horsepower,interior_color,isCab,is_certified,is_cpo,is_new,is_oemcpo,latitude,length,listed_date,listing_color,listing_id,longitude,main_picture_url,major_options,make_name,maximum_seating,mileage,model_name,owner_count,power,price,salvage,savings_amount,seller_rating,sp_id,sp_name,theft_title,torque,transmission,transmission_display,trimId,trim_name,vehicle_damage_category,wheel_system,wheel_system_display,wheelbase,width,year
0,ZACNJABB5KPJ92081,35.1 in,,,,SUV / Crossover,,Bayamon,,,522,960,[!@@Additional Info@@!]Engine: 2.4L I4 ZERO EV...,I4,1300.0,I4,Solar Yellow,,,True,Jeep,41.2 in,12.7 gal,Gasoline,,66.5 in,,177.0,Black,,,,True,,18.3988,166.6 in,2019-04-06,YELLOW,237132766,-66.1582,https://static.cargurus.com/images/forsale/202...,['Quick Order Package'],Jeep,5 seats,7.0,Renegade,,"177 hp @ 5,750 RPM",23141.0,,0,2.8,370599,Flagship Chrysler,,"200 lb-ft @ 1,750 RPM",A,9-Speed Automatic Overdrive,t83804,Latitude FWD,,FWD,Front-Wheel Drive,101.2 in,79.6 in,2019


In [5]:
from dask.diagnostics import ProgressBar
ProgressBar().register()

In [6]:
boolean_cols = ['frame_damaged', 'has_accidents', 'is_new']
numeric_cols = ['daysonmarket', 'height', 'horsepower', 'length', 'mileage', 'seller_rating']
categorical_cols = []
special_cols = ['price', 'listing_id']
useful_cols = boolean_cols + numeric_cols + categorical_cols + special_cols

In [7]:
# Helper function to clean numeric-like strings and handle NaN values like '--'
def clean_numeric(value):
    # Handle NaN cases
    if isinstance(value, str) and value.strip() == '--':
        return np.nan
    # Extract numeric part from values like 'X.XX in'
    if isinstance(value, str):
        match = re.search(r'[-+]?[0-9]*\.?[0-9]+', value)
        if match:
            return float(match.group(0))
        else:
            print(value)
    # Convert to numeric if possible
    return pd.to_numeric(value, errors='coerce')

# Step 1: Clean specific columns, retain useful columns, and save the intermediate cleaned CSV
def clean_and_save_csv(input_csv, output_csv):
    """
    Clean specific string columns that represent numeric concepts, retain useful columns, 
    and save the cleaned DataFrame to an intermediate CSV file.
    
    Parameters:
    input_csv (str): Path to the input CSV file.
    output_csv (str): Path to the output CSV file.
    """
    # Load the CSV with all columns as strings initially
    df = dd.read_csv(input_csv, dtype=str)
    
    # Retain useful columns
    df = df[useful_cols]
    df = df.loc[:, ~df.columns.duplicated()]
    
    # Convert specific columns to numeric by cleaning them
    for numeric_col in numeric_cols:
        df[numeric_col] = df[numeric_col].map_partitions(
            lambda s: s.apply(clean_numeric), meta=(numeric_col, 'f8')
        )

    for col in boolean_cols:
        # Convert strings like 'True'/'False' and 1/0 into boolean True/False, and anything else into NaN
        df[col] = df[col].apply(lambda x: 1.0 if x in ['True', 'true', '1', 1] else (0.0 if x in ['False', 'false', '0', 0] else np.nan), meta=('col', 'f8'))
    
    # Save the cleaned DataFrame to the output CSV
    df.to_csv(output_csv, single_file=True, index=False)

In [9]:
clean_and_save_csv('/home/jwc/Data/UsedCars/used_cars_data.csv', '/home/jwc/Data/UsedCars/used_cars_data_1.csv')

[########################################] | 100% Completed | 80.50 s


In [10]:
df = pd.read_csv('/home/jwc/Data/UsedCars/used_cars_data_1.csv', nrows=200, skiprows=range(1,0))
df

Unnamed: 0,frame_damaged,has_accidents,is_new,daysonmarket,height,horsepower,length,mileage,seller_rating,price,listing_id
0,,,1.0,522.0,66.5,177.0,166.6,7.0,2.8,23141.0,237132766
1,,,1.0,207.0,68.0,246.0,181.0,8.0,3.0,46500.0,265946296
2,0.0,0.0,0.0,1233.0,58.1,305.0,180.9,,,46995.0,173473508
3,,,1.0,196.0,73.0,340.0,195.1,11.0,3.0,67430.0,266911050
4,,,1.0,137.0,68.0,246.0,181.0,7.0,3.0,48880.0,270957414
5,0.0,0.0,1.0,242.0,66.3,247.0,188.9,12.0,3.0,66903.0,262940541
6,,,1.0,447.0,56.9,186.0,183.5,14.0,2.8,23695.0,244110426
7,,,1.0,70.0,66.3,247.0,188.9,11.0,3.0,68520.0,275458784
8,,,1.0,196.0,68.0,246.0,181.0,8.0,3.0,51245.0,266911040
9,0.0,0.0,0.0,510.0,64.9,296.0,172.1,254.0,3.0,84399.0,238225156


In [11]:
def gather_statistics(cleaned_csv):
    """
    Gather statistics from the cleaned CSV file:
    1. For numeric columns, compute mean (excluding NaN values).
    2. For categorical columns, compute mode.
    3. For boolean columns, compute the proportion of True values (excluding NaN/None values).
    
    Parameters:
    cleaned_csv (str): Path to the cleaned CSV file.
    
    Returns:
    dict: A dictionary with statistics for each column.
    """
    # Load the cleaned CSV
    df = dd.read_csv(cleaned_csv)
    
    # Initialize the dictionary to store statistics
    stats = {}
    
    # Process numeric columns
    for col in numeric_cols:
        print(col)
        if col in df.columns:
            mean_value = df[col].mean().compute()
            stats[col] = {'mean': mean_value}
    
    # Process categorical columns (mode)
    for col in categorical_cols:
        print(col)
        if col in df.columns:
            mode_value = df[col].mode().compute()[0] if not df[col].mode().compute().empty else None
            stats[col] = {'mode': mode_value}
    
    # Process boolean columns (proportion of True)
    for col in boolean_cols:
        if col in df.columns:
            mean_value = df[col].mean().compute()
            stats[col] = {'mean': mean_value}
    
    return stats

In [12]:
stats = gather_statistics('/home/jwc/Data/UsedCars/used_cars_data_1.csv')

daysonmarket
[########################################] | 100% Completed | 302.88 ms
height
[########################################] | 100% Completed | 304.44 ms
horsepower
[########################################] | 100% Completed | 304.99 ms
length
[########################################] | 100% Completed | 304.88 ms
mileage
[########################################] | 100% Completed | 303.81 ms
seller_rating
[########################################] | 100% Completed | 303.91 ms
[########################################] | 100% Completed | 304.05 ms
[########################################] | 100% Completed | 304.38 ms
[########################################] | 100% Completed | 303.58 ms


In [13]:
stats

{'daysonmarket': {'mean': 76.05972920361062},
 'height': {'mean': 65.8719297553817},
 'horsepower': {'mean': 247.9957102248012},
 'length': {'mean': 193.69298105772816},
 'mileage': {'mean': 31146.899743421207},
 'seller_rating': {'mean': 4.2704132058199535},
 'frame_damaged': {'mean': 0.009502079831198421},
 'has_accidents': {'mean': 0.1544089561440025},
 'is_new': {'mean': 0.49033912881161584}}

In [14]:
def impute_missing_values(cleaned_csv, output_csv, stats):
    """
    Impute missing values in the cleaned CSV file using statistics.
    
    Parameters:
    cleaned_csv (str): Path to the cleaned CSV file.
    output_csv (str): Path to the output CSV file after imputation.
    stats (dict): A dictionary with statistics (mean for numeric, mode for categorical, proportion for boolean).
    """
    # Load the cleaned CSV
    df = dd.read_csv(cleaned_csv)
    
    for col in numeric_cols:
        if col in stats and 'mean' in stats[col]:
            mean_value = stats[col]['mean']
            df[col] = df[col].fillna(mean_value)

    for col in categorical_cols:
        if col in stats and 'mode' in stats[col]:
            mode_value = stats[col]['mode']
            df[col] = df[col].fillna(mode_value)
    
    for col in boolean_cols:
        mean_value = stats[col]['mean']
        df[col] = df[col].fillna(mean_value)
    
    # Save the DataFrame with imputed values to a new CSV
    df.to_csv(output_csv, single_file=True, index=False)

In [15]:
impute_missing_values('/home/jwc/Data/UsedCars/used_cars_data_1.csv', '/home/jwc/Data/UsedCars/used_cars_data_2.csv', stats)

[########################################] | 100% Completed | 8.78 sms


In [16]:
df = pd.read_csv('/home/jwc/Data/UsedCars/used_cars_data_2.csv', nrows=100)
df

Unnamed: 0,frame_damaged,has_accidents,is_new,daysonmarket,height,horsepower,length,mileage,seller_rating,price,listing_id
0,0.009502,0.154409,1.0,522.0,66.5,177.0,166.6,7.0,2.8,23141.0,237132766
1,0.009502,0.154409,1.0,207.0,68.0,246.0,181.0,8.0,3.0,46500.0,265946296
2,0.0,0.0,0.0,1233.0,58.1,305.0,180.9,31146.899743,4.270413,46995.0,173473508
3,0.009502,0.154409,1.0,196.0,73.0,340.0,195.1,11.0,3.0,67430.0,266911050
4,0.009502,0.154409,1.0,137.0,68.0,246.0,181.0,7.0,3.0,48880.0,270957414
5,0.0,0.0,1.0,242.0,66.3,247.0,188.9,12.0,3.0,66903.0,262940541
6,0.009502,0.154409,1.0,447.0,56.9,186.0,183.5,14.0,2.8,23695.0,244110426
7,0.009502,0.154409,1.0,70.0,66.3,247.0,188.9,11.0,3.0,68520.0,275458784
8,0.009502,0.154409,1.0,196.0,68.0,246.0,181.0,8.0,3.0,51245.0,266911040
9,0.0,0.0,0.0,510.0,64.9,296.0,172.1,254.0,3.0,84399.0,238225156


In [17]:
def convert_to_numeric(input_csv, output_csv):
    """
    Convert all columns in the input CSV to numeric:
    - Boolean columns are converted to 1.0 for True and 0.0 for False.
    - Categorical columns are one-hot encoded.
    - Numeric columns are kept as is.
    
    Parameters:
    input_csv (str): Path to the input CSV file.
    output_csv (str): Path to the output CSV file with numeric values.
    """
    # Load the CSV
    df = dd.read_csv(input_csv)
    
    # One-hot encode categorical columns
    df = df.categorize(columns=categorical_cols)
    df = dd.get_dummies(df, columns=categorical_cols, drop_first=False)
    
    # Ensure numeric columns are still numeric (in case there were any issues with data types)
    for col in df:
        if col not in special_cols:
            df[col] = df[col].astype('float')
    
    # Save the DataFrame to a new CSV
    df.to_csv(output_csv, single_file=True, index=False)

In [None]:
convert_to_numeric('/home/jwc/Data/UsedCars/used_cars_data_2.csv', '/home/jwc/Data/UsedCars/used_cars_data_3.csv')

[##############################          ] | 75% Completed | 5.10 s ms

In [50]:
df = pd.read_csv('/home/jwc/Data/UsedCars/used_cars_data_3.csv', nrows=100)
df

Unnamed: 0,frame_damaged,has_accidents,is_new,daysonmarket,height,horsepower,length,mileage,seller_rating,price,listing_id
0,0.009502,0.154409,1.0,522.0,66.5,177.0,166.6,7.0,2.8,23141.0,237132766
1,0.009502,0.154409,1.0,207.0,68.0,246.0,181.0,8.0,3.0,46500.0,265946296
2,0.0,0.0,0.0,1233.0,58.1,305.0,180.9,31146.899743,4.270413,46995.0,173473508
3,0.009502,0.154409,1.0,196.0,73.0,340.0,195.1,11.0,3.0,67430.0,266911050
4,0.009502,0.154409,1.0,137.0,68.0,246.0,181.0,7.0,3.0,48880.0,270957414
5,0.0,0.0,1.0,242.0,66.3,247.0,188.9,12.0,3.0,66903.0,262940541
6,0.009502,0.154409,1.0,447.0,56.9,186.0,183.5,14.0,2.8,23695.0,244110426
7,0.009502,0.154409,1.0,70.0,66.3,247.0,188.9,11.0,3.0,68520.0,275458784
8,0.009502,0.154409,1.0,196.0,68.0,246.0,181.0,8.0,3.0,51245.0,266911040
9,0.0,0.0,0.0,510.0,64.9,296.0,172.1,254.0,3.0,84399.0,238225156


In [51]:
def min_max_normalize(input_file, output_file, special_cols=None):
    """
    Reads a CSV file, normalizes all columns except those specified in `special_cols` using Min-Max normalization,
    and saves the normalized DataFrame to a new CSV file.
    
    Parameters:
    input_file (str): Path to the input CSV file.
    output_file (str): Path to the output CSV file with normalized columns.
    special_cols (list): List of columns to exclude from normalization.
    """
    # Load the CSV file into a Dask DataFrame
    df = dd.read_csv(input_file)
    
    if special_cols is None:
        special_cols = []

    # Select the columns that will be normalized
    columns_to_normalize = [col for col in df.columns if col not in special_cols]

    # Compute min and max for all columns to be normalized in one pass
    col_mins = df[columns_to_normalize].min()
    col_maxs = df[columns_to_normalize].max()

    # Compute min and max together to avoid multiple computes
    col_mins, col_maxs = dd.compute(col_mins, col_maxs)

    # Normalize each column using min-max normalization
    for col in columns_to_normalize:
        if col_mins[col] != col_maxs[col]:  # Avoid division by zero if min equals max
            df[col] = (df[col] - col_mins[col]) / (col_maxs[col] - col_mins[col])
    
    # Save the normalized DataFrame to a new CSV file
    df.to_csv(output_file, single_file=True, index=False)

In [52]:
min_max_normalize('used_cars_data_3.csv', 'used_cars_data_4.csv', special_cols=special_cols)

[########################################] | 100% Completed | 811.38 ms
[########################################] | 100% Completed | 911.09 ms
[########################################] | 100% Completed | 1.01 s
[########################################] | 100% Completed | 1.01 s
[########################################] | 100% Completed | 15.04 ss
[########################################] | 100% Completed | 15.11 s
[########################################] | 100% Completed | 15.14 s
[########################################] | 100% Completed | 15.21 s


In [53]:
df = pd.read_csv('used_cars_data_4.csv', nrows=100)
df

Unnamed: 0,frame_damaged,has_accidents,is_new,daysonmarket,height,horsepower,length,mileage,seller_rating,price,listing_id
0,0.009502,0.154409,1.0,0.14504,0.310391,0.128964,0.321809,7.000001e-08,0.45,23141.0,237132766
1,0.009502,0.154409,1.0,0.057516,0.330634,0.201903,0.398404,8.000001e-08,0.5,46500.0,265946296
2,0.0,0.0,0.0,0.342595,0.197031,0.264271,0.397872,0.000311469,0.817603,46995.0,173473508
3,0.009502,0.154409,1.0,0.05446,0.398111,0.301268,0.473404,1.1e-07,0.5,67430.0,266911050
4,0.009502,0.154409,1.0,0.038066,0.330634,0.201903,0.398404,7.000001e-08,0.5,48880.0,270957414
5,0.0,0.0,1.0,0.067241,0.307692,0.20296,0.440426,1.2e-07,0.5,66903.0,262940541
6,0.009502,0.154409,1.0,0.124201,0.180837,0.138478,0.411702,1.4e-07,0.45,23695.0,244110426
7,0.009502,0.154409,1.0,0.01945,0.307692,0.20296,0.440426,1.1e-07,0.5,68520.0,275458784
8,0.009502,0.154409,1.0,0.05446,0.330634,0.201903,0.398404,8.000001e-08,0.5,51245.0,266911040
9,0.0,0.0,0.0,0.141706,0.288799,0.254757,0.351064,2.54e-06,0.5,84399.0,238225156


In [54]:
# Get the number of rows
df = dd.read_csv('used_cars_data.csv', dtype=str)
n_rows = df.shape[0].compute()
print(f"Number of rows: {n_rows}")

[########################################] | 100% Completed | 47.42 s
[########################################] | 100% Completed | 47.52 s
[########################################] | 100% Completed | 47.52 s
[########################################] | 100% Completed | 47.61 s
Number of rows: 3000040


In [55]:
import dask.dataframe as dd

def shuffle_and_split(input_file, output_train, output_test, output_val, random_state=42):
    """
    Shuffle a CSV file and split it into train, test, and validation sets with a 1:1:1 ratio.
    
    Parameters:
    input_file (str): Path to the input CSV file.
    output_train (str): Path to the output CSV file for the training set.
    output_test (str): Path to the output CSV file for the test set.
    output_val (str): Path to the output CSV file for the validation set.
    random_state (int): Random seed for shuffling.
    """
    # Load the CSV file into a Dask DataFrame
    df = dd.read_csv(input_file)

    # Shuffle the data by sampling all rows (frac=1.0 means shuffle the entire dataset)
    df_shuffled = df.sample(frac=1.0, random_state=random_state)

    # Split the shuffled data into train, test, and val (1:1:1 ratio)
    train, test, val = df_shuffled.random_split([1/3, 1/3, 1/3], random_state=random_state)

    # Save each split to separate CSV files
    train.to_csv(output_train, single_file=True, index=False)
    test.to_csv(output_test, single_file=True, index=False)
    val.to_csv(output_val, single_file=True, index=False)

In [56]:
shuffle_and_split('used_cars_data_4.csv', 'used_cars_train.csv', 'used_cars_test.csv', 'used_cars_val.csv', random_state=42)

[########################################] | 100% Completed | 5.80 ss
[########################################] | 100% Completed | 5.87 s
[########################################] | 100% Completed | 5.97 s
[########################################] | 100% Completed | 6.07 s
[########################################] | 100% Completed | 5.90 ss
[########################################] | 100% Completed | 6.00 s
[########################################] | 100% Completed | 6.05 s
[########################################] | 100% Completed | 6.13 s
[########################################] | 100% Completed | 6.03 ss
[########################################] | 100% Completed | 6.07 s
[########################################] | 100% Completed | 6.11 s
[########################################] | 100% Completed | 6.21 s
