In [1]:
import sys
import os

# Make the parent of the working directory importable; the `utility`, `cloud`
# and `DataCleaning` imports in this notebook rely on it (presumably it is the
# project root — the notebook must be started from its own folder).
# Guarded so that re-running this cell does not keep appending duplicates.
_project_root = os.path.join(os.getcwd(), '../')
if _project_root not in sys.path:
    sys.path.append(_project_root)
from utility.print_summary import print_summary

In [2]:
import pandas as pd
import numpy as np




In [3]:
# Cloud settings are read from the environment; each value is None when its
# variable is not set.
PROJECT_ID = os.environ.get('PROJECT_ID')
BUCKET_NAME = os.environ.get('BUCKET_NAME')


In [4]:
from cloud.gcs_storage_operations import GCSDataOperations

# Create the storage client for the project named by PROJECT_ID; the data
# loading cell reads the raw parquet file through it.
gcs = GCSDataOperations(PROJECT_ID) 
 

INFO:cloud.gcs_storage_operations:GCS Data client initialized for project: dealiq-465722


In [5]:

# Load the raw dataset from the bucket; `df` is the object every later
# cleaning and validation cell operates on.
df = gcs.read_parquet(BUCKET_NAME, "raw_data.parquet")


# Preview the first rows.
df.head()

INFO:cloud.gcs_storage_operations:Parquet read: dealiq_1/raw_data.parquet (426880 rows, 26 columns)


Unnamed: 0,id,url,region,region_url,price,year,manufacturer,model,condition,cylinders,...,size,type,paint_color,image_url,description,county,state,lat,long,posting_date
0,7222695916,https://prescott.craigslist.org/cto/d/prescott...,prescott,https://prescott.craigslist.org,6000,,,,,,...,,,,,,,az,,,
1,7218891961,https://fayar.craigslist.org/ctd/d/bentonville...,fayetteville,https://fayar.craigslist.org,11900,,,,,,...,,,,,,,ar,,,
2,7221797935,https://keys.craigslist.org/cto/d/summerland-k...,florida keys,https://keys.craigslist.org,21000,,,,,,...,,,,,,,fl,,,
3,7222270760,https://worcester.craigslist.org/cto/d/west-br...,worcester / central MA,https://worcester.craigslist.org,1500,,,,,,...,,,,,,,ma,,,
4,7210384030,https://greensboro.craigslist.org/cto/d/trinit...,greensboro,https://greensboro.craigslist.org,4900,,,,,,...,,,,,,,nc,,,


# standardizing 

In [None]:
# Standardization and extraction of info from model and description.
from DataCleaning.data_model import process_car_dataset

# NOTE(review): the result is bound to `featured_eng`, which no later cell in
# this notebook reads — all following steps continue from `df`. Confirm whether
# `df` was meant to be rebound here or whether this call is exploratory.
featured_eng=process_car_dataset(df)


In [None]:
# Dropping unnecessary columns.
# `drop_rows_with_few_missing_values` is imported here as well; the next cell
# calls it.
from DataCleaning.data_cleaning import drop_unnecessary_columns , drop_rows_with_few_missing_values

# Rebind df to the first returned value and print the returned summary.
df, summary = drop_unnecessary_columns(df) 
print_summary(summary)


In [None]:
# Dropping rows due to high NAs.
# `drop_rows_with_few_missing_values` is imported in the preceding cell.
# Single unpacking only: the former chained assignment also bound an unused
# `df_cleaned` alias and assigned `summary` twice.
df, summary = drop_rows_with_few_missing_values(df)

print_summary(summary)

# Filling missing values 

### Title_status

In [None]:
# Filling missing values in title status with 'missing'.
from DataCleaning.data_title_status import fill_missing_values 

# Rebind df to the first returned value and print the returned summary.
df,summary = fill_missing_values(df) 
print_summary(summary)

###  Transmission

In [None]:
# `convert_transmission_to_automatic` is imported here for the following cell.
from DataCleaning.data_transmission import fill_missing_values_transmission, convert_transmission_to_automatic

# Fill missing transmission values, then print the step summary once
# (a second identical print_summary call only repeated the same output).
df, summary = fill_missing_values_transmission(df)
print_summary(summary)

In [None]:
# Apply convert_transmission_to_automatic (imported in the previous cell) and
# print the summary it returns.
# NOTE(review): presumably this collapses some transmission labels into
# 'automatic' — confirm against DataCleaning.data_transmission.
df, summary = convert_transmission_to_automatic(df)

print_summary(summary)

# drive column cleaning 

In [9]:
# Standardization of the 'drive' column.
from DataCleaning.data_drive import clean_drive_column
# NOTE(review): the returned summary is not printed here, unlike most other
# cells; it is overwritten by the next step.
df,summary=clean_drive_column(df, 'drive')  


In [None]:
# Filling null drive values from research (NA values), using a reference CSV.
from DataCleaning.data_drive import fill_missing_drive_from_reference

# Resolve the reference file without a machine-specific absolute path.
# DRIVE_REFERENCE_FILE overrides the location; otherwise the file is looked up
# in data/ under the parent of the working directory, the same directory the
# first cell adds to sys.path.
# NOTE(review): assumes that directory is the project root holding data/ — confirm.
drive_reference_file = os.getenv(
    'DRIVE_REFERENCE_FILE',
    os.path.abspath(os.path.join(os.getcwd(), '..', 'data', 'models_with_drive.csv')),
)

df, summary = fill_missing_drive_from_reference(
    df,
    reference_file=drive_reference_file,
)

print_summary(summary)

# Model 

Clean in stages:
* Stage 1: Remove obvious junk (numbers, too short, too long)
* Stage 2: Extract core model from complex strings
* Stage 3: Standardize spelling and format
* Stage 4: Apply manufacturer-specific rules

In [None]:
# Stage 1: Remove obvious junk (only numbers, too short, too long).

from DataCleaning.data_model import remove_numerical_models

# Rebind df to the first returned value and print the returned summary.
df,summary =remove_numerical_models(df)

print_summary(summary)

In [None]:
# Ad-hoc look at the manufacturer column after the Stage 1 model cleanup.
df['manufacturer']

In [None]:
# Next model-cleaning step.
# NOTE(review): presumably matches model strings against a known model list
# (Stages 2-4 of the plan above) — confirm against DataCleaning.data_model.
from DataCleaning.data_model import clean_models_with_list_optimized
 
df,summary = clean_models_with_list_optimized(df)  

print_summary(summary)

In [None]:
# Null count per column after the model-cleaning steps.
df.isnull().sum()

In [15]:
# Clean model: filter on the value counts of 'model' with min_count=10.
from DataCleaning.data_model import filter_by_value_counts

# Unlike the other cleaning helpers in this notebook, the result is bound to
# `df` alone — no summary is unpacked or printed.
df = filter_by_value_counts(df, 'model', min_count=10) 

# Type

In [None]:
# NOTE(review): presumably drops rows where drive and type are both missing —
# confirm against DataCleaning.data_type.
from DataCleaning.data_type import drop_na_drive_type  

# Rebind df to the first returned value and print the returned summary.
df, summary = drop_na_drive_type(df)
print_summary(summary)

In [None]:
# Cleaning and standardization of 'type': the "mini van" spelling variants are
# mapped onto the single label 'minivan'.
from DataCleaning.data_type import replace_values

df, summary = replace_values(
    df,
    'type',
    {'mini van': 'minivan', 'mini-van': 'minivan'},
)
print_summary(summary)


In [None]:
# Filling null 'type' values based on the data already present.
# NOTE(review): presumably the type is inferred from rows that share the same
# model — confirm against DataCleaning.data_type.

from DataCleaning.data_type import fill_type_from_model

df, summary = fill_type_from_model(df)

print_summary(summary)

In [None]:
# Finally drop the rows whose type is still null.
from DataCleaning.data_type import drop_na_type

# Rebind `df` itself: every later cell continues from `df`, so binding the
# result to a separate, never-used name would silently discard this step.
df, summary = drop_na_type(df)

print_summary(summary)

# Drive

In [None]:
# Impute drive, pass 1 (based on a cross-tab with type).

from DataCleaning.data_drive import impute_drive_from_type

# Rebind df to the first returned value and print the returned summary.
df, summary = impute_drive_from_type(df)
print_summary(summary)


### Manufacturer 

In [None]:
# Standardize the manufacturer values via standardize_manufacturer.
from DataCleaning.data_manufacturers import standardize_manufacturer 

# Rebind df to the first returned value and print the returned summary.
df, summary =standardize_manufacturer(df)
print_summary(summary)

### Paint Color 

In [None]:
# Handle null paint_color values via fill_paint_color_nulls.
# NOTE(review): the fill value/strategy is defined in
# DataCleaning.data_paint_color and is not visible from this notebook.
from DataCleaning.data_paint_color import fill_paint_color_nulls 

# Rebind df to the first returned value and print the returned summary.
df, summary = fill_paint_color_nulls(df)

print_summary(summary)

# census_region

In [None]:
# `validate_regions` is imported here and called later, in the first
# validation cell.
from  DataCleaning.data_census_region import add_census_divisions_abbrev , validate_regions

# NOTE(review): presumably derives a census_region column from the state
# abbreviation — confirm against DataCleaning.data_census_region.
df, summary = add_census_divisions_abbrev(df)

print_summary(summary)

# Price

In [None]:
# Clean the 'price' column via clean_price_data.
from DataCleaning.data_price import clean_price_data 
# Rebind df to the first returned value and print the returned summary.
df, summary = clean_price_data(df, 'price')
print_summary(summary)

# Fuel

In [None]:
# NOTE(review): presumably remaps some fuel labels to 'gas' — confirm which
# ones against DataCleaning.data_fuel.
from DataCleaning.data_fuel import  convert_fuel_to_gas


# Rebind df to the first returned value and print the returned summary.
df,summary = convert_fuel_to_gas(df)
print_summary(summary)

# odometer 

In [None]:
# Import the odometer cleaning helper.
from DataCleaning.data_odometer import process_odometer_column



# Clean the 'odometer' column (recommended approach), then print the summary
# returned by the helper.
df, summary = process_odometer_column(df, 'odometer')

print_summary(summary)


# validation columns 

1. census_region ✅ 
2. drive ✅
3. fuel ✅
4. lat  
5. long
6. manufacturer ✅ 
7. model ✅ 
8. odometer ✅
9. paint_color ✅
10. price ✅
11. state ✅
12. title_status ✅
13. transmission ✅
14. type ✅
15. year ✅

In [None]:
# 1 census_region
# `validate_regions` is imported in the census_region cell above, which must
# have been run first.
df, summary = validate_regions(df) 

print_summary(summary)

In [None]:
# 2 year: validate the 'year' column with a lower bound of 1990.
from DataCleaning.data_year import validate_years

df, summary = validate_years(
    df,
    year_column='year',
    min_year=1990,
)

print_summary(summary)

In [None]:
# 3
# transmission column

from DataCleaning.data_transmission import validate_transmission_values

# Unpack into `summary` so the print below reports THIS validation; binding the
# result to a different name made the cell print the stale summary left over
# from the previous validation cell.
df, summary = validate_transmission_values(df)

print_summary(summary)

In [None]:
# 4 fuel: validate the fuel values and print the returned summary.
from DataCleaning.data_fuel import validate_fuel_values
df , summary = validate_fuel_values(df) 
print_summary(summary)

In [None]:
# 5 title_status: validate the title status values and print the returned summary.
from DataCleaning.data_title_status import validate_title_status_values 

df, summary = validate_title_status_values(df)

print_summary(summary)

In [None]:
# 6 type: validate the type values (called with standardize_case=True) and
# print the returned summary.
from DataCleaning.data_type import validate_type_values
df, summary = validate_type_values(df, standardize_case=True)

print_summary(summary)

In [None]:
# 7 manufacturer: validate the manufacturer values and print the returned summary.

from DataCleaning.data_manufacturers import  validate_manufacturers 
df, summary = validate_manufacturers(df)
print_summary(summary)

In [None]:
# 8 paint_color: validate the paint color values and print the returned summary.
from DataCleaning.data_paint_color import validate_paint_color
df, summary = validate_paint_color(df)

print_summary(summary)

In [None]:
# 9 state: validate the state values and print the returned summary.
from DataCleaning.data_state import validate_state
df, summary = validate_state(df)

print_summary(summary)

In [None]:
# 10 model: validate model frequency with min_count=10.
from DataCleaning.data_model import validate_model_frequency

# Rebind `df` like every other validation cell does; binding the result to a
# separate, never-used name meant the remaining validations and the final null
# check ran on the un-validated data.
df, summary = validate_model_frequency(df, min_count=10)
print_summary(summary)

In [None]:
# 11 drive: validate the drive values and print the returned summary.
from DataCleaning.data_drive import validate_drive_values
df, summary = validate_drive_values(df)
print_summary(summary)

In [None]:
# 13 odometer: validate readings against the bounds 0 and 500,000 miles.
from DataCleaning.data_odometer import validate_odometer

df, summary = validate_odometer(
    df,
    min_miles=0,
    max_miles=500_000,
)
print_summary(summary)

In [None]:
# lat / long: validate the coordinates via validate_usa_coordinates and print
# the returned summary.
# NOTE(review): the accepted bounds are defined in DataCleaning.data_lat_long
# and are not visible from this notebook.
from DataCleaning.data_lat_long import validate_usa_coordinates
df, summary = validate_usa_coordinates(df)
print_summary(summary)

In [None]:
# Final check: remaining null count per column after all cleaning and
# validation steps.
df.isna().sum()