In [43]:
!pip install category_encoders 
!pip install ydata-profiling



In [2]:
from sklearn.preprocessing import OneHotEncoder , LabelEncoder, OrdinalEncoder, FunctionTransformer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import FeatureHasher
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from category_encoders import BinaryEncoder, TargetEncoder
from ydata_profiling import ProfileReport
from sklearn.preprocessing import MinMaxScaler
from scipy import stats
import pandas as pd
import numpy as np

In [53]:
# Location of the dataset CSV (Google Drive direct-download link).
url ='https://drive.google.com/uc?id=1KEMfwDKxd5klTA9tFo6Nsgmi0pANIl7m'
# Read the CSV straight from the URL into the working DataFrame used by all later cells.
data = pd.read_csv(url)

In [4]:
# DATA PROFILING
# Build a ydata-profiling report for `data` and export it as an HTML file
# in the current working directory.
profile = ProfileReport(data, title="Car Dataset Report")
profile.to_file("Car Dataset Report.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


  0%|          | 0/15 [00:00<?, ?it/s][A
  7%|▋         | 1/15 [00:00<00:05,  2.59it/s][A
 13%|█▎        | 2/15 [00:00<00:05,  2.30it/s][A
 20%|██        | 3/15 [00:01<00:03,  3.11it/s][A
 27%|██▋       | 4/15 [00:01<00:02,  4.26it/s][A
100%|██████████| 15/15 [00:01<00:00, 10.03it/s][A


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [54]:
# Preview the first five rows of the loaded dataset.
data.head()

Unnamed: 0,year,make,model,trim,body,transmission,vin,state,condition,odometer,color,interior,seller,sellingprice,saledate
0,2013,Ford,Fusion,SE,Sedan,automatic,3fa6p0h71dr236627,wi,47.0,17548.0,white,black,lease plan usa,14200,Wed Feb 04 2015 02:30:00 GMT-0800 (PST)
1,2013,Infiniti,JX,JX35,SUV,automatic,5n1al0mn8dc329790,ca,43.0,22035.0,gray,black,infiniti of montclair,31000,Tue Jun 16 2015 05:00:00 GMT-0700 (PDT)
2,2009,Jeep,Grand Cherokee,Laredo,SUV,automatic,1j8gr48k69c529636,ne,34.0,95559.0,silver,gray,dm northwest inc,11600,Thu Jan 15 2015 03:00:00 GMT-0800 (PST)
3,2008,Ford,F-350 Super Duty,XLT,Crew Cab,automatic,1ftww31rx8ed43632,wa,43.0,135870.0,white,gray,lexus of tacoma at fife,15400,Wed Feb 11 2015 05:20:00 GMT-0800 (PST)
4,2014,Kia,Sportage,LX,SUV,automatic,kndpbcac3e7642651,nv,44.0,13604.0,white,gray,kia motors america inc,18500,Fri Mar 06 2015 04:00:00 GMT-0800 (PST)


In [55]:
# HANDLING MISSING VALUES

In [56]:
# Count the null entries in every column and print them under a short banner.
missing_val = data.isnull().sum()
print('Missing Values ', '--------------------', missing_val, sep='\n')

Missing Values 
--------------------
year               0
make             953
model            963
trim             988
body            1218
transmission    5986
vin                1
state              0
condition       1077
odometer           5
color             54
interior          54
seller             0
sellingprice       0
saledate           0
dtype: int64


In [57]:
# Numeric columns go into `data_numeric`; every remaining column is treated as categorical.
data_numeric = data.select_dtypes(include='number')
categorical_col = data.drop(columns=data_numeric.columns)

In [58]:
# Null count per numeric column, to decide which ones need imputation.
data_numeric.isnull().sum()

year               0
condition       1077
odometer           5
sellingprice       0
dtype: int64

In [59]:
from sklearn.impute import SimpleImputer

In [60]:
# MEAN IMPUTE for NUMERICAL COLS
# NOTE(review): the imputer is fitted on the whole dataset, i.e. before the
# train/test split performed later in this notebook, so the means include rows
# that end up in the test set.
numeric_impute_cols = ['condition', 'odometer']
imputer = SimpleImputer(strategy='mean')
imputer.fit(data[numeric_impute_cols])

# statistics_ holds one fill value per fitted column, in the order of
# numeric_impute_cols; report all of them rather than only the first.
impute_values = {
    col: float(mean) for col, mean in zip(numeric_impute_cols, imputer.statistics_)
}
# Kept for backward compatibility: this is the mean of 'condition' only.
impute_value = imputer.statistics_[0]
print(f"Values used for imputing: {impute_values}")

# Apply imputation
data[numeric_impute_cols] = imputer.transform(data[numeric_impute_cols])

Value used for imputing: 30.619422357582323


In [61]:
# Null count per categorical column. `categorical_col` is the frame derived from
# `data` in the numeric/categorical split cell, so it shows the state at that point.
categorical_col.isnull().sum()

make             953
model            963
trim             988
body            1218
transmission    5986
vin                1
state              0
color             54
interior          54
seller             0
saledate           0
dtype: int64

In [62]:
 # Fill missing categorical entries with each column's most frequent value.
mode_impute_cols = ['make', 'model', 'trim', 'body', 'transmission', 'vin', 'color', 'interior']
imputer = SimpleImputer(strategy='most_frequent').fit(data[mode_impute_cols])

# Show the fitted column names together with the per-column fill values.
print(f"Values used for imputing: {imputer.feature_names_in_}: {imputer.statistics_}")

# Write the imputed values back into the working frame.
data[mode_impute_cols] = imputer.transform(data[mode_impute_cols])


Values used for imputing: ['make' 'model' 'trim' 'body' 'transmission' 'vin' 'color' 'interior']: ['Ford' 'Altima' 'Base' 'Sedan' 'automatic' '19xfb2f85ee234128' 'black'
 'black']


In [63]:
# Check for nulls after IMPUTING
# Re-count nulls per column to confirm both imputation steps left no gaps.
data.isnull().sum()

year            0
make            0
model           0
trim            0
body            0
transmission    0
vin             0
state           0
condition       0
odometer        0
color           0
interior        0
seller          0
sellingprice    0
saledate        0
dtype: int64

In [64]:
# SPLIT 'TARGET' AND 'FEATURES'

In [65]:
# SCALING "SELLINGPRICE"

In [66]:
# Min-max scale 'sellingprice' into the [0, 1] range.
# NOTE(review): `scaled_data` is not referenced by any later cell visible in this
# notebook; the target `Y` is taken from the unscaled column. Confirm whether the
# scaled values were meant to be used.
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_data = scaler.fit_transform(data[['sellingprice']])

# Shape check: one scaled column, one row per record.
scaled_data.shape

(50000, 1)

In [70]:
# Target: the 'sellingprice' column, kept as a one-column DataFrame
# (double brackets) rather than a Series.
Y = data[['sellingprice']]
Y.shape

(50000, 1)

In [71]:
# Feature matrix: every column except the target.
X = data.drop(columns='sellingprice')
X.shape

(50000, 14)

In [72]:
# TRANSFORMING CATEGORICAL COLUMNS


In [73]:
# Categorical columns that get one-hot encoded: all categorical columns except
# the ones listed here ('transmission' is encoded by its own transformer).
excluded_cols = ['vin', 'seller', 'saledate', 'transmission']
categoricals = categorical_col.columns.unique().drop(excluded_cols)
categoricals

Index(['make', 'model', 'trim', 'body', 'state', 'color', 'interior'], dtype='object')

In [74]:
# OneHotEncoder and BinaryEncoder

In [75]:
# ENCODING
preprocessor = ColumnTransformer(
    transformers = [
        ('transmission', BinaryEncoder(), ['transmission']), # HANDLING high IMBALANCE ,
        ('onehot', OneHotEncoder(handle_unknown ='ignore', sparse_output = False, min_frequency = 0.1, max_categories = 20),
         categoricals)
    ], remainder = 'drop'
)

In [76]:
#Train_test SPLIT to ENCODE
# Hold out 20% of the rows for testing; random_state fixes the shuffle so the
# split is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 42)

In [77]:
# We Fit ONLY on training data, then transform both
# Learn the encodings from the training rows only, then apply the fitted
# preprocessor unchanged to the test rows.
X_train_prep = preprocessor.fit_transform(X_train)
X_test_prep= preprocessor.transform(X_test) 

In [78]:
# Wrap the encoded arrays in DataFrames that keep the encoder-generated column
# names and the original row labels. Without them the saved CSV headers are just
# 0..N-1 and the feature rows no longer share an index with y_train / y_test.
encoded_feature_names = preprocessor.get_feature_names_out()
X_train_prep = pd.DataFrame(X_train_prep, columns=encoded_feature_names, index=X_train.index)
X_test_prep = pd.DataFrame(X_test_prep, columns=encoded_feature_names, index=X_test.index)

In [79]:
# Lower-case aliases of the target splits, wrapped as DataFrames; these are the
# names used by the save and verification cells.
y_train = pd.DataFrame(Y_train)
y_test = pd.DataFrame(Y_test)

In [81]:
# Display the training target for a visual check.
y_train

Unnamed: 0,sellingprice
39087,12900
30893,13000
45278,4700
16398,13400
13653,8000
...,...
11284,7700
44732,18700
38158,35250
860,13000


In [82]:
# Display the test target for a visual check.
y_test

Unnamed: 0,sellingprice
33553,11000
9427,4100
199,11900
12447,2300
39489,28100
...,...
28567,25000
25079,4600
18707,12500
15200,13500


In [83]:
# Defensive conversion: if either target split is a bare NumPy array, wrap it in
# a one-column DataFrame so the CSV export gets a 'sellingprice' header.
# Each branch checks and reassigns the same variable (the lower-case names used
# by the save cell).
if isinstance(y_train, np.ndarray):
    y_train = pd.DataFrame(y_train, columns=['sellingprice'])
if isinstance(y_test, np.ndarray):
    y_test = pd.DataFrame(y_test, columns=['sellingprice'])

In [84]:
 # Display the test target again after the ndarray guard cell.
 y_test

Unnamed: 0,sellingprice
33553,11000
9427,4100
199,11900
12447,2300
39489,28100
...,...
28567,25000
25079,4600
18707,12500
15200,13500


In [87]:
# 2. Save all files
X_train_prep.to_csv('X_train_preprocessed.csv', index=False)
X_test_prep.to_csv('X_test_preprocessed.csv', index=False)
y_train.to_csv('y_train.csv', index=False)
y_test.to_csv('y_test.csv', index=False)

In [88]:
# 3. Verification step
print("Data shapes verification:")
print(f"X_train: {X_train_prep.shape} → Saved as X_train_preprocessed.csv")
print(f"X_test: {X_test_prep.shape} → Saved as X_test_preprocessed.csv")
print(f"y_train: {y_train.shape} → Saved as y_train.csv")
print(f"y_test: {y_test.shape} → Saved as y_test.csv")

print("\nData saved successfully!")

Data shapes verification:
X_train: (40000, 23) → Saved as X_train_preprocessed.csv
X_test: (10000, 23) → Saved as X_test_preprocessed.csv
y_train: (40000, 1) → Saved as y_train.csv
y_test: (10000, 1) → Saved as y_test.csv

Data saved successfully!
