In [1]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline


class CleaningSteeringType(BaseEstimator, TransformerMixin):
    def __init__(self, column='Steering Type'):
        self.column = column

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        X[self.column] = X[self.column].apply(self.clean_steering)
        return X

    def clean_steering(self, x):
        if x == 'Electrical':
            return 'Electric'
        elif x == 'power':
            return 'Power'
        elif x == 'electric':
            return 'Electric'
        else:
            return x


    

In [2]:
import pandas as pd
df = pd.read_csv('M:\Car_Dheko\cleaned_data.csv')
pd.set_option('display.max_columns', None)
print(df.shape)
df.head(2)


(8359, 34)


Unnamed: 0,Insurance Validity,Fuel Type,Engine Displacement,Year of Manufacture,ft,bt,transmission,ownerNo,oem,model,Wheel Size,Color,No of Cylinder,Fuel Suppy System,Turbo Charger,Length,Width,Height,Wheel Base,Gear Box,Drive Type,Seating Capacity,Steering Type,Tyre Type,No Door Numbers,Cargo Volumn,Mileage,Value Configuration,Super Charger,Turning Radius,City,price_in_lakhs,km_winsorized,price_in_lakhs_winzorized
0,Third Party insurance,Petrol,998 cc,2015.0,Petrol,Hatchback,Manual,3,Maruti,Maruti Celerio,15.0,White,3.0,MPFi,No,3715.0,1635.0,1565.0,2425.0,5 Speed,FWD,5.0,Power,"tubeless,radial",5.0,235.0,23.1,DOHC,No,4.7,Bangalore,4.0,11.563955,4.0
1,Comprehensive,Petrol,1497 cc,2018.0,Petrol,SUV,Manual,2,Ford,Ford Ecosport,16.0,White,3.0,Direct Injection,No,3998.0,1765.0,1647.0,2519.0,5 Speed,FWD,5.0,Power,"tubeless,radial",4.0,352.0,17.0,DOHC,No,5.3,Bangalore,8.11,10.395344,8.11


In [3]:
columns_to_drop = ['Engine Displacement','ft','model','Color','price_in_lakhs','City','bt','Gear Box']
df = df.drop(columns_to_drop, axis=1)
df.shape

(8359, 26)

In [4]:
nominal_cols_with_less_categories = ['Insurance Validity','Turbo Charger','Drive Type','Fuel Type','Steering Type','Drive Type']
nominal_cols_with_more_categories = ['Tyre Type','Value Configuration','oem']
ordered_cols = ['transmission','ownerNo','Wheel Size','No of Cylinder','Seating Capacity','No Door Numbers','Year of Manufacture']
numeric_features = ['Length','Width','Height','Wheel Base','Turning Radius','Cargo Volumn','Mileage','km_winsorized']

In [5]:
class CleanTyreType(BaseEstimator, TransformerMixin):
    def __init__(self, column='Tyre Type'):
        self.column = column
        self.tyre_dict = {  # Dictionary as an instance variable
            'tubeless,radial': 'Tubeless + Radial',
            'radial,tubeless': 'Tubeless + Radial',
            'tubeless, radial': 'Tubeless + Radial',
            'tubeless tyres,radial': 'Tubeless + Radial',
            'tubeless radials tyre': 'Tubeless + Radial',
            'tubeless radial': 'Tubeless + Radial',
            'radial,tubless': 'Tubeless + Radial',
            'tubeless. runflat': 'Tubeless + Runflat',
            'tubeless,runflat': 'Tubeless + Runflat',
            'tubeless radials tyre': 'Tubeless + Runflat',
            'runflat, radial': 'Runflat + Radial',
            'runflat': 'Runflat',
            'radial': 'Radial',
            'tubeless': 'Tubeless',
            'unknown': 'Unknown',
            'radial with tube': 'Radial with Tube',
            'tubeless, all terrain': 'Tubeless + All Terrain',
            'tubeless tyres mud terrain': 'Tubeless + Mud Terrain'
        }

    def fit(self, X, y=None):
        return self  # No fitting required, just return self

    def transform(self, X):
        X = X.copy()  # Avoid modifying original DataFrame
        X[self.column] = X[self.column].astype(str).apply(self.clean_tyre)  # Apply cleaning
        return X

    def clean_tyre(self, x):
        x = x.lower().strip()
        return self.tyre_dict.get(x, 'Other')  # Return cleaned value or 'Other' if not found


In [6]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder
from category_encoders import TargetEncoder
from sklearn.preprocessing import OneHotEncoder
import xgboost as xgb

In [7]:
X = df.drop('price_in_lakhs_winzorized',axis=1)
y = df['price_in_lakhs_winzorized']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

X_test.shape


(2090, 25)

In [8]:
# Define your column transformer
from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer([
    ('numeric', 'passthrough', numeric_features),
    ('ordinal',OrdinalEncoder(), ordered_cols),
    ('target_encoder', TargetEncoder(), nominal_cols_with_more_categories),
    ('one_hot_encoder', OneHotEncoder(), nominal_cols_with_less_categories),
    
])

In [9]:
full_pipeline = Pipeline([
    ('clean_steering', CleaningSteeringType()),
    ('clean_tyre', CleanTyreType()),
    ('preprocessing', preprocessor),
    ('scaler', StandardScaler()),
    ('model',xgb.XGBRegressor())
])

In [10]:
full_pipeline.fit(X_train, y_train)

In [98]:
X_test.apply(lambda col: col.astype(str).str.contains('8-Speed DCT', case=False, na=False)).any().any()


np.True_

In [111]:
mask = df.apply(lambda col: col.astype(str).str.contains('electric', case=False, na=False))
df[mask.any(axis=1)]  # Rows where 'wagon' appears in any column


Unnamed: 0,Insurance Validity,Fuel Type,Year of Manufacture,transmission,ownerNo,oem,Wheel Size,No of Cylinder,Fuel Suppy System,Turbo Charger,Length,Width,Height,Wheel Base,Drive Type,Seating Capacity,Steering Type,Tyre Type,No Door Numbers,Cargo Volumn,Mileage,Value Configuration,Super Charger,Turning Radius,km_winsorized,price_in_lakhs_winzorized
18,Third Party insurance,Petrol,2022.0,Manual,1,Tata,17.0,3.0,Unknown,No,3993.0,1811.0,1606.0,2498.0,FWD,5.0,Electric,"tubeless,radial",5.0,350.0,17.33,DOHC,Unknown,5.100000,9.702973,10.05
21,Comprehensive,Petrol,2016.0,Manual,1,Maruti,16.0,4.0,Unknown,Unknown,3845.0,1735.0,1530.0,2450.0,FWD,5.0,Electric,"radial,tubeless",5.0,268.0,22.38,DOHC,Unknown,4.800000,9.802783,5.82
28,Comprehensive,Petrol,2017.0,Automatic,1,Mercedes-Benz,17.0,4.0,Direct Injection,Yes,5063.0,1860.0,1494.0,3079.0,RWD,5.0,Electric,"tubeless,radial",4.0,540.0,15.00,DOHC,No,6.000000,10.404293,18.41
48,Comprehensive,Petrol,2016.0,Automatic,1,Mini,16.0,4.0,MPFi,Yes,3850.0,1727.0,1414.0,2495.0,2WD,4.0,electric,runflat,3.0,211.0,17.33,Unknown,No,5.400000,10.596660,18.41
50,Comprehensive,Petrol,2020.0,Automatic,2,Kia,17.0,4.0,GDi,Yes,4315.0,1800.0,1645.0,2610.0,FWD,5.0,Electric,"tubeless,radial",5.0,433.0,16.50,DOHC,Unknown,8.614012,10.420494,16.75
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8333,Third Party insurance,Petrol,2022.0,Automatic,1,Kia,18.0,4.0,GDi,Yes,4315.0,1800.0,1645.0,2610.0,FWD,5.0,Electric,"tubeless,radial",5.0,433.0,16.50,DOHC,Unknown,8.614012,9.702973,18.25
8334,Third Party insurance,Petrol,2019.0,Manual,1,Tata,15.0,3.0,Unknown,Unknown,3802.0,1677.0,1537.0,2400.0,FWD,5.0,Electric,"tubeless,radial",5.0,242.0,20.09,Unknown,Unknown,2.995405,9.903538,5.00
8342,Third Party insurance,Petrol,2023.0,Manual,1,Honda,15.0,4.0,Unknown,Unknown,4574.0,1748.0,1489.0,2600.0,FWD,5.0,Electric,"tubeless,radial",4.0,506.0,17.80,DOHC,Unknown,5.300000,9.903538,12.00
8346,Third Party insurance,Petrol,2022.0,Manual,1,Tata,17.0,3.0,Unknown,Unknown,3993.0,1811.0,1606.0,2498.0,FWD,5.0,Electric,"tubeless,radial",5.0,350.0,17.33,DOHC,Unknown,5.100000,9.903538,6.74


In [87]:
X_test['Tyre Type'].value_counts()

Tyre Type
tubeless,radial               1505
tubeless                       382
radial,tubeless                 66
radial                          28
tubeless, radial                25
unknown                         21
radial,tubless                  20
runflat                         15
tubeless tyres,radial           14
tubless,radial                   6
tubeless,runflat                 4
tubeless. runflat                1
tubeless radial                  1
tubeless radials tyre            1
tubeless tyres mud terrain       1
Name: count, dtype: int64

In [13]:
X_test.columns

Index(['Insurance Validity', 'Fuel Type', 'Year of Manufacture',
       'transmission', 'ownerNo', 'oem', 'Wheel Size', 'No of Cylinder',
       'Fuel Suppy System', 'Turbo Charger', 'Length', 'Width', 'Height',
       'Wheel Base', 'Drive Type', 'Seating Capacity', 'Steering Type',
       'Tyre Type', 'No Door Numbers', 'Cargo Volumn', 'Mileage',
       'Value Configuration', 'Super Charger', 'Turning Radius',
       'km_winsorized'],
      dtype='object')

In [11]:
y_preds = full_pipeline.predict(X_test)
y_preds

array([8.061466 , 3.7522624, 6.1459374, ..., 7.0887465, 2.9691565,
       7.1064854], shape=(2090,), dtype=float32)

In [12]:
import pickle
with open('car_pipeline.pkl', 'wb') as f:
    pickle.dump(full_pipeline, f)

In [14]:
future_df = X_test.sample(10)
future_df.to_csv('future_df.csv')