In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for folder, _, file_names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, file_name) for file_name in file_names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/cars-dataset/cardekho.csv


In [2]:
import numpy as np
import pandas as pd

import warnings 

import matplotlib.pyplot as plt
import seaborn as sns

from statsmodels.graphics.gofplots import qqplot
import missingno as msno

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler

from sklearn.impute import KNNImputer

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor

from sklearn.ensemble import VotingRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [3]:
# Load cardekho.csv from the Kaggle input directory into the working frame `df`.
df = pd.read_csv("/kaggle/input/cars-dataset/cardekho.csv")

In [4]:
# Show the column labels of the loaded frame.
df.columns

Index(['name', 'year', 'selling_price', 'km_driven', 'fuel', 'seller_type',
       'transmission', 'owner', 'mileage(km/ltr/kg)', 'engine', 'max_power',
       'seats'],
      dtype='object')

### Statistical Analysis 

In [None]:
# (rows, columns) of the frame as loaded, before any cleaning.
df.shape

In [None]:
# Display one randomly drawn row (no seed is set, so it differs between runs).
df.sample(1) 

In [None]:
# Per-column dtype and non-null count.
df.info() 

In [None]:
# Summary statistics of the numeric columns.
df.describe() 

In [None]:
# Percentage of missing values in each column.
df.isna().mean().mul(100)

In [None]:
# Number of rows that exactly repeat an earlier row.
df.duplicated().sum() 

In [None]:
# df.corr() 

In [None]:
# First of all, let's drop the duplicate rows.
# The frame is modified in place and the surviving rows keep their original
# index labels (the index is not renumbered here).

df.drop_duplicates(inplace = True) 

In [None]:
# (rows, columns) after removing duplicate rows.
df.shape

In [None]:
# Silence all warnings for the rest of the session (this filter is global).
warnings.filterwarnings('ignore')
# missingno opens its own figure when no `ax` is passed, so the size has to be
# given to `msno.matrix` itself; a separate `plt.figure(...)` beforehand would
# only leave an empty extra figure behind.
msno.matrix(df, figsize=(5, 10))
plt.show()

## Data Preprocessing 

In [None]:
# Custom Transformer for rounding seats column
class RoundTransformer(BaseEstimator, TransformerMixin):
    """Round selected DataFrame columns to zero decimals.

    The transformer is stateless: ``fit`` learns nothing.

    Parameters
    ----------
    columns
        Column selector passed to ``X[...]`` (a label or a list of labels).
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Do nothing and return ``self`` so the object can be used in a pipeline."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` whose ``columns`` are rounded; ``X`` is not modified."""
        rounded = X.copy()
        selected = rounded[self.columns]
        rounded[self.columns] = selected.round()
        return rounded

In [None]:
# Custom transformer to map car name to brand
class CarBrandTransformer(BaseEstimator, TransformerMixin):
    """Map free-text car names to integer brand codes.

    Parameters
    ----------
    car_brands : dict
        Mapping from lower-case brand keyword to integer code.
    default_brand : int, default=3
        Code returned when no word of a name is a key of ``car_brands``.
        The default reproduces the previously hard-coded fallback.
    """

    def __init__(self, car_brands, default_brand=3):
        self.car_brands = car_brands
        self.default_brand = default_brand

    def get_car_brands(self, name):
        """Return the code of the first word of ``name`` that is a known brand.

        ``name`` is converted with ``str`` and split on whitespace; each word is
        lower-cased before the lookup. Falls back to ``default_brand``.
        """
        for word in str(name).split():
            keyword = word.lower()
            if keyword in self.car_brands:
                return self.car_brands[keyword]
        return self.default_brand

    def fit(self, X, y=None):
        """Do nothing (the mapping is supplied by the caller) and return ``self``."""
        return self

    def transform(self, X):
        """Encode ``X['name']`` and return the codes as an ``(n_rows, 1)`` array."""
        codes = [self.get_car_brands(name) for name in X['name']]
        # reshape keeps the result two-dimensional even when X has no rows.
        return np.array(codes).reshape(-1, 1)

# Brand keyword (lower-case) -> brand code. Codes 0, 1 and 2 single out
# particular makes; every other listed make shares code 3.
car_brands = {
    "maruti": 0,
    "hyundai": 1,
    **dict.fromkeys(("mahindra", "tata"), 2),
    **dict.fromkeys(
        (
            "kia", "ford", "toyota", "volkswagen", "skoda", "honda", "nissan",
            "renault", "chevrolet", "bmw", "audi", "mercedes", "jaguar", "jeep",
        ),
        3,
    ),
}

In [None]:
# Custom transformer for OneHotEncoding and dropping original columns
class CustomOneHotEncoder(BaseEstimator, TransformerMixin):
    """One-hot encode ``columns`` of a DataFrame and swap them for the encoded columns.

    The wrapped ``OneHotEncoder`` drops the first category of every column and
    ignores categories it did not see during ``fit``.

    Parameters
    ----------
    columns : list
        Labels of the columns to encode.
    """

    def __init__(self, columns):
        self.columns = columns
        self.encoder = OneHotEncoder(
            drop='first',
            handle_unknown='ignore',
            sparse_output=False,
        )

    def fit(self, X, y=None):
        """Fit the wrapped encoder on ``X[columns]`` and return ``self``."""
        self.encoder.fit(X[self.columns])
        return self

    def transform(self, X):
        """Return ``X`` without ``columns``, followed by their one-hot columns.

        The encoded columns reuse the index of ``X`` so the concatenation is
        row-aligned.
        """
        dummies = pd.DataFrame(
            self.encoder.transform(X[self.columns]),
            columns=self.encoder.get_feature_names_out(self.columns),
            index=X.index,
        )
        remaining = X.drop(columns=self.columns)
        return pd.concat([remaining, dummies], axis=1)

In [None]:
# Derive an integer `brand` code from the car name, then discard the raw name.
brand_encoder = CarBrandTransformer(car_brands)
df['brand'] = brand_encoder.fit_transform(df[['name']])
df.drop(columns=['name'], inplace=True)

In [None]:
# Encode `owner` as float32 codes. The encoder is named `owner_encoder` rather
# than `ord` so that the built-in `ord()` stays usable in later cells.
# NOTE(review): categories are left on 'auto', so the codes follow the sorted
# category labels, not necessarily an ownership ranking — confirm this is intended.
owner_encoder = OrdinalEncoder(dtype=np.float32, handle_unknown='use_encoded_value', unknown_value=np.nan)
df[['owner']] = owner_encoder.fit_transform(df[['owner']])

In [None]:
# One-hot encode `fuel`: the first category is dropped and categories not seen
# during fit are ignored.
ohe_fuel = OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False)
fuel_ohe_df = ohe_fuel.fit_transform(df[['fuel']])
# The new frame gets a fresh 0..n-1 index rather than the index of `df`.
df_fuel = pd.DataFrame(
    data=fuel_ohe_df,
    columns=ohe_fuel.get_feature_names_out(['fuel']),
)

In [None]:
# One-hot encode `transmission`: the first category is dropped and categories
# not seen during fit are ignored.
ohe_transmission = OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False)
transmission_ohe_df = ohe_transmission.fit_transform(df[['transmission']])
# The new frame gets a fresh 0..n-1 index rather than the index of `df`.
df_transmission = pd.DataFrame(
    data=transmission_ohe_df,
    columns=ohe_transmission.get_feature_names_out(['transmission']),
)


In [None]:
# One-hot encode `seller_type`: the first category is dropped and categories
# not seen during fit are ignored.
ohe_seller_type = OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False)
seller_type_ohe_df = ohe_seller_type.fit_transform(df[['seller_type']])
# The new frame gets a fresh 0..n-1 index rather than the index of `df`.
df_seller_type = pd.DataFrame(
    data=seller_type_ohe_df,
    columns=ohe_seller_type.get_feature_names_out(['seller_type']),
)


In [None]:
# Replace `fuel` by its one-hot columns. The index is renumbered first so the
# rows line up positionally with `df_fuel`, which has a 0..n-1 index.
df_without_fuel = df.drop(columns=['fuel']).reset_index(drop=True)
df = pd.concat([df_without_fuel, df_fuel], axis=1)
df.shape

In [None]:
# Replace `transmission` by its one-hot columns (index renumbered so the rows
# line up positionally with `df_transmission`).
df_without_transmission = df.drop(columns=['transmission']).reset_index(drop=True)
df = pd.concat([df_without_transmission, df_transmission], axis=1)

In [None]:
# Replace `seller_type` by its one-hot columns (index renumbered so the rows
# line up positionally with `df_seller_type`).
df_without_seller_type = df.drop(columns=['seller_type']).reset_index(drop=True)
df = pd.concat([df_without_seller_type, df_seller_type], axis=1)
df.shape

In [None]:
# Convert `max_power` to a numeric dtype; entries that cannot be parsed become
# NaN (errors='coerce') and are left for the imputation step.
df['max_power'] = pd.to_numeric(df['max_power'], errors='coerce')

In [None]:
# Fill missing feature values from the 5 nearest rows. The target column is kept
# out of the imputer: otherwise an imputed feature would encode the very price
# it is later used to predict (target leakage).
# NOTE(review): assumes `selling_price` itself has no missing values — confirm.
# NOTE(review): imputed values are neighbour averages and may be fractional
# (e.g. for `seats`) — consider rounding such columns afterwards.
imputer = KNNImputer(n_neighbors=5)
feature_columns = df.columns.drop('selling_price')
df[feature_columns] = imputer.fit_transform(df[feature_columns])

## Data Visualization 

In [None]:

categorical_columns = ['owner', 'seats']

# One row of axes per column: pie chart on the left, count plot on the right.
fig, axes = plt.subplots(2, 2, figsize=(12, 12))

for (pie_ax, count_ax), column in zip(axes, categorical_columns):
    # Share of each distinct value.
    df[column].value_counts().plot(kind='pie', ax=pie_ax, autopct='%0.2f%%')
    pie_ax.set_title(f'{column} Distribution')
    pie_ax.set_ylabel('Distribution')

    # Absolute number of rows per distinct value.
    sns.countplot(x=column, data=df, ax=count_ax)
    count_ax.set_title(f'{column} Counts')
    count_ax.set_ylabel('Count')
    count_ax.grid(True)

plt.tight_layout()
plt.show()

In [None]:
# Silence all warnings for the rest of the session (this filter is global).
warnings.filterwarnings("ignore")

# Select the first eight numerical columns
numerical_features = df.select_dtypes(include=['number']).columns[:8]

# One row of plots per feature
num_rows = len(numerical_features)

# squeeze=False keeps `axes` two-dimensional even when only one feature is
# selected, so `axes[i]` is always a row of four Axes.
fig, axes = plt.subplots(num_rows, 4, figsize=(16, 4*num_rows), squeeze=False)

# Single box colour; passing `palette` without `hue` is deprecated in recent
# seaborn releases, and this is the colour that palette would have produced.
box_color = sns.color_palette('Set2')[0]

# Loop over each numerical feature and create plots
for i, feature in enumerate(numerical_features):
    ax1, ax2, ax3, ax4 = axes[i]
    
    # Histogram
    sns.histplot(df[feature], bins=40, ax=ax1)
    ax1.set_title(f'Histogram of {feature}')

    # KDE plot with mean and median markers
    sns.kdeplot(df[feature], ax=ax2, fill=True, color='orange')
    mean_val = df[feature].mean()
    median_val = df[feature].median()
    ax2.axvline(mean_val, linestyle='--', color='red', label=f"Mean: {mean_val:.2f}")
    ax2.axvline(median_val, linestyle='--', color='black', label=f"Median: {median_val:.2f}")
    ax2.legend()
    ax2.set_title(f'KDE Plot of {feature}')

    # Box plot
    sns.boxplot(x=df[feature], orient='h', color=box_color, ax=ax3)
    ax3.set_title(f'Box Plot of {feature}')

    # Q-Q plot against a normal distribution with a standardized reference line
    qqplot(df[feature], line='s', ax=ax4)
    ax4.set_title(f'Q-Q Plot of {feature}')

plt.tight_layout()
plt.show()


## Capping outliers for better results

In [None]:
def cap_outliers(df, column, factor=1.5):
    """Cap the values of ``df[column]`` at its IQR fences.

    Values below ``Q1 - factor * IQR`` are raised to that bound and values above
    ``Q3 + factor * IQR`` are lowered to that bound; everything else, including
    NaN, is left unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column. It is modified in place.
    column : str
        Label of a numeric column.
    factor : float, default=1.5
        Multiple of the inter-quartile range that defines the fences.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenient reassignment.

    Raises
    ------
    ValueError
        If ``factor`` is negative (the lower fence would exceed the upper one).
    """
    if factor < 0:
        raise ValueError("factor must be non-negative")

    values = df[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    spread = factor * (q3 - q1)
    lower_bound = q1 - spread
    upper_bound = q3 + spread

    # Comparisons against NaN are False, so missing values fall through untouched.
    df[column] = np.where(values < lower_bound, lower_bound,
                          np.where(values > upper_bound, upper_bound, values))

    return df

# Columns whose extreme values are capped at the IQR fences.
features_with_outliers = ['km_driven', 'mileage(km/ltr/kg)', 'engine', 'max_power']

# Cap one column after another, rebinding `df` to the returned frame each time.
for feature in features_with_outliers:
    df = cap_outliers(df, feature)

# Summary statistics after capping.
df.describe()


## Transforming some columns using log transformations 

In [None]:
# Natural-log transform of four features, applied column by column.
# Re-running this cell takes the logarithm again, so run it once per session.
for log_column in ('km_driven', 'engine', 'max_power', 'mileage(km/ltr/kg)'):
    df[log_column] = np.log(df[log_column])

## Splitting the data 

In [None]:
# Features are every column except the target; the target is `selling_price`.
X = df.drop(columns = ['selling_price']) 
y = df['selling_price']

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=0) 

## Scaling the data  

In [None]:
# Standardize the features. The scaler is fitted on the training rows only and
# then applied unchanged to the test rows. Both results are NumPy arrays, so
# `X_train` / `X_test` lose their column labels from here on.
scaler = StandardScaler() 
X_train = scaler.fit_transform(X_train) 
X_test = scaler.transform(X_test) 

## Voting Regressor 

In [None]:
# Fixed seed so the forest (bootstrap samples and per-split feature
# sub-sampling) is reproducible; 0 matches the seed of the train/test split.
rfr = RandomForestRegressor(max_depth=None, max_features=0.6, max_samples=1.0, n_estimators=60, random_state=0)

In [None]:
# max_features='log2' makes every split consider a random subset of features,
# so a fixed seed is needed for reproducible results; 0 matches the seed of the
# train/test split.
gbr = GradientBoostingRegressor(learning_rate=0.1, max_depth=5, max_features='log2', min_samples_leaf=1, min_samples_split=2, n_estimators=500, subsample=1.0, random_state=0)

In [None]:
# Distance-weighted 7-nearest-neighbour regressor using a ball tree for lookup.
# Being distance-based, it relies on the features having been standardized.
knn = KNeighborsRegressor(algorithm='ball_tree', leaf_size=20, n_neighbors=7, weights='distance')

In [None]:
from sklearn.ensemble import VotingRegressor

# Ensemble that averages the predictions of the three base regressors
# (no weights are given, so each contributes equally).
base_models = [('rf', rfr), ('gb', gbr), ('knn', knn)]
voting_reg = VotingRegressor(estimators=base_models)


In [None]:
# Train all base regressors of the ensemble on the standardized training split.
voting_reg.fit(X_train, y_train)


In [None]:
# R^2 of the ensemble on the held-out test split.
y_pred = voting_reg.predict(X_test)
test_r2 = r2_score(y_test, y_pred)
print(test_r2)

In [None]:
# 10-fold cross-validated R^2. `X` is unscaled, so the scaler is placed inside
# the pipeline: every fold is standardized with statistics from its own training
# part only, and the distance-based KNN member gets the same kind of input as in
# the hold-out evaluation.
cv_model = Pipeline([('scaler', StandardScaler()), ('voting', voting_reg)])
ans = cross_val_score(cv_model, X, y, cv=10, scoring='r2')
print(ans, np.mean(ans))
