In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import warnings

# Suppress every warning for the rest of the session to keep cell output short.
# NOTE(review): this also hides deprecation warnings raised by the ML libraries
# used below — consider narrowing the filter.
warnings.filterwarnings('ignore')

In [None]:
# Load the dataset CSV; the first column of the file becomes the DataFrame index.
df = pd.read_csv('/kaggle/input/cardekho-used-car-data/cardekho_dataset.csv', index_col=[0])

In [None]:
# Preview the first rows of the loaded data.
df.head()

## Data cleaning

### Handling Missing Values

- Understand the dataset
- Handle missing values and duplicates
- Check the data types

In [None]:
# Count the missing (null) values in each column.
df.isnull().sum()

In [None]:
# Remove unnecessary columns.
# Reassigning the result (instead of inplace=True) together with errors='ignore'
# keeps this cell safe to re-run: a second execution no longer raises KeyError
# because the columns were already dropped.
df = df.drop(columns=['car_name', 'brand'], errors='ignore')

In [None]:
# Preview the data again after the column drop.
df.head()

In [None]:
# Inspect the distinct values of the 'model' column.
df['model'].unique()

In [None]:
# Numerical features: every column whose dtype is not object.
numeric_features = [column for column, dtype in df.dtypes.items() if dtype != 'O']
print(f'Number of numerical features : {len(numeric_features)}')

In [None]:
# Categorical features: every column stored with the object dtype.
categorical_features = [column for column, dtype in df.dtypes.items() if dtype == 'O']
print(f'Number of categorical features : {len(categorical_features)}')

In [None]:
# Discrete features: numeric columns holding at most 25 distinct values.
discrete_features = [
    column
    for column in numeric_features
    if df[column].unique().size <= 25
]
print(f'Number of discrete features : {len(discrete_features)}')

In [None]:
# Continuous features: the numeric columns that were not classified as discrete.
continuous_features = list(
    filter(lambda column: column not in discrete_features, numeric_features)
)
print(f'Number of continuous features : {len(continuous_features)}')

In [None]:
# Separate the target (dependent variable) from the remaining feature columns.
target_column = 'selling_price'
y = df[target_column]
x = df.drop(columns=[target_column])

In [None]:
# Inspect the first 50 values of the target series.
y.head(50)

In [None]:
# Count the missing values in each feature column.
x.isnull().sum()

In [None]:
# Preview the first rows of the feature frame.
x.head()

In [None]:
# Number of distinct (non-null) values in the 'model' column.
df['model'].value_counts().shape[0]

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer
# Replace each 'model' string with an integer code learned from the column itself.
le = LabelEncoder()
le.fit(x['model'])
x['model'] = le.transform(x['model'])

In [None]:
# Preview the features again now that 'model' holds integer codes.
x.head()

In [None]:
# Number of distinct (non-null) values in each of the three listed columns.
tuple(
    len(df[column].value_counts())
    for column in ('seller_type', 'transmission_type', 'fuel_type')
)

In [None]:
# Create the column transformer
from sklearn.compose import ColumnTransformer
# StandardScaler is imported from its public home, sklearn.preprocessing.
# sklearn.discriminant_analysis only exposes it as an internal import.
from sklearn.preprocessing import OneHotEncoder, StandardScaler


# Every non-object column of `x` is standardised.
# NOTE(review): this includes the label-encoded 'model' column, whose integer
# codes are then scaled as if they were a continuous quantity — confirm intended.
num_features = x.select_dtypes(exclude='object').columns
onehot_columns = ['seller_type', 'transmission_type', 'fuel_type']

numeric_transformer = StandardScaler()
# drop='first' removes one indicator column per feature to avoid redundancy.
oh_transformer = OneHotEncoder(drop='first')

# Columns not named in either transformer are passed through unchanged.
preprocessor = ColumnTransformer(
    [
        ("oneHotEncoder", oh_transformer, onehot_columns),
        ("standardScaler", numeric_transformer, num_features)
    ], remainder='passthrough')

In [None]:
from sklearn.model_selection import train_test_split


In [None]:
# Split BEFORE fitting the preprocessor so that the scaling statistics and the
# one-hot categories are learned from the training rows only. Fitting on the
# full dataset first would leak information from the test rows into training.
from sklearn.model_selection import train_test_split

x_train_raw, x_test_raw, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

# `x` itself is left untouched, so this cell gives the same result when re-run.
x_train = preprocessor.fit_transform(x_train_raw)
# NOTE(review): transform() raises if the test rows contain a category that never
# appears in the training rows (the encoder uses the default handle_unknown).
x_test = preprocessor.transform(x_test_raw)
x_train.shape, x_test.shape

## Model Training 

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRFRegressor
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error,classification_report

In [None]:
# Helper used to evaluate a fitted model
def evaluation(true, predicted):
    """Compute regression error metrics for a set of predictions.

    Parameters
    ----------
    true : array-like
        Ground-truth target values.
    predicted : array-like
        Values predicted by the model, aligned with ``true``.

    Returns
    -------
    tuple
        ``(mae, rmse, r2_square)``: mean absolute error, root mean squared
        error and R2 score, in that order.
    """
    mae = mean_absolute_error(true, predicted)
    # The MSE is only needed as the input of the square root.
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [None]:
# Model training
# Baseline comparison: every estimator is fitted with default hyperparameters.
# Estimators with internal randomness get a fixed seed so the reported scores
# are the same on every run.
models = {
    'Linear Regression': LinearRegression(),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'K-Neighbour Regression': KNeighborsRegressor(),
    'Random Forest Regressor': RandomForestRegressor(random_state=42),
    'Decision Tree Regression': DecisionTreeRegressor(random_state=42),
    'Adaboost Regressor': AdaBoostRegressor(random_state=42),
    'Gradient boost regressor': GradientBoostingRegressor(random_state=42),
    # NOTE(review): XGBRFRegressor is XGBoost's random-forest variant, not the
    # boosted XGBRegressor that the label suggests — confirm which was intended.
    'XGboost regressor': XGBRFRegressor(random_state=42)
}

for model_name, model in models.items():
    model.fit(x_train, y_train)

    # Predict on both splits so train and test scores can be compared
    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x_test)

    # Evaluate train and test dataset
    model_train_mae, model_train_rmse, model_train_r2 = evaluation(y_train, y_train_pred)

    model_test_mae, model_test_rmse, model_test_r2 = evaluation(y_test, y_test_pred)

    print(model_name)

    print('model performance for training dataset')
    print('- Root mean squared error: {:.3f}'.format(model_train_rmse))
    print('- Mean absolute error: {:.3f}'.format(model_train_mae))
    # Same format spec as the test-set line (the original had a stray space flag).
    print('- R2 score: {:.3f}'.format(model_train_r2))

    print('===============================')

    print('model performance for test dataset')
    print('- Root mean squared error: {:.3f}'.format(model_test_rmse))
    print('- Mean absolute error: {:.3f}'.format(model_test_mae))
    print('- R2 score: {:.3f}'.format(model_test_r2))

    print('_'*35)
    print('\n')

    

In [None]:
## Hyperparameter tuning
# Search space for RandomForestRegressor.
# max_features uses None (consider all features) in place of the string 'auto'.
# For regressors 'auto' had the same meaning, but current scikit-learn rejects it,
# so every sampled candidate containing it would fail to fit.
rf_params = {'max_depth': [5, 8, 15, None, 10],
             'max_features': [5, 7, None, 8],
             'min_samples_split': [2, 9, 16, 22],
             'n_estimators': [100, 200, 500, 800, 1000]}

# Search space for GradientBoostingRegressor.
# The legacy criterion name 'mse' is omitted: it was an alias of 'squared_error'
# and is rejected by current scikit-learn.
gradient_params = {'loss': ['squared_error', 'huber', 'absolute_error'],
                   'criterion': ['squared_error', 'friedman_mse'],
                   'min_samples_split': [2, 10, 15, 20],
                   'n_estimators': [100, 200, 500, 1000],
                   'max_depth': [None, 5, 8, 10, 15],
                   'learning_rate': [0.1, 0.01, 0.02, 0.03]
                   }


In [None]:
# Models list for hyperparameter tuning
# Each entry is (label, estimator, search space). The estimators are seeded so
# that repeated searches score the same candidates identically.
randomcv_models = [('RF', RandomForestRegressor(random_state=42), rf_params),
                   ('gadient', GradientBoostingRegressor(random_state=42), gradient_params)
                   ]

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Run a randomized search for every (name, estimator, search space) entry and
# keep the best parameter combination found for each, keyed by its name.
model_param = {}
for name, model, params in randomcv_models:
    # Called `search` rather than `random` so the name cannot be confused with,
    # or mask, the standard-library `random` module.
    search = RandomizedSearchCV(estimator=model,
                                param_distributions=params,
                                n_iter=50,
                                cv=3,
                                verbose=2,
                                n_jobs=-1,
                                random_state=42)  # same candidates sampled on every run
    search.fit(x_train, y_train)
    model_param[name] = search.best_params_

for model_name, best_params in model_param.items():
    print(f'============= Best params for {model_name} ==============')
    print(best_params)

In [None]:
# Model training
# Refit the two tuned estimators and report their train/test metrics.
# NOTE(review): the hyperparameters are hard-coded — presumably copied from an
# earlier search run; confirm they still match the contents of `model_param`.
# A fixed seed makes the reported scores reproducible.
models_hyper = {
    'Random Forest Regressor': RandomForestRegressor(n_estimators=1000, min_samples_split=2, max_features=8, max_depth=None, random_state=42),
    'geadientboost regressor': GradientBoostingRegressor(n_estimators=1000, loss='huber', min_samples_split=2, max_depth=8, learning_rate=0.03, criterion='squared_error', random_state=42)
}

for model_name, model in models_hyper.items():
    model.fit(x_train, y_train)

    # Predict on both splits so train and test scores can be compared
    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x_test)

    # Evaluate train and test dataset
    model_train_mae, model_train_rmse, model_train_r2 = evaluation(y_train, y_train_pred)

    model_test_mae, model_test_rmse, model_test_r2 = evaluation(y_test, y_test_pred)

    print(model_name)

    print('model performance for training dataset')
    print('- Root mean squared error: {:.3f}'.format(model_train_rmse))
    print('- Mean absolute error: {:.3f}'.format(model_train_mae))
    # Same format spec as the test-set line (the original had a stray space flag).
    print('- R2 score: {:.3f}'.format(model_train_r2))

    print('===============================')

    print('model performance for test dataset')
    print('- Root mean squared error: {:.3f}'.format(model_test_rmse))
    print('- Mean absolute error: {:.3f}'.format(model_test_mae))
    print('- R2 score: {:.3f}'.format(model_test_r2))

    print('_'*35)
    print('\n')