# Import libraries

In [None]:
# Import libraries
import numpy as np
import matplotlib.pyplot as mtp
import pandas as pd
import seaborn as sb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [None]:
# Read the raw vehicle listings from the CSV file into the data_set DataFrame
# (the path is relative to the notebook's working directory)
data_set = pd.read_csv('./Data/vehicle_data.csv')

In [None]:
# Show the (row count, column count) of the raw data
data_set.shape

In [None]:
# Preview the first 5 rows of the raw data
data_set.head(5)

In [None]:
# Get summary statistics of the data
data_set.describe()

In [None]:
# Count the missing (null) values in each column
data_set.isnull().sum()

# Preprocess the dataset 

In [None]:
# Keep only the listings whose 'Body' is exactly 'SUV' or 'SUV / 4x4'
SUV = data_set[data_set['Body'].isin(['SUV', 'SUV / 4x4'])]

Feature selection

In [None]:
# Remove the columns that are not used as inputs for price prediction
SUV = SUV.drop(
    columns=[
        'Sub_title',
        'Location',
        'Post_URL',
        'Seller_type',
        'published_date',
        'Body',
        'Seller_name',
        'Edition',
        'Description',
        'Condition',
        'Title',
    ]
)

In [None]:
# Preview the first 15 rows after dropping the columns
SUV.head(15)

In [None]:
# Print the column dtypes and non-null counts
SUV.info()

In [None]:
# Strip the 'Rs.' prefix and the thousands separators from 'Price'.
# regex=False makes both replacements literal. The default of `regex` differs
# between pandas versions, and as a regular expression the '.' in 'Rs.' matches
# any character, so it could remove the first digit of a price.
SUV['Price'] = (
    SUV['Price']
    .str.replace('Rs.', '', regex=False)
    .str.replace(',', '', regex=False)
)

In [None]:
# Parse 'Price' as numbers (unparseable values become NaN), drop the rows
# where parsing failed, then store the column as 64-bit integers
SUV = (
    SUV
    .assign(Price=pd.to_numeric(SUV['Price'], errors='coerce'))
    .dropna(subset=['Price'])
    .astype({'Price': 'int64'})
)

In [None]:
# Keep only the digits of 'Capacity'.
# The pattern is a regular expression, so regex=True has to be explicit:
# pandas >= 2.0 treats the pattern as a literal string by default, which
# would leave the column unchanged.
SUV['Capacity'] = SUV['Capacity'].str.replace(r'[^0-9]', '', regex=True)

In [None]:
# Parse 'Capacity' as numbers (unparseable values become NaN), drop the rows
# where parsing failed, then store the column as 64-bit integers
SUV = (
    SUV
    .assign(Capacity=pd.to_numeric(SUV['Capacity'], errors='coerce'))
    .dropna(subset=['Capacity'])
    .astype({'Capacity': 'int64'})
)

In [None]:
# Keep only the rows whose 'Capacity' lies between 990 and 6000 (both included)
SUV = SUV[SUV['Capacity'].between(990, 6000)]

In [None]:
# Strip the 'km' unit and the thousands separators from 'Mileage'
SUV['Mileage'] = (
    SUV['Mileage']
    .str.replace('km', '')
    .str.replace(',', '')
)

In [None]:
# Parse 'Mileage' as numbers (unparseable values become NaN), drop the rows
# where parsing failed, then store the column as 64-bit integers
SUV = (
    SUV
    .assign(Mileage=pd.to_numeric(SUV['Mileage'], errors='coerce'))
    .dropna(subset=['Mileage'])
    .astype({'Mileage': 'int64'})
)

In [None]:
# Mileage readings that are treated as data-entry errors
values_to_remove = [12345, 11111]

# Keep a row only when its mileage is not one of the values above and lies
# between 10000 and 500000 (both bounds included)
SUV = SUV[
    ~SUV['Mileage'].isin(values_to_remove)
    & SUV['Mileage'].between(10000, 500000)
]

In [None]:
# List the distinct values of the 'Brand' column
SUV['Brand'].unique()

In [None]:
# Merge different spellings of the same brand into one spelling
SUV['Brand'] = (
    SUV['Brand']
    .str.replace('Range-Rover', 'Land Rover')
    .str.replace('Land-Rover', 'Land Rover')
    .str.replace('Mercedes-Benz', 'Mercedes Benz')
    .str.replace('Ssangyong', 'Ssang Yong')
)

In [None]:
# Number of listings per brand
brand_counts = SUV['Brand'].value_counts()

# Brands with fewer than 10 listings are removed from the data
brands_to_keep = brand_counts.loc[lambda counts: counts >= 10].index
SUV = SUV.loc[SUV['Brand'].isin(brands_to_keep)]

# Brands with fewer than 70 listings are pooled under the label 'Other Brand'
SUV['Brand'] = SUV['Brand'].where(
    ~SUV['Brand'].isin(brand_counts[brand_counts < 70].index),
    'Other Brand',
)

In [None]:
# Show how many rows each remaining 'Brand' value has
SUV['Brand'].value_counts()

In [None]:
# Pool related brands under a shared label.
# Each entry is (regular expression matched against 'Brand', replacement label);
# the entries are applied one after another, in this order.
brand_groups = [
    ('Daihatsu|Isuzu|Suzuki', 'Other Japanese'),
    ('Audi|MG|Jeep|Ford|Peugeot', 'Other Europe'),
    ('Kia|Ssang Yong', 'Other Korean'),
    ('DFSK|Mahindra|Tata', 'Indian'),
]

for brand_pattern, group_label in brand_groups:
    SUV.loc[SUV['Brand'].str.contains(brand_pattern), 'Brand'] = group_label

In [None]:
# For the pooled brand groups the individual model name is discarded:
# 'Model' becomes 'Other Model' for 'Other Brand' and the brand label itself
# for the other groups. Rows of every other brand keep their current 'Model'.
SUV['Model'] = SUV['Brand'].map({
    'Other Brand': 'Other Model',
    'Other Japanese': 'Other Japanese',
    'Other Europe': 'Other Europe',
    'Other Korean': 'Other Korean',
    'Indian': 'Indian',
}).fillna(SUV['Model'])

In [None]:
# List the distinct values of the 'Model' column
SUV['Model'].unique()

In [None]:
# Collapse free-text model names into canonical model labels.
#
# Each rule is (regular expression, case_sensitive, canonical label). The rules
# are applied one after another in this order, so a row relabelled by an
# earlier rule is matched against the later rules under its new label.
model_rules = [
    ('Prado|PRADO|Land Cruiser|LAND CRUISER|Land cruiser', True, 'Land Cruiser Prado'),
    ('Vezel', False, 'Vezel'),
    ('Montero|MONTERO|montero', True, 'Montero'),
    ('Defender', False, 'Defender'),
    ('Discovery', False, 'Discovery'),
    ('Outlander', False, 'Outlander'),
    ('Pajero|PAJERO', True, 'Pajero'),
    ('Range Rover|RANGE ROVER|velar|Velar', True, 'Range Rover'),
    ('CHR|Chr|ChR', True, 'CHR'),
    ('CRV|C-RV|Crv', True, 'CRV'),
    ('X-Trail|X trail|X-trail|X Trail|XTrail|X TRAIL|X Trail|X-TRAIL', True, 'X-Trail'),
    ('X1|x1', True, 'X1'),
    ('X3|x3', True, 'X3'),
    ('X5|x5', True, 'X5'),
    ('Harrier', False, 'Harrier'),
    ('4Dr|4DR|4dr|4dR', True, '4DR'),
    ('Eclipse', False, 'Eclipse'),
    ('Tucson|TUCSON', True, 'Tucson'),
    ('Rexton|REXTON|Ssangyong|Kyron', True, 'Rexton'),
    ('Sorento', False, 'Sorento'),
    ('Sportage', False, 'Sportage'),
    ('Escudo', False, 'Escudo'),
    ('Rush', False, 'Rush'),
    ('Rav', False, 'Rav'),
    ('Korando', False, 'Korando'),
    ('Fortuner', False, 'Fortuner'),
    ('Hrv|H-RV|HR-V|HRV', False, 'HRV'),
    ('santa fe', False, 'Santa Fe'),
    ('jimny', False, 'Jimny'),
    ('jeep', False, 'Jeep'),
    ('zs', False, 'ZS'),
    ('juke', False, 'Juke'),
    ('patrol', False, 'Patrol'),
    ('raize', False, 'Raize'),
    ('gypsy', False, 'Gypsy'),
    ('cami', False, 'Cami'),
]

for model_pattern, case_sensitive, model_label in model_rules:
    # na=False: a missing 'Model' counts as "no match". Without it the mask
    # contains NaN and boolean indexing with .loc raises.
    matches = SUV['Model'].str.contains(model_pattern, case=case_sensitive, na=False)
    SUV.loc[matches, 'Model'] = model_label


In [None]:
# Number of listings per model
model_counts = SUV['Model'].value_counts()

# Models with at least 30 listings
models_to_keep = model_counts.loc[lambda counts: counts >= 30].index
print (models_to_keep)

# Drop every row whose model is not among them
SUV = SUV.loc[SUV['Model'].isin(models_to_keep)]

In [None]:
# Rows per model, most frequent model first
model_count = (
    SUV
    .groupby('Model')['Model']
    .count()
    .sort_values(ascending=False)
)

# Print the per-model counts, then display how many distinct models remain
print(model_count)
len(model_count)

In [None]:
# Print the column dtypes and non-null counts after the conversions above
SUV.info()

In [None]:
# Show the distinct 'Transmission' values and how many rows each has
SUV['Transmission'].value_counts()

In [None]:
# Show the distinct 'Fuel' values and how many rows each has
SUV['Fuel'].value_counts()

In [None]:
# One LabelEncoder per column, so each fitted encoder stays available afterwards
transmission_l, fuel_l = LabelEncoder(), LabelEncoder()

# Add integer-coded copies of 'Transmission' and 'Fuel' as new columns
SUV = SUV.assign(
    Transmission_N=transmission_l.fit_transform(SUV['Transmission']),
    Fuel_N=fuel_l.fit_transform(SUV['Fuel']),
)

In [None]:
# Correlation of every numeric column with 'Price'.
# Only the numeric columns are passed to corr(): the frame still holds text
# columns such as 'Brand' and 'Model', and pandas >= 2.0 raises on those
# instead of silently skipping them.
SUV.select_dtypes(include='number').corr()['Price']

In [None]:
# Remove 'Transmission' and 'Fuel' together with their encoded copies
# (original rationale: their correlation with price is very small)
SUV = SUV.drop(columns=['Transmission', 'Fuel', 'Fuel_N', 'Transmission_N'])

In [None]:
# Show the (row count, column count) before one-hot encoding
SUV.shape

# Encoding

In [None]:
# One-hot encode the frame with pandas.get_dummies
# (no column list is given, so pandas decides which columns to expand)
SUV=pd.get_dummies(SUV)

In [None]:
# Preview the first 50 rows of the encoded data
SUV.head(50)

In [None]:
# Show the (row count, column count) after one-hot encoding
SUV.shape

# Model Building

In [None]:
# Features are every column except the target; the target is 'Price'
x = SUV.drop('Price', axis=1)
y = SUV['Price']

# Hold out 20% of the rows for testing.
# The fixed random_state makes the split, and every score computed from it,
# the same on each run of the notebook.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)
x_train.shape, x_test.shape

In [None]:
# Function to fit a model and print its score on held-out data

def model_acc(model, x_tr=None, y_tr=None, x_te=None, y_te=None):
    """Fit ``model`` on training data and print its score on test data.

    The printed value is whatever ``model.score`` returns for the test data.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``score(X, y)``
    x_tr, y_tr : training features and target; default to the notebook-level
        ``x_train`` / ``y_train`` when omitted
    x_te, y_te : test features and target; default to the notebook-level
        ``x_test`` / ``y_test`` when omitted

    Returns
    -------
    None. The result is printed as ``<model>----><score>``.
    """
    # Fall back to the notebook's global split so model_acc(model) keeps working
    x_tr = x_train if x_tr is None else x_tr
    y_tr = y_train if y_tr is None else y_tr
    x_te = x_test if x_te is None else x_te
    y_te = y_test if y_te is None else y_te

    model.fit(x_tr, y_tr)
    accuracy = model.score(x_te, y_te)
    print(str(model) + '---->' + str(accuracy))

### Linear Regression


In [None]:
from sklearn.linear_model import LinearRegression
# Linear regression with default settings, fitted and scored through model_acc
lr=LinearRegression()
model_acc(lr)

### Lasso Regression


In [None]:
from sklearn.linear_model import Lasso
# Lasso regression with default settings, fitted and scored through model_acc
ls=Lasso()
model_acc(ls)

### Decision Tree Regressor

In [None]:
from sklearn.tree import DecisionTreeRegressor
# Decision tree with default settings, fitted and scored through model_acc.
# The fixed random_state makes the fitted tree and its score repeatable.
dt = DecisionTreeRegressor(random_state=42)
model_acc(dt)

### Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Random forest with default settings, fitted and scored through model_acc.
# The fixed random_state makes the fitted forest and its score repeatable.
rf = RandomForestRegressor(random_state=42)
model_acc(rf)

# Hyperparameter tuning

## Linear Regression


In [None]:
from sklearn.model_selection import GridSearchCV

# Search only over settings that change the fitted model.
# 'n_jobs' is deliberately not part of the grid: it only sets how many workers
# are used, so searching over it multiplies the number of fits without being
# able to change any score.
parameters = {'fit_intercept': [True, False]}

# Cross-validated search on the training split (GridSearchCV defaults for cv and scoring)
grid_obj_lr = GridSearchCV(estimator=lr, param_grid=parameters)
grid_fit_lr = grid_obj_lr.fit(x_train, y_train)

In [None]:
# Take the best estimator found by the search and score it on the test split
best_model_lr = grid_fit_lr.best_estimator_
best_model_lr.score(x_test, y_test)

## Decision Tree Regression

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate values for each tree hyperparameter (5 x 5 x 5 combinations)
params = dict(
    max_depth=[2, 4, 6, 8, 10],
    min_samples_split=[2, 5, 10, 15, 20],
    min_samples_leaf=[1, 2, 4, 8, 16],
)

# 5-fold search on the training split, ranking candidates by negated mean squared error
grid_obj_dt = GridSearchCV(dt, params, scoring='neg_mean_squared_error', cv=5)
grid_fit_dt = grid_obj_dt.fit(x_train, y_train)


In [None]:
# Take the best estimator found by the search and score it on the test split
# (the search ranked candidates by negated MSE; .score() uses the estimator's own metric)
best_model_dt = grid_fit_dt.best_estimator_
best_model_dt.score(x_test, y_test)

## Random Forest Regression

In [None]:

from sklearn.model_selection import GridSearchCV

# Candidate forest sizes and split criteria (3 x 3 combinations)
parameters = dict(
    n_estimators=[10, 50, 100],
    criterion=['squared_error', 'absolute_error', 'poisson'],
)

# Cross-validated search on the training split (GridSearchCV defaults for cv and scoring)
grid_obj_rf = GridSearchCV(rf, parameters)
grid_fit_rf = grid_obj_rf.fit(x_train, y_train)

In [None]:

# Take the best estimator found by the search and score it on the test split
best_model_rf = grid_fit_rf.best_estimator_
best_model_rf.score(x_test, y_test)

# Predict

## Random Forest Regression

In [None]:
# Predict prices for the test split with the tuned random forest and display them
predict_price_rf = best_model_rf.predict(x_test)
predict_price_rf

In [None]:
from sklearn.metrics import r2_score

# R² of the tuned random forest on the held-out test split
# (y_test against the predictions made for x_test)
test_r2_rf = r2_score(y_test, predict_price_rf)

# Old name kept so existing references keep working; despite the word "train"
# it has always held the test-set score computed above.
train_accuracy_rf = test_r2_rf
test_r2_rf