# Import libraries

In [None]:
# Import libraries
import numpy as np
import matplotlib.pyplot as mtp
import pandas as pd
import seaborn as sb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [None]:
# Read the vehicle data CSV from the local Data folder into data_set
data_set = pd.read_csv(filepath_or_buffer='./Data/vehicle_data.csv')

In [None]:
# Show the dataset dimensions as a (row count, column count) tuple
data_set.shape

In [None]:
# Display the first five rows to get a feel for the columns and values
data_set.head(n=5)

In [None]:
# Get a statistical summary of the data
data_set.describe()

In [None]:
# Count the missing (null) entries in each column
data_set.isna().sum()

# Preprocess the dataset 

In [None]:
# Keep only the rows whose 'Body' value is exactly 'SUV' or 'SUV / 4x4'
SUV = data_set[data_set['Body'].isin(['SUV', 'SUV / 4x4'])]

## Feature selection

In [None]:
# Discard the columns that are not used as inputs for price prediction
SUV = SUV.drop(
    columns=[
        'Sub_title',
        'Location',
        'Post_URL',
        'Seller_type',
        'published_date',
        'Body',
        'Seller_name',
        'Edition',
        'Description',
        'Condition',
        'Title',
    ]
)

In [None]:
# Inspect the first 15 rows of the remaining columns
SUV.head(n=15)

In [None]:
# Print each column's dtype and non-null count
SUV.info()

In [None]:
# Remove the 'Rs.' currency marker and the comma separators from 'Price'.
# regex=False makes both patterns literal on every pandas version: with the
# pre-2.0 default (regex=True) the '.' in 'Rs.' would match ANY character,
# so the result silently depended on the installed pandas.
SUV['Price'] = (
    SUV['Price']
    .str.replace('Rs.', '', regex=False)
    .str.replace(',', '', regex=False)
)

In [None]:
# Parse 'Price' as numbers (entries that cannot be parsed become NaN), drop
# the rows where parsing failed, then store the column as int64.
# Written as one chain that rebinds SUV instead of using inplace=True, so the
# cell never mutates a frame that was derived from a filter of data_set.
SUV = (
    SUV
    .assign(Price=pd.to_numeric(SUV['Price'], errors='coerce'))
    .dropna(subset=['Price'])
    .astype({'Price': 'int64'})
)

In [None]:
# Strip every non-digit character from 'Capacity' so only the digits remain.
# regex=True is required: since pandas 2.0 str.replace treats the pattern as
# a literal string by default, so '[^0-9]' would match nothing and the column
# would be left unchanged.
SUV['Capacity'] = SUV['Capacity'].str.replace('[^0-9]', '', regex=True)

In [None]:
# Parse 'Capacity' as numbers (entries that cannot be parsed become NaN),
# drop the rows where parsing failed, then store the column as int64.
# Written as one chain that rebinds SUV instead of using inplace=True, so the
# cell never mutates a frame that was derived from a filter of data_set.
SUV = (
    SUV
    .assign(Capacity=pd.to_numeric(SUV['Capacity'], errors='coerce'))
    .dropna(subset=['Capacity'])
    .astype({'Capacity': 'int64'})
)

In [None]:
# Keep only rows whose 'Capacity' lies within 990-6000 (both bounds inclusive)
SUV = SUV[SUV['Capacity'].between(990, 6000)]

In [None]:
# Remove the 'km' unit text and the comma separators from 'Mileage'.
# Both patterns are plain literals, so they are matched with regex=False.
SUV['Mileage'] = (
    SUV['Mileage']
    .str.replace('km', '', regex=False)
    .str.replace(',', '', regex=False)
)

In [None]:
# Parse 'Mileage' as numbers (entries that cannot be parsed become NaN),
# drop the rows where parsing failed, then store the column as int64.
# Written as one chain that rebinds SUV instead of using inplace=True, so the
# cell never mutates a frame that was derived from a filter of data_set.
SUV = (
    SUV
    .assign(Mileage=pd.to_numeric(SUV['Mileage'], errors='coerce'))
    .dropna(subset=['Mileage'])
    .astype({'Mileage': 'int64'})
)

In [None]:
# Mileage readings treated as likely data-entry errors. They fall inside the
# accepted range below, so they have to be excluded explicitly.
values_to_remove = [12345, 11111]

# Keep a row only if its 'Mileage' is not one of those readings and lies
# within 10000-500000 (both bounds inclusive).
SUV = SUV[
    ~SUV['Mileage'].isin(values_to_remove)
    & SUV['Mileage'].between(10000, 500000)
]