# Pak Garage Notebook
#### This is a Notebook to develop and test the ML model that we will use for production in our application!

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# Read the vehicle listings CSV into the working DataFrame.
df = pd.read_csv(filepath_or_buffer='dataset/vehicledata.csv')

### Our Initial Dataframe

In [3]:
# Preview the first five rows (head() defaults to n=5).
df.head()

Unnamed: 0,Ad No,Name,Price,Model Year,Location,Mileage,Registered City,Engine Type,Engine Capacity,Transmission,Color,Assembly,Body Type,Features,Last Updated,URL
0,4096758,Toyota Vitz F 1.0 2017,2385000.0,2017,"G- 8, Islamabad Islamabad",9869,Un-Registered,Petrol,1000 cc,Automatic,Silver,Imported,Hatchback,"ABS, AM/FM Radio, Air Bags, Air Conditioning,...","Jul 11, 2020",https://www.pakwheels.com/used-cars/toyota-vit...
1,4168305,Toyota Corolla GLi Automatic 1.3 VVTi 2019,111000.0,2019,Peshawar KPK,11111,Islamabad,Petrol,1300 cc,Automatic,White,Local,Sedan,"ABS, AM/FM Radio, Air Bags, Air Conditioning,...","Jul 12, 2020",https://www.pakwheels.com/used-cars/toyota-cor...
2,4168298,Suzuki Alto VXL 2019,1530000.0,2019,"Akora Khattak, Nowshera KPK",17500,Un-Registered,Petrol,660 cc,Automatic,White,Local,Hatchback,"ABS, AM/FM Radio, Air Bags, Air Conditioning,...","Jul 12, 2020",https://www.pakwheels.com/used-cars/suzuki-alt...
3,4168307,Suzuki Alto VXR 2019,1650000.0,2019,"Abdullahpur, Faisalabad Punjab",9600,Lahore,Petrol,660 cc,Manual,White,Local,Hatchback,"AM/FM Radio, Air Bags, Air Conditioning, DVD ...","Jul 12, 2020",https://www.pakwheels.com/used-cars/suzuki-alt...
4,4168306,Toyota Corolla XLi VVTi 2010,1435000.0,2010,"9th Avenue, Islamabad Islamabad",120000,Islamabad,Petrol,1300 cc,Manual,Black,Local,Sedan,"AM/FM Radio, Air Conditioning, CD Player, Key...","Jul 12, 2020",https://www.pakwheels.com/used-cars/toyota-cor...


### Name Of The Car
Since the company that makes the vehicle heavily affects the price, we need it to make accurate predictions. However, we only need the company and not the rest of the car's name, so we extract it into a separate column.

In [4]:
# The make is the first space-delimited token of the listing name;
# a single split (n=1) is enough because only that first token is kept.
df['Make'] = df['Name'].str.split(' ', n=1).str.get(0)

In [5]:
# Remove the 'Name' column, then show the first two rows of the result.
df = df.drop(columns=['Name'])
df.head(n=2)

Unnamed: 0,Ad No,Price,Model Year,Location,Mileage,Registered City,Engine Type,Engine Capacity,Transmission,Color,Assembly,Body Type,Features,Last Updated,URL,Make
0,4096758,2385000.0,2017,"G- 8, Islamabad Islamabad",9869,Un-Registered,Petrol,1000 cc,Automatic,Silver,Imported,Hatchback,"ABS, AM/FM Radio, Air Bags, Air Conditioning,...","Jul 11, 2020",https://www.pakwheels.com/used-cars/toyota-vit...,Toyota
1,4168305,111000.0,2019,Peshawar KPK,11111,Islamabad,Petrol,1300 cc,Automatic,White,Local,Sedan,"ABS, AM/FM Radio, Air Bags, Air Conditioning,...","Jul 12, 2020",https://www.pakwheels.com/used-cars/toyota-cor...,Toyota


### Ad No
The ad number of the vehicle has no effect on the price, hence we remove this column.

In [6]:
# Remove the 'Ad No' column, then show the first two rows of the result.
df = df.drop(columns=['Ad No'])
df.head(n=2)

Unnamed: 0,Price,Model Year,Location,Mileage,Registered City,Engine Type,Engine Capacity,Transmission,Color,Assembly,Body Type,Features,Last Updated,URL,Make
0,2385000.0,2017,"G- 8, Islamabad Islamabad",9869,Un-Registered,Petrol,1000 cc,Automatic,Silver,Imported,Hatchback,"ABS, AM/FM Radio, Air Bags, Air Conditioning,...","Jul 11, 2020",https://www.pakwheels.com/used-cars/toyota-vit...,Toyota
1,111000.0,2019,Peshawar KPK,11111,Islamabad,Petrol,1300 cc,Automatic,White,Local,Sedan,"ABS, AM/FM Radio, Air Bags, Air Conditioning,...","Jul 12, 2020",https://www.pakwheels.com/used-cars/toyota-cor...,Toyota


### Location
We don't need the Location column, as it only tells us where the ad was posted from and is therefore irrelevant, so we will discard it. The Registered City column is more useful instead.

In [7]:
# Remove the 'Location' column, then show the first two rows of the result.
df = df.drop(columns=['Location'])
df.head(n=2)

Unnamed: 0,Price,Model Year,Mileage,Registered City,Engine Type,Engine Capacity,Transmission,Color,Assembly,Body Type,Features,Last Updated,URL,Make
0,2385000.0,2017,9869,Un-Registered,Petrol,1000 cc,Automatic,Silver,Imported,Hatchback,"ABS, AM/FM Radio, Air Bags, Air Conditioning,...","Jul 11, 2020",https://www.pakwheels.com/used-cars/toyota-vit...,Toyota
1,111000.0,2019,11111,Islamabad,Petrol,1300 cc,Automatic,White,Local,Sedan,"ABS, AM/FM Radio, Air Bags, Air Conditioning,...","Jul 12, 2020",https://www.pakwheels.com/used-cars/toyota-cor...,Toyota


### URL
The URL column holds the URL of the posted ad and is irrelevant, hence we discard it.

In [8]:
# Remove the 'URL' column, then show the first two rows of the result.
df = df.drop(columns=['URL'])
df.head(n=2)

Unnamed: 0,Price,Model Year,Mileage,Registered City,Engine Type,Engine Capacity,Transmission,Color,Assembly,Body Type,Features,Last Updated,Make
0,2385000.0,2017,9869,Un-Registered,Petrol,1000 cc,Automatic,Silver,Imported,Hatchback,"ABS, AM/FM Radio, Air Bags, Air Conditioning,...","Jul 11, 2020",Toyota
1,111000.0,2019,11111,Islamabad,Petrol,1300 cc,Automatic,White,Local,Sedan,"ABS, AM/FM Radio, Air Bags, Air Conditioning,...","Jul 12, 2020",Toyota


### Last Updated
This column records when the ad was last updated and does not affect the price, so we discard it.

In [9]:
# Remove the 'Last Updated' column, then show the first two rows of the result.
df = df.drop(columns=['Last Updated'])
df.head(n=2)

Unnamed: 0,Price,Model Year,Mileage,Registered City,Engine Type,Engine Capacity,Transmission,Color,Assembly,Body Type,Features,Make
0,2385000.0,2017,9869,Un-Registered,Petrol,1000 cc,Automatic,Silver,Imported,Hatchback,"ABS, AM/FM Radio, Air Bags, Air Conditioning,...",Toyota
1,111000.0,2019,11111,Islamabad,Petrol,1300 cc,Automatic,White,Local,Sedan,"ABS, AM/FM Radio, Air Bags, Air Conditioning,...",Toyota


### Features
This column lists the features the vehicle is equipped with and has minimal effect on the price of the vehicle, hence we will discard it.

In [13]:
# Remove the 'Features' column, then show the first two rows of the result.
df = df.drop(columns=['Features'])
df.head(n=2)

Unnamed: 0,Price,Model Year,Mileage,Registered City,Engine Type,Engine Capacity,Transmission,Color,Assembly,Body Type,Make
0,2385000.0,2017,9869,Un-Registered,Petrol,1000 cc,Automatic,Silver,Imported,Hatchback,Toyota
1,111000.0,2019,11111,Islamabad,Petrol,1300 cc,Automatic,White,Local,Sedan,Toyota
