### Importing Necessary libraries and loading the dataset into a dataframe named 'vehicles_df'

In [60]:
import pandas as pd
# Read the raw CSV, then wrap the result in a DataFrame named vehicles_df,
# which is the frame every later cell works with.
csv_path = 'Vehicles.csv'
vehicles = pd.read_csv(csv_path)
vehicles_df = pd.DataFrame(data=vehicles)

### Inspecting the data using .head(), .dtypes, and .info()

In [61]:
# Preview the first rows to sanity-check the columns that were loaded.
first_rows = vehicles_df.head()
print(first_rows)

                       name  year  selling_price  km_driven    fuel  \
0             Maruti 800 AC  2007          60000      70000  Petrol   
1  Maruti Wagon R LXI Minor  2007         135000      50000  Petrol   
2      Hyundai Verna 1.6 SX  2012         600000     100000  Diesel   
3    Datsun RediGO T Option  2017         250000      46000  Petrol   
4     Honda Amaze VX i-DTEC  2014         450000     141000  Diesel   

  seller_type transmission         owner  
0  Individual       Manual   First Owner  
1  Individual       Manual   First Owner  
2  Individual       Manual   First Owner  
3  Individual       Manual   First Owner  
4  Individual       Manual  Second Owner  


In [62]:
# Show the dtype pandas assigned to each column.
column_dtypes = vehicles_df.dtypes
print(column_dtypes)

name             object
year              int64
selling_price     int64
km_driven         int64
fuel             object
seller_type      object
transmission     object
owner            object
dtype: object


In [63]:
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only appends a stray "None" line after the report.
vehicles_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4340 entries, 0 to 4339
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   name           4340 non-null   object
 1   year           4340 non-null   int64 
 2   selling_price  4340 non-null   int64 
 3   km_driven      4340 non-null   int64 
 4   fuel           4340 non-null   object
 5   seller_type    4340 non-null   object
 6   transmission   4340 non-null   object
 7   owner          4340 non-null   object
dtypes: int64(3), object(5)
memory usage: 271.4+ KB
None


### Exploring the data

In [64]:
# .shape is a (row_count, column_count) pair for the frame.
frame_shape = vehicles_df.shape
rows = frame_shape[0]
columns = frame_shape[1]
print("Rows: ", rows, "Columns: ", columns)

Rows:  4340 Columns:  8


In [65]:
# Collect the distinct values that appear in the "owner" column.
owner_column = vehicles_df['owner']
unique_owners = owner_column.unique()
print('Unique values in the "owner" column: ', unique_owners)

Unique values in the "owner" column:  ['First Owner' 'Second Owner' 'Fourth & Above Owner' 'Third Owner'
 'Test Drive Car']


In [66]:
# Summary statistics of selling_price: mean, maximum, and minimum.
price_column = vehicles_df['selling_price']
price_mean = price_column.mean()
price_max = price_column.max()
price_min = price_column.min()
print(
    f"Mean of the Selling price: {price_mean} \n"
    f"Maximum Selling price: {price_max} \n"
    f"Minimum Selling price: {price_min}"
)

Mean of the Selling price: 504127.3117511521 
Maximum Selling price: 8900000 
Minimum Selling price: 20000


### Filtering the Data

##### Vehicle data for the year 2007

In [67]:
# Boolean mask selecting the rows whose model year is 2007.
is_year_2007 = vehicles_df['year'] == 2007
year_2007 = vehicles_df.loc[is_year_2007]
print(year_2007.head())

                        name  year  selling_price  km_driven    fuel  \
0              Maruti 800 AC  2007          60000      70000  Petrol   
1   Maruti Wagon R LXI Minor  2007         135000      50000  Petrol   
5       Maruti Alto LX BSIII  2007         140000     125000  Petrol   
13             Maruti 800 AC  2007          60000      70000  Petrol   
14  Maruti Wagon R LXI Minor  2007         135000      50000  Petrol   

   seller_type transmission        owner  
0   Individual       Manual  First Owner  
1   Individual       Manual  First Owner  
5   Individual       Manual  First Owner  
13  Individual       Manual  First Owner  
14  Individual       Manual  First Owner  


##### Vehicle data where the year is 2007 and the selling price is at least 500000 (5 lakhs)

In [68]:
# Inclusive lower bound on selling_price for this filter: 500000 = 5 lakhs
# (1 lakh = 100,000).
MIN_SELLING_PRICE = 500_000

# Rows from model year 2007 whose selling price is at or above the bound.
# NOTE(review): the variable name suggests a 50-lakh cut-off, but the filter
# actually applied is 5 lakhs; the name is kept so existing references to it
# keep working. The printed label below now states the real threshold.
yr_07_sp_50l = vehicles_df[
    (vehicles_df['year'] == 2007)
    & (vehicles_df['selling_price'] >= MIN_SELLING_PRICE)
]
print("Data on the year 2007 and where the selling price is at least 5 lakhs: \n", yr_07_sp_50l)

Data on the year 2007 and where the selling price is above 50 lakhs: 
                                            name  year  selling_price  \
1813  Mercedes-Benz New C-Class 200 CDI Classic  2007         699000   
2511                  Mitsubishi Montero 3.2 MT  2007         750000   
2983                        Tata New Safari 4X2  2007         550000   
3612              Mercedes-Benz E-Class 280 CDI  2007         900000   
4005                        Tata New Safari 4X2  2007         550000   

      km_driven    fuel seller_type transmission         owner  
1813     101849  Diesel      Dealer       Manual  Second Owner  
2511     180000  Diesel  Individual       Manual   First Owner  
2983      80000  Petrol  Individual       Manual  Second Owner  
3612      76731  Diesel      Dealer    Automatic   First Owner  
4005      80000  Petrol  Individual       Manual  Second Owner  


##### Data Manipulation

###### Using .rename() and renaming km_driven to kilometers_driven

In [69]:
# Rename km_driven -> kilometers_driven. The result is re-assigned rather than
# using inplace=True, so this cell rebinds vehicles_df to a new frame instead
# of mutating an object that earlier cells have already displayed.
vehicles_df = vehicles_df.rename(columns={'km_driven': 'kilometers_driven'})
print(vehicles_df.head())

                       name  year  selling_price  kilometers_driven    fuel  \
0             Maruti 800 AC  2007          60000              70000  Petrol   
1  Maruti Wagon R LXI Minor  2007         135000              50000  Petrol   
2      Hyundai Verna 1.6 SX  2012         600000             100000  Diesel   
3    Datsun RediGO T Option  2017         250000              46000  Petrol   
4     Honda Amaze VX i-DTEC  2014         450000             141000  Diesel   

  seller_type transmission         owner  
0  Individual       Manual   First Owner  
1  Individual       Manual   First Owner  
2  Individual       Manual   First Owner  
3  Individual       Manual   First Owner  
4  Individual       Manual  Second Owner  


##### Sorting the data in ascending order in terms of kilometers_driven

In [70]:
# Order the rows from lowest to highest kilometers_driven; vehicles_df itself
# is left unsorted because sort_values returns a new frame here.
sorted_by_km = vehicles_df.sort_values(by='kilometers_driven', ascending=True)
print(sorted_by_km)

                                             name  year  selling_price  \
1312                           Mahindra Quanto C6  2014         250000   
1714               Ford Freestyle Titanium Diesel  2020         784000   
1716            Ford Ecosport 1.5 Diesel Titanium  2020        1000000   
1715                           Ford Figo Titanium  2020         635000   
2236               Renault Duster 85PS Diesel RxL  2013         450000   
...                                           ...   ...            ...   
2394          Toyota Innova 2.5 V Diesel 8-seater  2009         350000   
3679  Toyota Innova 2.5 G (Diesel) 7 Seater BS IV  2006         400000   
525             Maruti SX4 S Cross DDiS 320 Delta  2016         665000   
4184            Maruti SX4 S Cross DDiS 320 Delta  2016         665000   
1243                       Maruti Swift VXI BSIII  2009         250000   

      kilometers_driven    fuel seller_type transmission           owner  
1312                  1  Diesel  Ind

##### Encoding the categorical (object) columns: ordinal encoding with scikit-learn and one-hot encoding (OHE) with pandas

In [78]:
from sklearn.preprocessing import OrdinalEncoder

# Explicit category order per column: a value's position in its list is the
# numeric code it receives (e.g. 'Petrol' -> 0.0, 'Diesel' -> 1.0).
fuel_order = ['Petrol', 'Diesel', 'Electric', 'LPG', 'CNG']
seller_type_order = ['Dealer', 'Individual', 'Trustmark Dealer']
owner_order = ['First Owner', 'Second Owner', 'Third Owner', 'Fourth & Above Owner', 'Test Drive Car']

# One encoder per column; each encoded result is stored in a new *_encoded
# column next to the original text column.
encoder = OrdinalEncoder(categories=[fuel_order])
fuel_codes = encoder.fit_transform(vehicles_df[['fuel']])
vehicles_df['fuel_encoded'] = fuel_codes

enc = OrdinalEncoder(categories=[seller_type_order])
seller_type_codes = enc.fit_transform(vehicles_df[['seller_type']])
vehicles_df['sellertype_encoded'] = seller_type_codes

enco = OrdinalEncoder(categories=[owner_order])
owner_codes = enco.fit_transform(vehicles_df[['owner']])
vehicles_df['owner_encoded'] = owner_codes

vehicles_df.head()

Unnamed: 0,name,year,selling_price,kilometers_driven,fuel,seller_type,transmission,owner,fuel_encoded,sellertype_encoded,owner_encoded
0,Maruti 800 AC,2007,60000,70000,Petrol,Individual,Manual,First Owner,0.0,1.0,0.0
1,Maruti Wagon R LXI Minor,2007,135000,50000,Petrol,Individual,Manual,First Owner,0.0,1.0,0.0
2,Hyundai Verna 1.6 SX,2012,600000,100000,Diesel,Individual,Manual,First Owner,1.0,1.0,0.0
3,Datsun RediGO T Option,2017,250000,46000,Petrol,Individual,Manual,First Owner,0.0,1.0,0.0
4,Honda Amaze VX i-DTEC,2014,450000,141000,Diesel,Individual,Manual,Second Owner,1.0,1.0,1.0


In [79]:
# One-hot encode "transmission" into boolean indicator columns; every other
# column of vehicles_df is carried over unchanged.
# NOTE(review): drop_first=False keeps both transmission_Automatic and
# transmission_Manual, which are exact complements of each other in the rows
# shown. Consider drop_first=True if the redundancy matters for the model.
categorical_col = pd.get_dummies(
    data=vehicles_df,
    columns=['transmission'],
    drop_first=False,
)
categorical_col

Unnamed: 0,name,year,selling_price,kilometers_driven,fuel,seller_type,owner,fuel_encoded,sellertype_encoded,owner_encoded,transmission_Automatic,transmission_Manual
0,Maruti 800 AC,2007,60000,70000,Petrol,Individual,First Owner,0.0,1.0,0.0,False,True
1,Maruti Wagon R LXI Minor,2007,135000,50000,Petrol,Individual,First Owner,0.0,1.0,0.0,False,True
2,Hyundai Verna 1.6 SX,2012,600000,100000,Diesel,Individual,First Owner,1.0,1.0,0.0,False,True
3,Datsun RediGO T Option,2017,250000,46000,Petrol,Individual,First Owner,0.0,1.0,0.0,False,True
4,Honda Amaze VX i-DTEC,2014,450000,141000,Diesel,Individual,Second Owner,1.0,1.0,1.0,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...
4335,Hyundai i20 Magna 1.4 CRDi (Diesel),2014,409999,80000,Diesel,Individual,Second Owner,1.0,1.0,1.0,False,True
4336,Hyundai i20 Magna 1.4 CRDi,2014,409999,80000,Diesel,Individual,Second Owner,1.0,1.0,1.0,False,True
4337,Maruti 800 AC BSIII,2009,110000,83000,Petrol,Individual,Second Owner,0.0,1.0,1.0,False,True
4338,Hyundai Creta 1.6 CRDi SX Option,2016,865000,90000,Diesel,Individual,First Owner,1.0,1.0,0.0,False,True


In [80]:
from sklearn.model_selection import train_test_split

# Columns that are not model inputs: the target itself, the free-text name,
# and the text columns that already have an encoded counterpart.
non_feature_columns = ['name', 'selling_price', 'fuel', 'seller_type', 'owner']

# Feature matrix and target vector.
x = categorical_col.drop(columns=non_feature_columns)
y = vehicles_df['selling_price']

# Split into training and validation sets; random_state fixes the shuffle so
# the split is the same on every run.
train_x, val_x, train_y, val_y = train_test_split(x, y, random_state=1)

In [81]:
# Display the validation-split feature frame produced by train_test_split.
val_x

Unnamed: 0,year,kilometers_driven,fuel_encoded,sellertype_encoded,owner_encoded,transmission_Automatic,transmission_Manual
2761,2019,15000,1.0,1.0,0.0,False,True
3210,2014,70000,1.0,1.0,1.0,False,True
2606,2012,90000,1.0,1.0,0.0,False,True
1030,2016,41000,0.0,0.0,0.0,False,True
3942,2014,71318,1.0,0.0,0.0,False,True
...,...,...,...,...,...,...,...
4086,2008,120000,1.0,1.0,1.0,False,True
297,2015,70000,0.0,1.0,0.0,False,True
527,2017,41000,1.0,1.0,0.0,True,False
442,2011,15000,0.0,1.0,0.0,False,True


##### Importing Linear regression model and working with it

In [82]:
from sklearn.linear_model import LinearRegression

In [83]:
# Fit an ordinary least-squares regression on the training split; fit()
# returns the fitted estimator, so construction and training are chained.
model = LinearRegression().fit(train_x, train_y)

# Predicted selling prices for the validation-split features.
pred = model.predict(val_x)
print(pred)

[ 752118.43349756  501985.93917777  429013.34178609 ... 1529721.58872584
  264558.92235866  544811.8706113 ]


##### R² score of the model on the validation set

In [84]:
from sklearn.metrics import r2_score

# Coefficient of determination of the predictions against the validation
# targets.
r2 = r2_score(y_true=val_y, y_pred=pred)
print(r2)

0.43322731780192736
