In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [3]:
car_data=pd.read_csv('Cardetails.csv')

In [4]:
car_data.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,torque,seats
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4 kmpl,1248 CC,74 bhp,190Nm@ 2000rpm,5.0
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14 kmpl,1498 CC,103.52 bhp,250Nm@ 1500-2500rpm,5.0
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7 kmpl,1497 CC,78 bhp,"12.7@ 2,700(kgm@ rpm)",5.0
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,23.0 kmpl,1396 CC,90 bhp,22.4 kgm at 1750-2750rpm,5.0
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,16.1 kmpl,1298 CC,88.2 bhp,"11.5@ 4,500(kgm@ rpm)",5.0


In [5]:
car_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8128 entries, 0 to 8127
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   name           8128 non-null   object 
 1   year           8128 non-null   int64  
 2   selling_price  8128 non-null   int64  
 3   km_driven      8128 non-null   int64  
 4   fuel           8128 non-null   object 
 5   seller_type    8128 non-null   object 
 6   transmission   8128 non-null   object 
 7   owner          8128 non-null   object 
 8   mileage        7907 non-null   object 
 9   engine         7907 non-null   object 
 10  max_power      7913 non-null   object 
 11  torque         7906 non-null   object 
 12  seats          7907 non-null   float64
dtypes: float64(1), int64(3), object(9)
memory usage: 825.6+ KB


In [6]:
car_data.drop(columns=['torque'],inplace= True)

In [7]:
car_data.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,seats
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4 kmpl,1248 CC,74 bhp,5.0
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14 kmpl,1498 CC,103.52 bhp,5.0
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7 kmpl,1497 CC,78 bhp,5.0
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,23.0 kmpl,1396 CC,90 bhp,5.0
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,16.1 kmpl,1298 CC,88.2 bhp,5.0


In [8]:
car_data.shape

(8128, 12)

### Data Preprocessing

#### Null check

In [9]:
car_data.isna().sum()

name               0
year               0
selling_price      0
km_driven          0
fuel               0
seller_type        0
transmission       0
owner              0
mileage          221
engine           221
max_power        215
seats            221
dtype: int64

#### Drop Null Records

In [10]:
car_data.dropna(inplace=True)

In [11]:
car_data.shape

(7907, 12)

#### Duplicate Check

In [12]:
car_data.duplicated().sum() #get number of duplicate records

1189

In [13]:
car_data.drop_duplicates(inplace=True) #drop duplicate records

In [14]:
car_data.shape

(6718, 12)

In [15]:
car_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6718 entries, 0 to 8125
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   name           6718 non-null   object 
 1   year           6718 non-null   int64  
 2   selling_price  6718 non-null   int64  
 3   km_driven      6718 non-null   int64  
 4   fuel           6718 non-null   object 
 5   seller_type    6718 non-null   object 
 6   transmission   6718 non-null   object 
 7   owner          6718 non-null   object 
 8   mileage        6718 non-null   object 
 9   engine         6718 non-null   object 
 10  max_power      6718 non-null   object 
 11  seats          6718 non-null   float64
dtypes: float64(1), int64(3), object(8)
memory usage: 682.3+ KB


#### Data Analysis

In [16]:
for col in car_data.columns:
    print("unique values of "+col)
    print(car_data[col].unique)
    print("===========================\n")

unique values of name
<bound method Series.unique of 0                  Maruti Swift Dzire VDI
1            Skoda Rapid 1.5 TDI Ambition
2                Honda City 2017-2020 EXi
3               Hyundai i20 Sportz Diesel
4                  Maruti Swift VXI BSIII
                      ...                
8121    Maruti Wagon R VXI BS IV with ABS
8122           Hyundai i20 Magna 1.4 CRDi
8123                    Hyundai i20 Magna
8124                Hyundai Verna CRDi SX
8125               Maruti Swift Dzire ZDi
Name: name, Length: 6718, dtype: object>

unique values of year
<bound method Series.unique of 0       2014
1       2014
2       2006
3       2010
4       2007
        ... 
8121    2013
8122    2014
8123    2013
8124    2007
8125    2009
Name: year, Length: 6718, dtype: int64>

unique values of selling_price
<bound method Series.unique of 0       450000
1       370000
2       158000
3       225000
4       130000
         ...  
8121    260000
8122    475000
8123    320000
8124    1

In [17]:
def get_brand_name(car_name):
    car_name=car_name.split(" ")[0]
    return car_name.strip(" ")

In [18]:
get_brand_name("Maruti Swift  Dzire  VDI")

'Maruti'

In [19]:
car_data['name']=car_data['name'].apply(get_brand_name)

In [20]:
car_data['name'].unique()

array(['Maruti', 'Skoda', 'Honda', 'Hyundai', 'Toyota', 'Ford', 'Renault',
       'Mahindra', 'Tata', 'Chevrolet', 'Datsun', 'Jeep', 'Mercedes-Benz',
       'Mitsubishi', 'Audi', 'Volkswagen', 'BMW', 'Nissan', 'Lexus',
       'Jaguar', 'Land', 'MG', 'Volvo', 'Daewoo', 'Kia', 'Fiat', 'Force',
       'Ambassador', 'Ashok', 'Isuzu', 'Opel'], dtype=object)

In [21]:
car_data['mileage']=car_data['mileage'].apply(get_brand_name)

In [22]:
car_data['max_power']=car_data['max_power'].apply(get_brand_name)

In [23]:
car_data['engine']=car_data['engine'].apply(get_brand_name)

In [24]:
for col in car_data.columns:
    print("unique values of "+col)
    print(car_data[col].unique)
    print("===========================\n")

unique values of name
<bound method Series.unique of 0        Maruti
1         Skoda
2         Honda
3       Hyundai
4        Maruti
         ...   
8121     Maruti
8122    Hyundai
8123    Hyundai
8124    Hyundai
8125     Maruti
Name: name, Length: 6718, dtype: object>

unique values of year
<bound method Series.unique of 0       2014
1       2014
2       2006
3       2010
4       2007
        ... 
8121    2013
8122    2014
8123    2013
8124    2007
8125    2009
Name: year, Length: 6718, dtype: int64>

unique values of selling_price
<bound method Series.unique of 0       450000
1       370000
2       158000
3       225000
4       130000
         ...  
8121    260000
8122    475000
8123    320000
8124    135000
8125    382000
Name: selling_price, Length: 6718, dtype: int64>

unique values of km_driven
<bound method Series.unique of 0       145500
1       120000
2       140000
3       127000
4       120000
         ...  
8121     50000
8122     80000
8123    110000
8124    119000
8125   

In [25]:
car_data['name'].replace(['Maruti', 'Skoda', 'Honda', 'Hyundai', 'Toyota', 'Ford', 'Renault',
       'Mahindra', 'Tata', 'Chevrolet', 'Datsun', 'Jeep', 'Mercedes-Benz',
       'Mitsubishi', 'Audi', 'Volkswagen', 'BMW', 'Nissan', 'Lexus',
       'Jaguar', 'Land', 'MG', 'Volvo', 'Daewoo', 'Kia', 'Fiat', 'Force',
       'Ambassador', 'Ashok', 'Isuzu', 'Opel'],[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31], inplace= True)

In [26]:
car_data['transmission'].unique()

array(['Manual', 'Automatic'], dtype=object)

In [27]:
car_data['transmission'].replace(['Manual','Automatic'],[1,2],inplace=True)

In [28]:
car_data['owner']

0                First Owner
1               Second Owner
2                Third Owner
3                First Owner
4                First Owner
                ...         
8121            Second Owner
8122            Second Owner
8123             First Owner
8124    Fourth & Above Owner
8125             First Owner
Name: owner, Length: 6718, dtype: object

In [29]:
car_data['seller_type'].unique()

array(['Individual', 'Dealer', 'Trustmark Dealer'], dtype=object)

In [30]:
car_data['seller_type'].replace(['Individual', 'Dealer', 'Trustmark Dealer'],[1,2,3],inplace=True)

In [31]:
car_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6718 entries, 0 to 8125
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   name           6718 non-null   int64  
 1   year           6718 non-null   int64  
 2   selling_price  6718 non-null   int64  
 3   km_driven      6718 non-null   int64  
 4   fuel           6718 non-null   object 
 5   seller_type    6718 non-null   int64  
 6   transmission   6718 non-null   int64  
 7   owner          6718 non-null   object 
 8   mileage        6718 non-null   object 
 9   engine         6718 non-null   object 
 10  max_power      6718 non-null   object 
 11  seats          6718 non-null   float64
dtypes: float64(1), int64(6), object(5)
memory usage: 682.3+ KB


In [32]:
car_data['fuel'].unique()

array(['Diesel', 'Petrol', 'LPG', 'CNG'], dtype=object)

In [33]:
car_data['fuel'].replace(['Diesel', 'Petrol', 'LPG', 'CNG'],[1,2,3,4],inplace=True)

In [34]:
car_data['owner'].unique()

array(['First Owner', 'Second Owner', 'Third Owner',
       'Fourth & Above Owner', 'Test Drive Car'], dtype=object)

In [43]:
car_data['owner'].replace(['First Owner', 'Second Owner', 'Third Owner', 'Fourth & Above Owner','Test Drive Car'],[1,2,3,4,5],inplace=True)

In [35]:
car_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6718 entries, 0 to 8125
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   name           6718 non-null   int64  
 1   year           6718 non-null   int64  
 2   selling_price  6718 non-null   int64  
 3   km_driven      6718 non-null   int64  
 4   fuel           6718 non-null   int64  
 5   seller_type    6718 non-null   int64  
 6   transmission   6718 non-null   int64  
 7   owner          6718 non-null   object 
 8   mileage        6718 non-null   object 
 9   engine         6718 non-null   object 
 10  max_power      6718 non-null   object 
 11  seats          6718 non-null   float64
dtypes: float64(1), int64(7), object(4)
memory usage: 682.3+ KB


In [36]:
car_data.reset_index(inplace=True)

In [37]:
car_data['max_power'] = car_data['max_power'].str.extract('(\d+)').astype(float)

#### converting string to float max_power column

In [38]:
car_data['max_power'] = car_data['max_power'].astype(float)

In [None]:
car_data['owner'] = car_data['owner'].astype(float)

In [39]:
car_data


Unnamed: 0,index,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,seats
0,0,1,2014,450000,145500,1,1,1,First Owner,23.4,1248,74.0,5.0
1,1,2,2014,370000,120000,1,1,1,Second Owner,21.14,1498,103.0,5.0
2,2,3,2006,158000,140000,2,1,1,Third Owner,17.7,1497,78.0,5.0
3,3,4,2010,225000,127000,1,1,1,First Owner,23.0,1396,90.0,5.0
4,4,1,2007,130000,120000,2,1,1,First Owner,16.1,1298,88.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6713,8121,1,2013,260000,50000,2,1,1,Second Owner,18.9,998,67.0,5.0
6714,8122,4,2014,475000,80000,1,1,1,Second Owner,22.54,1396,88.0,5.0
6715,8123,4,2013,320000,110000,2,1,1,First Owner,18.5,1197,82.0,5.0
6716,8124,4,2007,135000,119000,1,1,1,Fourth & Above Owner,16.8,1493,110.0,5.0


In [40]:
car_data.drop(columns=['index'],inplace=True)

In [41]:
car_data.drop(columns=['level_0'],inplace=True)

KeyError: "['level_0'] not found in axis"

In [154]:
car_data

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,seats
0,1,2014,450000,145500,1,1,1,1,23.4,1248,74.0,5.0
1,2,2014,370000,120000,1,1,1,2,21.14,1498,103.0,5.0
2,3,2006,158000,140000,2,1,1,3,17.7,1497,78.0,5.0
3,4,2010,225000,127000,1,1,1,1,23.0,1396,90.0,5.0
4,1,2007,130000,120000,2,1,1,1,16.1,1298,88.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...
6713,1,2013,260000,50000,2,1,1,2,18.9,998,67.0,5.0
6714,4,2014,475000,80000,1,1,1,2,22.54,1396,88.0,5.0
6715,4,2013,320000,110000,2,1,1,1,18.5,1197,82.0,5.0
6716,4,2007,135000,119000,1,1,1,4,16.8,1493,110.0,5.0


In [156]:
input_data=car_data.drop(columns=['selling_price']) #drop selling price
output_data=car_data['selling_price'] #assigning selling price to output_data

In [165]:
x_train,x_shape,y_train,y_shape=train_test_split(input_data,output_data,test_size=0.2) #replaced test by shape ie x_shape

#### Model Creation

In [167]:
model=LinearRegression()

In [173]:
model.fit(x_train,y_train)

In [175]:
predict=model.predict(x_test)

In [177]:
 predict

array([256457.04916063, 588820.87317729, 171501.02901915, ...,
       464567.22223006, 230505.43246587, 416519.44706133])

In [181]:
x_train.head(1)

Unnamed: 0,name,year,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,seats
3369,4,2009,80000,2,1,1,2,17.92,1086,62.0,5.0


In [211]:
input_data_model=pd.DataFrame(
    [[4,2019,8000,2,1,1,2,17.92,1086,62.0,7.0]],
    columns=['name','year','km_driven','fuel','seller_type','transmission','owner','mileage','engine','max_power','seats'])

In [213]:
input_data_model

Unnamed: 0,name,year,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,seats
0,4,2019,8000,2,1,1,2,17.92,1086,62.0,7.0


In [215]:
model.predict(input_data_model) #testing/evaluationg the model

array([423220.74525474])

In [217]:
import pickle as pk

In [221]:
pk.dump(model,open('model.pkl','wb'))

In [42]:
car_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6718 entries, 0 to 6717
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   name           6718 non-null   int64  
 1   year           6718 non-null   int64  
 2   selling_price  6718 non-null   int64  
 3   km_driven      6718 non-null   int64  
 4   fuel           6718 non-null   int64  
 5   seller_type    6718 non-null   int64  
 6   transmission   6718 non-null   int64  
 7   owner          6718 non-null   object 
 8   mileage        6718 non-null   object 
 9   engine         6718 non-null   object 
 10  max_power      6717 non-null   float64
 11  seats          6718 non-null   float64
dtypes: float64(2), int64(7), object(3)
memory usage: 629.9+ KB
