In [66]:
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
from datasist.structdata import detect_outliers
from category_encoders import OneHotEncoder, BinaryEncoder
from sklearn.preprocessing import MinMaxScaler

In [2]:
df = pd.read_csv('train.csv')

In [3]:
df.head()

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price,Price
0,Maruti Wagon R LXI CNG,Mumbai,2010,72000,CNG,Manual,First,26.6 km/kg,998 CC,58.16 bhp,5.0,,1.75
1,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,Diesel,Manual,First,19.67 kmpl,1582 CC,126.2 bhp,5.0,,12.5
2,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,18.2 kmpl,1199 CC,88.7 bhp,5.0,8.61 Lakh,4.5
3,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,20.77 kmpl,1248 CC,88.76 bhp,7.0,,6.0
4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2 kmpl,1968 CC,140.8 bhp,5.0,,17.74


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6019 entries, 0 to 6018
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Name               6019 non-null   object 
 1   Location           6019 non-null   object 
 2   Year               6019 non-null   int64  
 3   Kilometers_Driven  6019 non-null   int64  
 4   Fuel_Type          6019 non-null   object 
 5   Transmission       6019 non-null   object 
 6   Owner_Type         6019 non-null   object 
 7   Mileage            6017 non-null   object 
 8   Engine             5983 non-null   object 
 9   Power              5983 non-null   object 
 10  Seats              5977 non-null   float64
 11  New_Price          824 non-null    object 
 12  Price              6019 non-null   float64
dtypes: float64(2), int64(2), object(9)
memory usage: 611.4+ KB


In [5]:
df.isna().sum()

Name                    0
Location                0
Year                    0
Kilometers_Driven       0
Fuel_Type               0
Transmission            0
Owner_Type              0
Mileage                 2
Engine                 36
Power                  36
Seats                  42
New_Price            5195
Price                   0
dtype: int64

In [6]:
df.drop(columns = 'New_Price',inplace = True)

In [7]:
df.dropna(inplace = True)

In [8]:
df.isna().sum()

Name                 0
Location             0
Year                 0
Kilometers_Driven    0
Fuel_Type            0
Transmission         0
Owner_Type           0
Mileage              0
Engine               0
Power                0
Seats                0
Price                0
dtype: int64

In [9]:
df.head()

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price
0,Maruti Wagon R LXI CNG,Mumbai,2010,72000,CNG,Manual,First,26.6 km/kg,998 CC,58.16 bhp,5.0,1.75
1,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,Diesel,Manual,First,19.67 kmpl,1582 CC,126.2 bhp,5.0,12.5
2,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,18.2 kmpl,1199 CC,88.7 bhp,5.0,4.5
3,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,20.77 kmpl,1248 CC,88.76 bhp,7.0,6.0
4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2 kmpl,1968 CC,140.8 bhp,5.0,17.74


### Numerical data

In [10]:
df.columns

Index(['Name', 'Location', 'Year', 'Kilometers_Driven', 'Fuel_Type',
       'Transmission', 'Owner_Type', 'Mileage', 'Engine', 'Power', 'Seats',
       'Price'],
      dtype='object')

In [11]:
numerical_data = df[['Year','Kilometers_Driven','Mileage','Engine','Power','Seats']].copy(deep = True)

In [12]:
numerical_data

Unnamed: 0,Year,Kilometers_Driven,Mileage,Engine,Power,Seats
0,2010,72000,26.6 km/kg,998 CC,58.16 bhp,5.0
1,2015,41000,19.67 kmpl,1582 CC,126.2 bhp,5.0
2,2011,46000,18.2 kmpl,1199 CC,88.7 bhp,5.0
3,2012,87000,20.77 kmpl,1248 CC,88.76 bhp,7.0
4,2013,40670,15.2 kmpl,1968 CC,140.8 bhp,5.0
...,...,...,...,...,...,...
6014,2014,27365,28.4 kmpl,1248 CC,74 bhp,5.0
6015,2015,100000,24.4 kmpl,1120 CC,71 bhp,5.0
6016,2012,55000,14.0 kmpl,2498 CC,112 bhp,8.0
6017,2013,46000,18.9 kmpl,998 CC,67.1 bhp,5.0


In [13]:
numerical_data.columns

Index(['Year', 'Kilometers_Driven', 'Mileage', 'Engine', 'Power', 'Seats'], dtype='object')

In [14]:
def handle_numirec(df):
    Mileage = numerical_data['Mileage'].str.split(expand = True)[0]
    engine = numerical_data['Engine'].str.split(expand = True)[0]
    power = numerical_data['Power'].str.split(expand = True)[0]
    return pd.DataFrame([Mileage,engine,power])

In [15]:
# numerical_data.apply(handle_numirec,axis = 1)

In [16]:
names = ['Mileage','Engine','Power']
for i in names:
    try:
        numerical_data[i] = numerical_data[i].str.split(expand = True)[0].astype(float)
    except:
        numerical_data[i] = numerical_data[i].str.split(expand = True)[0]
        

In [17]:
numerical_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5975 entries, 0 to 6018
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Year               5975 non-null   int64  
 1   Kilometers_Driven  5975 non-null   int64  
 2   Mileage            5975 non-null   float64
 3   Engine             5975 non-null   float64
 4   Power              5975 non-null   object 
 5   Seats              5975 non-null   float64
dtypes: float64(3), int64(2), object(1)
memory usage: 326.8+ KB


In [18]:
power_index = numerical_data['Power'][numerical_data.Power.str.isalpha()].index

In [19]:
numerical_data.drop(power_index,inplace = True)
numerical_data.reset_index(drop = True,inplace = True)

In [20]:
df.drop(power_index,inplace = True)
df.reset_index(drop = True,inplace = True)

In [21]:
df.drop(columns=names,inplace = True)

In [22]:
numerical_data['Power']= numerical_data.Power.astype(float)

In [23]:
numerical_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5872 entries, 0 to 5871
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Year               5872 non-null   int64  
 1   Kilometers_Driven  5872 non-null   int64  
 2   Mileage            5872 non-null   float64
 3   Engine             5872 non-null   float64
 4   Power              5872 non-null   float64
 5   Seats              5872 non-null   float64
dtypes: float64(4), int64(2)
memory usage: 275.4 KB


In [24]:
index_mileage = detect_outliers(numerical_data,n =0 ,features=['Mileage'])
numerical_data.drop(index_mileage,inplace = True)

In [25]:
numerical_data.reset_index(drop = True,inplace = True)

In [26]:
df.drop(index_mileage,inplace = True)
df.reset_index(drop = True,inplace = True)

In [27]:
df['Name'].str.split(' ',expand= True)[0]

0          Maruti
1         Hyundai
2           Honda
3          Maruti
4            Audi
          ...    
5824       Maruti
5825      Hyundai
5826     Mahindra
5827       Maruti
5828    Chevrolet
Name: 0, Length: 5829, dtype: object

In [28]:
df['Name']= df['Name'].apply(lambda x:x.split(' ')[0].title())


In [29]:
df['Name'].unique()

array(['Maruti', 'Hyundai', 'Honda', 'Audi', 'Nissan', 'Toyota',
       'Volkswagen', 'Tata', 'Land', 'Mitsubishi', 'Renault',
       'Mercedes-Benz', 'Bmw', 'Mahindra', 'Ford', 'Porsche', 'Datsun',
       'Jaguar', 'Volvo', 'Chevrolet', 'Skoda', 'Mini', 'Fiat', 'Jeep',
       'Ambassador', 'Isuzu', 'Force', 'Bentley'], dtype=object)

In [30]:
cat_df = df.select_dtypes('object')

In [31]:
bin_enc = BinaryEncoder()
name_trans =bin_enc.fit_transform(df['Name'])

In [32]:
numerical_data

Unnamed: 0,Year,Kilometers_Driven,Mileage,Engine,Power,Seats
0,2010,72000,26.60,998.0,58.16,5.0
1,2015,41000,19.67,1582.0,126.20,5.0
2,2011,46000,18.20,1199.0,88.70,5.0
3,2012,87000,20.77,1248.0,88.76,7.0
4,2013,40670,15.20,1968.0,140.80,5.0
...,...,...,...,...,...,...
5824,2014,27365,28.40,1248.0,74.00,5.0
5825,2015,100000,24.40,1120.0,71.00,5.0
5826,2012,55000,14.00,2498.0,112.00,8.0
5827,2013,46000,18.90,998.0,67.10,5.0


In [33]:
df = pd.concat([df,numerical_data],axis = 1)

In [34]:
df.drop(columns=['Year','Kilometers_Driven','Seats'],inplace = True)

In [35]:
df['Fuel_Type'].unique()

array(['CNG', 'Diesel', 'Petrol', 'LPG'], dtype=object)

In [36]:
index_feul= df[(df['Fuel_Type'] == 'CNG') | (df['Fuel_Type'] == 'LPG')].index

In [37]:
df.drop(index_feul,inplace = True)
df.reset_index(drop = True,inplace = True)

In [38]:
df.head(1)

Unnamed: 0,Name,Location,Fuel_Type,Transmission,Owner_Type,Price,Mileage,Engine,Power
0,Hyundai,Pune,Diesel,Manual,First,12.5,19.67,1582.0,126.2


In [39]:
df['Owner_Type'].value_counts()

First             4765
Second             906
Third              100
Fourth & Above       7
Name: Owner_Type, dtype: int64

In [40]:
index_owner= df[df['Owner_Type'] == 'Fourth & Above'].index
df.drop(index_owner,inplace = True)
df.reset_index(drop = True,inplace = True)

In [41]:
Owner_Type_ordinal = {'First': 3,'Second': 2, 'Third': 1}
df['Owner_Type']= df['Owner_Type'].map(Owner_Type_ordinal)

In [42]:
Transmission_ordinal = {'Automatic': 2, 'Manual': 1}
df['Transmission']= df['Transmission'].map(Transmission_ordinal)

In [43]:
df['Fuel_Type'].value_counts()

Diesel    3130
Petrol    2641
Name: Fuel_Type, dtype: int64

In [44]:
fuel_ordinal = {'Diesel': 2, 'Petrol': 1}
df['Fuel_Type']= df['Fuel_Type'].map(fuel_ordinal)

In [45]:
df.isna().sum()

Name            0
Location        0
Fuel_Type       0
Transmission    0
Owner_Type      0
Price           0
Mileage         0
Engine          0
Power           0
dtype: int64

In [46]:
# df.drop(columns='Location',inplace = True)

In [47]:
bin_enc = BinaryEncoder()
name_trans =bin_enc.fit_transform(df['Name'])

In [48]:
ohe = OneHotEncoder()
ohe.fit(df['Location'])
loc_fuel = ohe.transform(df['Location'])
loc_fuel

Unnamed: 0,Location_1,Location_2,Location_3,Location_4,Location_5,Location_6,Location_7,Location_8,Location_9,Location_10,Location_11
0,1,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,0
3,0,0,1,0,0,0,0,0,0,0,0
4,0,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
5766,0,0,0,0,0,0,0,1,0,0,0
5767,0,0,0,1,0,0,0,0,0,0,0
5768,0,0,0,1,0,0,0,0,0,0,0
5769,0,0,0,0,0,0,1,0,0,0,0


In [49]:
final_df= pd.concat([name_trans,df,loc_fuel],axis=1)

In [50]:
ohe = OneHotEncoder()
ohe.fit(df['Location'])
loc_fuel = ohe.transform(df['Location'])
loc_fuel

Unnamed: 0,Location_1,Location_2,Location_3,Location_4,Location_5,Location_6,Location_7,Location_8,Location_9,Location_10,Location_11
0,1,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,0
3,0,0,1,0,0,0,0,0,0,0,0
4,0,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
5766,0,0,0,0,0,0,0,1,0,0,0
5767,0,0,0,1,0,0,0,0,0,0,0
5768,0,0,0,1,0,0,0,0,0,0,0
5769,0,0,0,0,0,0,1,0,0,0,0


In [51]:
final_df.head()

Unnamed: 0,Name_0,Name_1,Name_2,Name_3,Name_4,Name,Location,Fuel_Type,Transmission,Owner_Type,...,Location_2,Location_3,Location_4,Location_5,Location_6,Location_7,Location_8,Location_9,Location_10,Location_11
0,0,0,0,0,1,Hyundai,Pune,2,1,3,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,1,0,Honda,Chennai,1,1,3,...,1,0,0,0,0,0,0,0,0,0
2,0,0,0,1,1,Maruti,Chennai,2,1,3,...,1,0,0,0,0,0,0,0,0,0
3,0,0,1,0,0,Audi,Coimbatore,2,2,2,...,0,1,0,0,0,0,0,0,0,0
4,0,0,1,0,1,Nissan,Jaipur,2,1,3,...,0,0,1,0,0,0,0,0,0,0


In [52]:
final_df.drop(columns=['Name','Location'],inplace= True)

In [53]:
numerical_data

Unnamed: 0,Year,Kilometers_Driven,Mileage,Engine,Power,Seats
0,2010,72000,26.60,998.0,58.16,5.0
1,2015,41000,19.67,1582.0,126.20,5.0
2,2011,46000,18.20,1199.0,88.70,5.0
3,2012,87000,20.77,1248.0,88.76,7.0
4,2013,40670,15.20,1968.0,140.80,5.0
...,...,...,...,...,...,...
5824,2014,27365,28.40,1248.0,74.00,5.0
5825,2015,100000,24.40,1120.0,71.00,5.0
5826,2012,55000,14.00,2498.0,112.00,8.0
5827,2013,46000,18.90,998.0,67.10,5.0


In [56]:
final_df.isna().sum()

Name_0          0
Name_1          0
Name_2          0
Name_3          0
Name_4          0
Fuel_Type       0
Transmission    0
Owner_Type      0
Price           0
Mileage         0
Engine          0
Power           0
Location_1      0
Location_2      0
Location_3      0
Location_4      0
Location_5      0
Location_6      0
Location_7      0
Location_8      0
Location_9      0
Location_10     0
Location_11     0
dtype: int64

In [65]:
y = final_df['Price']
X = final_df.drop(columns = 'Price')

In [63]:
# ohe = OneHotEncoder()
# ohe.fit(final_df[['Location','Fuel_Type']])
# loc_fuel = ohe.transform(final_df[['Location','Fuel_Type']])