***
-------
# <span style='color:Purple'> Car Price Regression model with Scikit-Learn </span>

***
-------


## 1.Dataset

In [77]:
#importing libraris
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
import warnings 
warnings.filterwarnings('ignore')

In [2]:
plt.style.use('seaborn')

In [3]:
#importing dataset
Data = pd.read_csv("cardata.csv")
#converting data to a pandas dataframe
df = pd.DataFrame(Data)
#calling dataframe(df)
df

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,ritz,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0
1,sx4,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0
2,ciaz,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0
3,wagon r,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0
4,swift,2014,4.60,6.87,42450,Diesel,Dealer,Manual,0
...,...,...,...,...,...,...,...,...,...
296,city,2016,9.50,11.60,33988,Diesel,Dealer,Manual,0
297,brio,2015,4.00,5.90,60000,Petrol,Dealer,Manual,0
298,city,2009,3.35,11.00,87934,Petrol,Dealer,Manual,0
299,city,2017,11.50,12.50,9000,Diesel,Dealer,Manual,0


In this project we have a 301 * 9 dataset. this dataset is about cars and we are going to clean, analyze and try to make a model for it.
_____
Columns:
1. Car_Name: Name of the car
2. Year: year of the car when it was bought
3. Selling Price: Price at which the car is being sold (target)
4. Kms Driven:Number of kilometers the car is driven
5. Fuel Type: Fuel of the car
6. Seller Type: Seller Type
7. Transmission: Gear transmission of the car
8. Owner: Number of previous owners of the 
9. Present Price: Car price at the moment

In [4]:
#describing data for more details
df.describe(include='all')

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
count,301,301.0,301.0,301.0,301.0,301,301,301,301.0
unique,98,,,,,3,2,2,
top,city,,,,,Petrol,Dealer,Manual,
freq,26,,,,,239,195,261,
mean,,2013.627907,4.661296,7.628472,36947.20598,,,,0.043189
std,,2.891554,5.082812,8.644115,38886.883882,,,,0.247915
min,,2003.0,0.1,0.32,500.0,,,,0.0
25%,,2012.0,0.9,1.2,15000.0,,,,0.0
50%,,2014.0,3.6,6.4,32000.0,,,,0.0
75%,,2016.0,6.0,9.9,48767.0,,,,0.0


It seems that we don't have mising values and all values are positive 

In [5]:
# getting the number of missing values
df.isnull().sum()

Car_Name         0
Year             0
Selling_Price    0
Present_Price    0
Kms_Driven       0
Fuel_Type        0
Seller_Type      0
Transmission     0
Owner            0
dtype: int64

In [6]:
# knowing the type of the columns
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 301 entries, 0 to 300
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Car_Name       301 non-null    object 
 1   Year           301 non-null    int64  
 2   Selling_Price  301 non-null    float64
 3   Present_Price  301 non-null    float64
 4   Kms_Driven     301 non-null    int64  
 5   Fuel_Type      301 non-null    object 
 6   Seller_Type    301 non-null    object 
 7   Transmission   301 non-null    object 
 8   Owner          301 non-null    int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 21.3+ KB


In [7]:
df.columns

Index(['Car_Name', 'Year', 'Selling_Price', 'Present_Price', 'Kms_Driven',
       'Fuel_Type', 'Seller_Type', 'Transmission', 'Owner'],
      dtype='object')

In [8]:
df.drop(columns='Car_Name',axis='1',inplace=True)

## 2.Values and Columns

In [9]:
Column_list = ('Year', 'Selling_Price', 'Present_Price', 'Kms_Driven',
       'Fuel_Type', 'Seller_Type', 'Transmission', 'Owner')
for i in Column_list:
    print(f'Number of unique values for column: {i} are ({df[i].nunique()}) and values are {df[i].unique()},\n')

Number of unique values for column: Year are (16) and values are [2014 2013 2017 2011 2018 2015 2016 2009 2010 2012 2003 2008 2006 2005
 2004 2007],

Number of unique values for column: Selling_Price are (156) and values are [ 3.35  4.75  7.25  2.85  4.6   9.25  6.75  6.5   8.75  7.45  6.85  7.5
  6.1   2.25  7.75  3.25  2.65  4.9   4.4   2.5   2.9   3.    4.15  6.
  1.95  3.1   2.35  4.95  5.5   2.95  4.65  0.35  5.85  2.55  1.25  1.05
  5.8  14.9  23.   18.   16.    2.75  3.6   4.5   4.1  19.99  6.95 18.75
 23.5  33.   19.75  4.35 14.25  3.95  1.5   5.25 14.5  14.73 12.5   3.49
 35.    5.9   3.45  3.8  11.25  3.51  4.   20.75 17.    7.05  9.65  1.75
  1.7   1.65  1.45  1.35  1.2   1.15  1.11  1.1   1.    0.95  0.9   0.75
  0.8   0.78  0.72  0.65  0.6   0.55  0.52  0.51  0.5   0.48  0.45  0.42
  0.4   0.38  0.31  0.3   0.27  0.25  0.2   0.18  0.17  0.16  0.15  0.12
  0.1   5.75  5.15  7.9   4.85 11.75  3.15  6.45  3.5   8.25  5.11  2.7
  6.15 11.45  3.9   9.1   4.8   2.    5.35  6.25 

Instead of Year that car has been made, we want to use age of car

In [10]:
df['Year'].max()

2018

In [11]:
dfage = 2019 - df.Year # finding the age of the car
df.insert(0,'Age',dfage) # inserting values to dataframe

In [12]:
df.drop(columns='Year',axis=1,inplace=True) # droping the Year column
df

Unnamed: 0,Age,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,5,3.35,5.59,27000,Petrol,Dealer,Manual,0
1,6,4.75,9.54,43000,Diesel,Dealer,Manual,0
2,2,7.25,9.85,6900,Petrol,Dealer,Manual,0
3,8,2.85,4.15,5200,Petrol,Dealer,Manual,0
4,5,4.60,6.87,42450,Diesel,Dealer,Manual,0
...,...,...,...,...,...,...,...,...
296,3,9.50,11.60,33988,Diesel,Dealer,Manual,0
297,4,4.00,5.90,60000,Petrol,Dealer,Manual,0
298,10,3.35,11.00,87934,Petrol,Dealer,Manual,0
299,2,11.50,12.50,9000,Diesel,Dealer,Manual,0


In [13]:
df.sort_values(by='Age',inplace=True)
df

Unnamed: 0,Age,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
5,1,9.25,9.83,2071,Diesel,Dealer,Manual,0
64,2,33.00,36.23,6000,Diesel,Dealer,Automatic,0
128,2,0.80,0.87,3000,Petrol,Individual,Manual,0
220,2,3.50,4.43,38488,Petrol,Dealer,Manual,0
126,2,0.90,0.95,1300,Petrol,Individual,Manual,0
...,...,...,...,...,...,...,...,...
92,14,3.51,13.70,75000,Petrol,Dealer,Manual,0
84,14,3.49,13.46,197176,Diesel,Dealer,Manual,0
77,15,1.50,12.35,135154,Petrol,Dealer,Automatic,0
37,16,0.35,2.28,127000,Petrol,Individual,Manual,0


Now we have cars from 1 year old to 16 years old

In [14]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

In [15]:
df.columns

Index(['Age', 'Selling_Price', 'Present_Price', 'Kms_Driven', 'Fuel_Type',
       'Seller_Type', 'Transmission', 'Owner'],
      dtype='object')

In [16]:
Column_list = ('Age', 'Present_Price', 'Kms_Driven', 'Fuel_Type', 'Seller_Type', 'Transmission', 'Owner')

# Define the number of rows and columns
num_rows = 4
num_cols = 2

fig = make_subplots(rows=num_rows, cols=num_cols, subplot_titles=Column_list)

# Loop through each column and add a scatter plot to the corresponding subplot
for idx, col in enumerate(Column_list):
    row_num = (idx // num_cols) + 1  # Calculate the row number
    col_num = (idx % num_cols) + 1   # Calculate the column number

    fig.add_trace(
        go.Scatter(x=df[col], y=df['Selling_Price'],mode='markers'),
        row=row_num, col=col_num
    )

# Update layout and show the plot
fig.update_layout(height=800, width=1000, title_text="Side By Side Subplots: Featurs by Selling Price")
fig.show()


Some values are seem to be Noises but they are just outlier datas, so we are not gonna drop them

## 3.Ploting and Analysis

In [17]:
# more details about first 10 values
df.head(10)

Unnamed: 0,Age,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
5,1,9.25,9.83,2071,Diesel,Dealer,Manual,0
64,2,33.0,36.23,6000,Diesel,Dealer,Automatic,0
128,2,0.8,0.87,3000,Petrol,Individual,Manual,0
220,2,3.5,4.43,38488,Petrol,Dealer,Manual,0
126,2,0.9,0.95,1300,Petrol,Individual,Manual,0
52,2,18.0,19.77,15000,Diesel,Dealer,Automatic,0
27,2,6.0,6.49,16200,Petrol,Individual,Manual,0
265,2,6.5,8.7,21200,Petrol,Dealer,Manual,0
49,2,7.75,9.29,37000,Petrol,Dealer,Automatic,0
206,2,5.75,7.13,12479,Petrol,Dealer,Manual,0


1. Here cars are newer
2. Selling price differse between 0.80 to 33.00
3. Present price differse between 0.87 to 36.23
4. Car prices has rised since being sold (slling price and present price)
5. Dealers have sold cars in more prices than individuals
7. Most of the new cars are petrol (7 of 10) and rest are diesel
8. All of these cars are first hand
9. 2 of the 4 most valued cars are automatic

In [18]:
# more details about bottom 10 values
df.tail(10)

Unnamed: 0,Age,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
85,13,2.5,23.73,142000,Petrol,Individual,Automatic,3
47,13,1.05,4.15,65000,Petrol,Dealer,Manual,0
200,13,0.1,0.75,92233,Petrol,Individual,Manual,0
54,14,2.75,10.21,90000,Petrol,Individual,Manual,0
189,14,0.2,0.57,55000,Petrol,Individual,Manual,0
92,14,3.51,13.7,75000,Petrol,Dealer,Manual,0
84,14,3.49,13.46,197176,Diesel,Dealer,Manual,0
77,15,1.5,12.35,135154,Petrol,Dealer,Automatic,0
37,16,0.35,2.28,127000,Petrol,Individual,Manual,0
39,16,2.25,7.98,62000,Petrol,Dealer,Manual,0


1. here cars are older
2. selling price differse between 0.10 to 3.51
3. present price differse between 0.57 to 23.73
4. Car prices has rised since being sold (selling price and present price)
5. Dealers and individuals hsa sold equal number of cars.
7. most of the cars are petrol (9 of 10) and just one is diesel.
8. all of these cars are first hand exept one car that has been owend by 3 people
9. 2 of the 3 most valued cars are Manual	

In [19]:
def pplot(XC):
    fig2 = px.scatter(df, x=XC, y='Selling_Price',color = 'Present_Price',title=f'Selling Price by {XC}',template="plotly_dark", color_continuous_scale='Picnic') # setting the plot type
    fig2.update_traces(marker_size=4)
    fig2.show() # showing the plot


In [20]:
pplot('Age')

1. Newer cars price more than old ones
2. Newer cars price more when solled in general
3. Price of cars has got up since been sold but over all it gets down by time
4. Some of the least priced cars are the newer ones!
5. Max of selling price is 35 and the Min one has been sold for 0.1
6. Most new car is 1 year age and the oldest one is 16 years old
7. Selling price mean is 4.66
8. All cars after 8 years have been sold for less than mean of selling price
9. Mean of Age is 5.37
10. Cars befor Mean have a higher Selling and Present price

In [21]:
df['Selling_Price'].min()

0.1

In [22]:
df['Selling_Price'].mean()

4.661295681063123

In [23]:
df['Age'].mean()

5.372093023255814

In [24]:
fig3 = px.scatter(df, x='Present_Price', y='Selling_Price',color = 'Age',title=f'Selling Price by Present_Price',template="plotly_dark", color_continuous_scale='Picnic') # setting the plot type
fig3.update_traces(marker_size=4)
fig3.show() # showing the plot

1. Both charts are bullish, higher Selling price == Higher Present price
2. Present price max is 92.6 and min is 0.32
3. Present price mean is 7.62
4. Most of cars are placed below Present price mean
5. Most of cars are placed below Selling price mean
6. Cars with most value aged between 3 and 6

In [25]:
df['Present_Price'].min()

0.32

In [26]:
df['Present_Price'].mean()

7.628471760797343

In [27]:
pplot('Kms_Driven')

1. Cars with less value have been driven less
2. Most of cars have been driven less than 50,000 kms
3. Some of the highest values have been driven more than 70,000kms and 100,000kms
4. Most drived car has traveld 500,000 kms and has a selling price of 0.17
5. Lessn drived car has traveld 500kms
6. Mean of kms driven is 36,947.20
7. Best Sold price car has been driven 78,000kms
8. Worsed Sold price car hasz been driven 92,233kms

In [28]:
df['Kms_Driven'].mean()

36947.20598006644

In [29]:
pplot('Fuel_Type')

1. Most cars run with Petrol(239) and just (2) of them use CNG and Diesel with (60) places secend
2. Diesel cars sell with a higher price
3. Diesel crs maintain thire price better
3. Starting price for Diesel cars is 3.1 to 35
4. Starting price for Diesel cars is 0.1 to 19.76
5. Starting price for CNG cars is 2.95 to 3.25

In [30]:
df[df['Fuel_Type']== 'Petrol'].value_counts().sum()

239

In [31]:
df[df['Fuel_Type']== 'Diesel'].value_counts().sum()

60

In [32]:
df[df['Fuel_Type']== 'CNG'].value_counts().sum()

2

In [33]:
pplot('Seller_Type')

1. Dealers have sold more cars (195) than Individuals (106)
2. Dealers have sold cars in more price range
3. Dealers have sold more expensive cars
4. Dealers have sold more new cars than individuals

In [34]:
df[df['Seller_Type']== 'Individual'].value_counts().sum()

106

In [35]:
df[df['Seller_Type']== 'Dealer'].value_counts().sum()

195

In [36]:
pplot('Transmission')

1. There are more cars with manual Transmisson (261) than auto cars (40)
2. Auto cars have wider range of price but the highest one is a Manual car
3. Auto section has a better New/Old ratio
4. Age is more important for pricing in manual than auto

In [37]:
df[df['Transmission']== 'Automatic'].value_counts().sum()

40

In [38]:
df[df['Transmission']== 'Manual'].value_counts().sum()

261

In [39]:
pplot('Owner')

1. Most cars just had one owner
2. Just one car had 3 owner
3. Owner count seems to be important in price of car
4. More new cars are owned by just one owner

## 4.Model

Becasuse we are creating a Regression model we have to change string values to numeric values

In [40]:
df3 = df # creating a new data frame
df3['Fuel_Type'] = df3['Fuel_Type'].replace({'Diesel':2,'Petrol':3,'CNG':4}) # converting to numeric
df3['Seller_Type'] = df3['Seller_Type'].replace({'Dealer':2,'Individual':3}) # converting to numeric
df3['Transmission'] = df3['Transmission'].replace({'Manual':2,'Automatic':3}) # converting to numeric
df3

Unnamed: 0,Age,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
5,1,9.25,9.83,2071,2,2,2,0
64,2,33.00,36.23,6000,2,2,3,0
128,2,0.80,0.87,3000,3,3,2,0
220,2,3.50,4.43,38488,3,2,2,0
126,2,0.90,0.95,1300,3,3,2,0
...,...,...,...,...,...,...,...,...
92,14,3.51,13.70,75000,3,2,2,0
84,14,3.49,13.46,197176,2,2,2,0
77,15,1.50,12.35,135154,3,2,3,0
37,16,0.35,2.28,127000,3,3,2,0


In [41]:
# data type and missing values
df3.info()

<class 'pandas.core.frame.DataFrame'>
Index: 301 entries, 5 to 39
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Age            301 non-null    int64  
 1   Selling_Price  301 non-null    float64
 2   Present_Price  301 non-null    float64
 3   Kms_Driven     301 non-null    int64  
 4   Fuel_Type      301 non-null    int64  
 5   Seller_Type    301 non-null    int64  
 6   Transmission   301 non-null    int64  
 7   Owner          301 non-null    int64  
dtypes: float64(2), int64(6)
memory usage: 21.2 KB


In [42]:
df3.columns

Index(['Age', 'Selling_Price', 'Present_Price', 'Kms_Driven', 'Fuel_Type',
       'Seller_Type', 'Transmission', 'Owner'],
      dtype='object')

Normalizing dataframe

In [45]:
Scaler = MinMaxScaler(feature_range=(0,1))
norm1 = Scaler.fit_transform(df3)
df_norm = pd.DataFrame(norm1, columns=['Age', 'Selling_Price', 'Present_Price', 'Kms_Driven', 'Fuel_Type',
       'Seller_Type', 'Transmission', 'Owner'])

In [46]:
df_norm

Unnamed: 0,Age,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,0.000000,0.262178,0.103056,0.003145,0.0,0.0,0.0,0.0
1,0.066667,0.942693,0.389142,0.011011,0.0,0.0,1.0,0.0
2,0.066667,0.020057,0.005960,0.005005,0.5,1.0,0.0,0.0
3,0.066667,0.097421,0.044538,0.076052,0.5,0.0,0.0,0.0
4,0.066667,0.022923,0.006827,0.001602,0.5,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...
296,0.866667,0.097708,0.144993,0.149149,0.5,0.0,0.0,0.0
297,0.866667,0.097135,0.142393,0.393746,0.0,0.0,0.0,0.0
298,0.933333,0.040115,0.130364,0.269578,0.5,0.0,1.0,0.0
299,1.000000,0.007163,0.021240,0.253253,0.5,1.0,0.0,0.0


In [47]:
#setting the x and y for model
x = pd.DataFrame(df_norm,columns=['Age','Present_Price', 'Kms_Driven', 'Fuel_Type',
       'Seller_Type', 'Transmission', 'Owner' ])
y = df3['Selling_Price'].values.reshape(-1,1)

In [48]:
# setting the train and test values
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state=0)

In [49]:
# choosing the model
model = LinearRegression()

In [50]:
# training the model
model.fit(x_train,y_train)

In [51]:
# settin the predection values
y_pred = model.predict(x_test)

In [52]:
result = model.score(x_test,y_test)
print(result)

0.8604228341385417


In [53]:
print("Mean Absolut Error: ", metrics.mean_absolute_error(y_test,y_pred))
print("Mean Suared Error: ", metrics.mean_squared_error(y_test,y_pred))
print("Root Mean Squared Error: ", np.sqrt(metrics.mean_squared_error(y_test,y_pred)))
print("R2 Score: ", metrics.r2_score(y_test,y_pred))

Mean Absolut Error:  1.1326137667706697
Mean Suared Error:  2.4859479613401985
Root Mean Squared Error:  1.5766889234532595
R2 Score:  0.8604228341385417


### Improving model

In [54]:
model_new = LinearRegression()

using kfold and cross validation

In [55]:
kfold_validation = KFold(10)
results = cross_val_score(model_new,x,y.ravel(),cv=kfold_validation,n_jobs=1)
print(results)
print(np.mean(results))

[ 0.82867668  0.8541822   0.83300456  0.95437113  0.88731956  0.9528129
  0.87846075  0.89489722  0.76455446 -1.10359991]
0.6744679563703119


In [56]:
len(df3)/10

30.1

In [57]:
df_norm1 = df_norm[df_norm.index< 270]
df_norm1.reset_index(drop=True,inplace=True)
df_norm1

Unnamed: 0,Age,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,0.000000,0.262178,0.103056,0.003145,0.0,0.0,0.0,0.0
1,0.066667,0.942693,0.389142,0.011011,0.0,0.0,1.0,0.0
2,0.066667,0.020057,0.005960,0.005005,0.5,1.0,0.0,0.0
3,0.066667,0.097421,0.044538,0.076052,0.5,0.0,0.0,0.0
4,0.066667,0.022923,0.006827,0.001602,0.5,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...
265,0.533333,0.004298,0.002167,0.043043,0.5,1.0,1.0,0.0
266,0.533333,0.010029,0.006827,0.053053,0.5,1.0,0.0,0.0
267,0.533333,0.061605,0.077807,0.121528,0.5,0.0,0.0,0.0
268,0.533333,0.001719,0.002059,0.065065,0.5,1.0,0.0,0.0


In [58]:
#setting the x and y for model
xn = pd.DataFrame(df_norm1,columns=['Age','Present_Price', 'Kms_Driven', 'Fuel_Type',
       'Seller_Type', 'Transmission', 'Owner' ])
yn = df_norm1['Selling_Price'].values.reshape(-1,1)
# setting the train and test values
x_trainn, x_testn, y_trainn, y_testn = train_test_split(xn,yn,test_size=0.2,random_state=0)
# choosing the model
modeln = LinearRegression()
# training the model
modeln.fit(x_trainn,y_trainn)

In [59]:
# settin the predection values
y_predn = modeln.predict(x_testn)

In [60]:
print("Mean Absolut Error: ", metrics.mean_absolute_error(y_testn,y_predn))
print("Mean Suared Error: ", metrics.mean_squared_error(y_testn,y_predn))
print("Root Mean Squared Error: ", np.sqrt(metrics.mean_squared_error(y_testn,y_predn)))
print("R2 Score: ", metrics.r2_score(y_testn,y_predn))

Mean Absolut Error:  0.027971961776134775
Mean Suared Error:  0.0013834173065942017
Root Mean Squared Error:  0.037194318203110026
R2 Score:  0.9183319883197173


5% improvment

In [61]:
print('intercept: ',modeln.intercept_)
print('model coefs: ', modeln.coef_)

intercept:  [0.15382867]
model coefs:  [[-0.19434168  1.20315129 -0.22000618 -0.08923517 -0.05053681  0.04720216
   0.0115668 ]]


Adding dimentions to featuers

In [62]:
def check(Dimension,Testsize):
    r2 = 0.9183319883197173
    for column in xn:
        new_col_name = column + str(Dimension)
        new_col_val = xn[column]**Dimension
        xn.insert(0,new_col_name,new_col_val)
        x_tr, x_te, y_tr, y_te = train_test_split(xn,yn,test_size=Testsize,random_state=0)
        new_model = LinearRegression()
        new_model.fit(x_tr,y_tr)
        y_pr = new_model.predict(x_te)
        r2_new = metrics.r2_score(y_te, y_pr)
        if r2_new < r2:
            xn.drop([new_col_name],axis=1,inplace=True)
        else:
            r2 = r2_new
    print("r2 s :", r2)

In [63]:
check(2,0.2)

r2 s : 0.923308993484282


In [64]:
check(3,0.2)

r2 s : 0.9310825810929142


In [65]:
check(4,0.2)

r2 s : 0.9312791242296944


2% improvment

In [66]:
xn

Unnamed: 0,Seller_Type4,Fuel_Type234,Seller_Type34,Owner34,Owner3,Seller_Type3,Fuel_Type3,Present_Price3,Present_Price23,Fuel_Type23,Fuel_Type2,Present_Price2,Age,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,0.0,0.000000e+00,0.0,0.0,0.0,0.0,0.000,1.094508e-03,1.197947e-06,0.000000,0.00,0.010621,0.000000,0.103056,0.003145,0.0,0.0,0.0,0.0
1,0.0,0.000000e+00,0.0,0.0,0.0,0.0,0.000,5.892824e-02,3.472537e-03,0.000000,0.00,0.151431,0.066667,0.389142,0.011011,0.0,0.0,1.0,0.0
2,1.0,5.960464e-08,1.0,0.0,0.0,1.0,0.125,2.117217e-07,4.482607e-14,0.015625,0.25,0.000036,0.066667,0.005960,0.005005,0.5,1.0,0.0,0.0
3,0.0,5.960464e-08,0.0,0.0,0.0,0.0,0.125,8.834922e-05,7.805584e-09,0.015625,0.25,0.001984,0.066667,0.044538,0.076052,0.5,0.0,0.0,0.0
4,1.0,5.960464e-08,1.0,0.0,0.0,1.0,0.125,3.181991e-07,1.012506e-13,0.015625,0.25,0.000047,0.066667,0.006827,0.001602,0.5,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
265,1.0,5.960464e-08,1.0,0.0,0.0,1.0,0.125,1.018046e-08,1.036417e-16,0.015625,0.25,0.000005,0.533333,0.002167,0.043043,0.5,1.0,1.0,0.0
266,1.0,5.960464e-08,1.0,0.0,0.0,1.0,0.125,3.181991e-07,1.012506e-13,0.015625,0.25,0.000047,0.533333,0.006827,0.053053,0.5,1.0,0.0,0.0
267,0.0,5.960464e-08,0.0,0.0,0.0,0.0,0.125,4.710322e-04,2.218713e-07,0.015625,0.25,0.006054,0.533333,0.077807,0.121528,0.5,0.0,0.0,0.0
268,1.0,5.960464e-08,1.0,0.0,0.0,1.0,0.125,8.728468e-09,7.618616e-17,0.015625,0.25,0.000004,0.533333,0.002059,0.065065,0.5,1.0,0.0,0.0


Multiplying columns together

In [67]:
pres_kms = xn['Present_Price']*xn['Kms_Driven']
pres_fuel= xn['Present_Price']*xn['Fuel_Type']
pres_fuel2= xn['Present_Price']*xn['Fuel_Type2']
pres2_kms = xn['Present_Price2']*xn['Kms_Driven']
pres2_fuel= xn['Present_Price2']*xn['Fuel_Type']
pres2_fuel2= xn['Present_Price2']*xn['Fuel_Type2']

In [68]:
xnn = xn

In [69]:
xnn.insert(0,"pres_kms",pres_kms)
xnn.insert(0,"pres_fuel",pres_fuel)
xnn.insert(0,"pres_fuel2",pres_fuel2)
xnn.insert(0,"pres2_kms",pres2_kms)
xnn.insert(0,"pres2_fuel",pres2_fuel)
xnn.insert(0,"pres2_fuel2",pres2_fuel2)

In [70]:
X_train, X_test, Y_train, Y_test = train_test_split(xnn,yn,test_size=0.2,random_state=0)
Model = LinearRegression()
Model.fit(X_train,Y_train)
Y_pred = Model.predict(X_test)
r2 = metrics.r2_score(Y_test,Y_pred)
print("Mean Absolut Error: ", metrics.mean_absolute_error(Y_test,Y_pred))
print("Mean Suared Error: ", metrics.mean_squared_error(Y_test,Y_pred))
print("Root Mean Squared Error: ", np.sqrt(metrics.mean_squared_error(Y_test,Y_pred)))
print("R2 Score: ", r2)

Mean Absolut Error:  0.018586148220115152
Mean Suared Error:  0.000703214844132
Root Mean Squared Error:  0.02651819835758078
R2 Score:  0.9584867430596874


2% improvment

In [71]:
xnn.columns

Index(['pres2_fuel2', 'pres2_fuel', 'pres2_kms', 'pres_fuel2', 'pres_fuel',
       'pres_kms', 'Seller_Type4', 'Fuel_Type234', 'Seller_Type34', 'Owner34',
       'Owner3', 'Seller_Type3', 'Fuel_Type3', 'Present_Price3',
       'Present_Price23', 'Fuel_Type23', 'Fuel_Type2', 'Present_Price2', 'Age',
       'Present_Price', 'Kms_Driven', 'Fuel_Type', 'Seller_Type',
       'Transmission', 'Owner'],
      dtype='object')

In [72]:
xnn.head(1)

Unnamed: 0,pres2_fuel2,pres2_fuel,pres2_kms,pres_fuel2,pres_fuel,pres_kms,Seller_Type4,Fuel_Type234,Seller_Type34,Owner34,...,Fuel_Type23,Fuel_Type2,Present_Price2,Age,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,0.0,0.0,3.3e-05,0.0,0.0,0.000324,0.0,0.0,0.0,0.0,...,0.0,0.0,0.010621,0.0,0.103056,0.003145,0.0,0.0,0.0,0.0


In [73]:
pres_kmsn = xnn['Present_Price2']*xnn['pres_kms']
pres_fueln= xnn['Present_Price2']*xnn['pres_fuel']
pres_fuel2n= xnn['Present_Price2']*xnn['pres_fuel2']
pres2_kmsn = xnn['Present_Price23']*xnn['pres2_kms']
pres2_fueln= xnn['Present_Price23']*xnn['pres2_fuel']
pres2_fuel2n= xnn['Present_Price23']*xnn['pres2_fuel2']

In [74]:
xnnn = xnn

In [75]:
xnnn.insert(0,"pres_kmsn",pres_kmsn)
xnnn.insert(0,"pres_fueln",pres_fueln)
xnnn.insert(0,"pres_fuel2n",pres_fuel2n)
xnnn.insert(0,"pres2_kmsn",pres2_kmsn)
xnnn.insert(0,"pres2_fueln",pres2_fueln)
xnnn.insert(0,"pres2_fuel2n",pres2_fuel2n)

In [76]:
X_train, X_test, Y_train, Y_test = train_test_split(xnnn,yn,test_size=0.2,random_state=0)
Model = LinearRegression()
Model.fit(X_train,Y_train)
Y_pred = Model.predict(X_test)
r2 = metrics.r2_score(Y_test,Y_pred)
print("Mean Absolut Error: ", metrics.mean_absolute_error(Y_test,Y_pred))
print("Mean Suared Error: ", metrics.mean_squared_error(Y_test,Y_pred))
print("Root Mean Squared Error: ", np.sqrt(metrics.mean_squared_error(Y_test,Y_pred)))
print("R2 Score: ", r2)

Mean Absolut Error:  0.01956216611058778
Mean Suared Error:  0.0006965267952166077
Root Mean Squared Error:  0.026391794088629284
R2 Score:  0.9588815622182573


we have improved og model with 86% score to 95.8 % score, about 10% improvment!