In [1]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import pandas as pd

In [2]:
# Read the car dataset from the CSV file into a DataFrame and preview it.
cars = pd.read_csv(filepath_or_buffer="car-df.csv")
cars

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car_name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino
...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86,2790,15.6,82,1,ford mustang gl
394,44.0,4,97.0,52,2130,24.6,82,2,vw pickup
395,32.0,4,135.0,84,2295,11.6,82,1,dodge rampage
396,28.0,4,120.0,79,2625,18.6,82,1,ford ranger


In [3]:
# (rows, columns) of the freshly loaded frame.
cars.shape

(398, 9)

In [4]:
# Peek at 15 randomly drawn rows (no seed is passed, so the rows differ per run).
cars.sample(n=15)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car_name
396,28.0,4,120.0,79,2625,18.6,82,1,ford ranger
49,23.0,4,122.0,86,2220,14.0,71,1,mercury capri 2000
21,24.0,4,107.0,90,2430,14.5,70,2,audi 100 ls
240,30.5,4,97.0,78,2190,14.1,77,2,volkswagen dasher
366,17.6,6,225.0,85,3465,16.6,81,1,chrysler lebaron salon
9,15.0,8,390.0,190,3850,8.5,70,1,amc ambassador dpl
189,15.5,8,304.0,120,3962,13.9,76,1,amc matador
384,32.0,4,91.0,67,1965,15.7,82,3,honda civic (auto)
217,30.0,4,111.0,80,2155,14.8,77,1,buick opel isuzu deluxe
324,40.8,4,85.0,65,2110,19.2,80,3,datsun 210


In [5]:
# Remove the car_name column; the remaining columns are kept as-is.
cars = cars.drop(columns=['car_name'])
cars

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin
0,18.0,8,307.0,130,3504,12.0,70,1
1,15.0,8,350.0,165,3693,11.5,70,1
2,18.0,8,318.0,150,3436,11.0,70,1
3,16.0,8,304.0,150,3433,12.0,70,1
4,17.0,8,302.0,140,3449,10.5,70,1
...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86,2790,15.6,82,1
394,44.0,4,97.0,52,2130,24.6,82,2
395,32.0,4,135.0,84,2295,11.6,82,1
396,28.0,4,120.0,79,2625,18.6,82,1


In [6]:
# Swap the numeric origin codes for readable region labels.
cars["origin"] = cars["origin"].replace(
    to_replace={1: "America", 2: "Europe", 3: "Asia"}
)
cars.sample(n=10)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin
13,14.0,8,455.0,225,3086,10.0,70,America
26,10.0,8,307.0,200,4376,15.0,70,America
90,12.0,8,429.0,198,4952,11.5,73,America
380,36.0,4,120.0,88,2160,14.5,82,Asia
385,38.0,4,91.0,67,1995,16.2,82,Asia
320,37.0,4,119.0,92,2434,15.0,80,Asia
272,23.8,4,151.0,85,2855,17.6,78,America
278,31.5,4,89.0,71,1990,14.9,78,Europe
211,16.5,6,168.0,120,3820,16.7,76,Europe
88,14.0,8,302.0,137,4042,14.5,73,America


In [7]:
# One-hot encode origin: one indicator column per region label.
# NOTE(review): all three indicators are kept, so they always sum to 1; if the
# regression coefficients are ever interpreted, consider drop_first=True -- confirm.
cars = pd.get_dummies(data=cars, columns=['origin'])
cars.sample(n=10)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin_America,origin_Asia,origin_Europe
170,23.0,4,140.0,78,2592,18.5,75,1,0,0
144,31.0,4,76.0,52,1649,16.5,74,0,1,0
2,18.0,8,318.0,150,3436,11.0,70,1,0,0
344,39.0,4,86.0,64,1875,16.4,81,1,0,0
16,18.0,6,199.0,97,2774,15.5,70,1,0,0
216,31.5,4,98.0,68,2045,18.5,77,0,1,0
297,25.4,5,183.0,77,3530,20.1,79,0,0,1
90,12.0,8,429.0,198,4952,11.5,73,1,0,0
245,36.1,4,98.0,66,1800,14.4,78,1,0,0
88,14.0,8,302.0,137,4042,14.5,73,1,0,0


In [8]:
# Summary statistics for the numeric columns of the frame.
cars.describe()

Unnamed: 0,mpg,cylinders,displacement,weight,acceleration,model year,origin_America,origin_Asia,origin_Europe
count,398.0,398.0,398.0,398.0,398.0,398.0,398.0,398.0,398.0
mean,23.514573,5.454774,193.425879,2970.424623,15.56809,76.01005,0.625628,0.198492,0.175879
std,7.815984,1.701004,104.269838,846.841774,2.757689,3.697627,0.484569,0.399367,0.381197
min,9.0,3.0,68.0,1613.0,8.0,70.0,0.0,0.0,0.0
25%,17.5,4.0,104.25,2223.75,13.825,73.0,0.0,0.0,0.0
50%,23.0,4.0,148.5,2803.5,15.5,76.0,1.0,0.0,0.0
75%,29.0,8.0,262.0,3608.0,17.175,79.0,1.0,0.0,0.0
max,46.6,8.0,455.0,5140.0,24.8,82.0,1.0,1.0,1.0


In [9]:
# horsepower is missing from the describe() summary above

In [10]:
# Inspect the dtype and non-null count of every column.
cars.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   mpg             398 non-null    float64
 1   cylinders       398 non-null    int64  
 2   displacement    398 non-null    float64
 3   horsepower      398 non-null    object 
 4   weight          398 non-null    int64  
 5   acceleration    398 non-null    float64
 6   model year      398 non-null    int64  
 7   origin_America  398 non-null    uint8  
 8   origin_Asia     398 non-null    uint8  
 9   origin_Europe   398 non-null    uint8  
dtypes: float64(3), int64(3), object(1), uint8(3)
memory usage: 23.1+ KB


In [11]:
# horsepower is "object" because its missing values are stored as non-numeric strings, so the column was not parsed as numbers

In [12]:
# Flag each horsepower entry: True when the string is all digits, False otherwise.
hpisdigit = cars["horsepower"].str.isdigit().to_frame()
hpisdigit.sample(n=10)

Unnamed: 0,horsepower
72,True
132,True
260,True
75,True
227,True
328,True
117,True
299,True
201,True
331,True


In [13]:
# Show only the rows of cars whose horsepower entry was flagged False in hpisdigit.
cars.loc[hpisdigit['horsepower'].eq(False)]

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin_America,origin_Asia,origin_Europe
32,25.0,4,98.0,?,2046,19.0,71,1,0,0
126,21.0,6,200.0,?,2875,17.0,74,1,0,0
330,40.9,4,85.0,?,1835,17.3,80,0,0,1
336,23.6,4,140.0,?,2905,14.3,80,1,0,0
354,34.5,4,100.0,?,2320,15.8,81,0,0,1
374,23.0,4,151.0,?,3035,20.5,82,1,0,0


In [14]:
# Turn the "?" placeholders into NaN, then re-display the rows flagged earlier.
cars = cars.replace(to_replace="?", value=np.nan)
cars.loc[hpisdigit['horsepower'].eq(False)]

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin_America,origin_Asia,origin_Europe
32,25.0,4,98.0,,2046,19.0,71,1,0,0
126,21.0,6,200.0,,2875,17.0,74,1,0,0
330,40.9,4,85.0,,1835,17.3,80,0,0,1
336,23.6,4,140.0,,2905,14.3,80,1,0,0
354,34.5,4,100.0,,2320,15.8,81,0,0,1
374,23.0,4,151.0,,3035,20.5,82,1,0,0


In [15]:
# Per-column medians.
# horsepower is still an object column (numeric strings plus NaN) at this point,
# and newer pandas releases raise TypeError when asked for the median of string
# data. Parse it to numbers on a temporary copy first; `cars` itself is unchanged.
cars.assign(horsepower=pd.to_numeric(cars["horsepower"])).median()

mpg                 23.0
cylinders            4.0
displacement       148.5
horsepower          93.5
weight            2803.5
acceleration        15.5
model year          76.0
origin_America       1.0
origin_Asia          0.0
origin_Europe        0.0
dtype: float64

In [16]:
# Random 10-row preview of the current frame.
cars.sample(n=10)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin_America,origin_Asia,origin_Europe
14,24.0,4,113.0,95,2372,15.0,70,0,1,0
339,26.6,4,151.0,84,2635,16.4,81,1,0,0
201,18.5,6,250.0,110,3645,16.2,76,1,0,0
167,29.0,4,97.0,75,2171,16.0,75,0,1,0
271,23.2,4,156.0,105,2745,16.7,78,1,0,0
365,20.2,6,200.0,88,3060,17.1,81,1,0,0
123,20.0,6,156.0,122,2807,13.5,73,0,1,0
87,13.0,8,350.0,145,3988,13.0,73,1,0,0
363,22.4,6,231.0,110,3415,15.8,81,1,0,0
6,14.0,8,454.0,220,4354,9.0,70,1,0,0


In [17]:
# Number of missing horsepower entries.
cars['horsepower'].isna().sum()

6

In [18]:
# Median horsepower, used to fill the missing entries.
# The column is still object dtype (numeric strings plus NaN), so parse it to
# numbers before taking the median; newer pandas raises TypeError on string data.
m = pd.to_numeric(cars['horsepower']).median()
m

93.5

In [19]:
# Fill the missing horsepower entries with the median stored in m.
cars['horsepower'] = cars['horsepower'].fillna(value=m)

In [20]:
# Check the column dtypes after filling the missing horsepower values.
cars.dtypes

mpg               float64
cylinders           int64
displacement      float64
horsepower         object
weight              int64
acceleration      float64
model year          int64
origin_America      uint8
origin_Asia         uint8
origin_Europe       uint8
dtype: object

In [21]:
# Convert horsepower to a 64-bit float column.
cars['horsepower'] = cars['horsepower'].astype(np.float64)

In [22]:
# Confirm the horsepower dtype after the cast.
cars.dtypes

mpg               float64
cylinders           int64
displacement      float64
horsepower        float64
weight              int64
acceleration      float64
model year          int64
origin_America      uint8
origin_Asia         uint8
origin_Europe       uint8
dtype: object

In [23]:
# Target: mpg kept as a one-column frame; features: every other column.
y = cars.loc[:, ['mpg']]
x = cars.drop(columns=['mpg'])

In [24]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.30,
    random_state=7,
)

In [25]:
# Fit a linear regression on the training split.
model = LinearRegression()
model.fit(X=x_train, y=y_train)

LinearRegression()

In [29]:
# Predictions for the held-out rows.
y_pred = model.predict(X=x_test)

In [30]:
from sklearn.metrics import mean_squared_error

In [31]:
# Mean squared error on the held-out split (an error measure: lower is better,
# and it is not an accuracy score).
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
# Kept under the original name as well, because the next cell displays `accuracy`.
accuracy = mse

In [32]:
# Despite its name, this holds the test-set mean squared error.
accuracy

13.435429159597199

In [33]:
# Model score on the held-out split.
model.score(X=x_test, y=y_test)

0.7540657201603549

In [34]:
# Model score on the training split, for comparison with the held-out score.
model.score(X=x_train, y=y_train)

0.8454570850891766