In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv("/content/mpg.csv")

In [3]:
df.dtypes

mpg             float64
cylinders         int64
displacement    float64
horsepower       object
weight            int64
acceleration    float64
model_year        int64
origin            int64
name             object
dtype: object

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    object 
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model_year    398 non-null    int64  
 7   origin        398 non-null    int64  
 8   name          398 non-null    object 
dtypes: float64(3), int64(4), object(2)
memory usage: 28.1+ KB


In [5]:
df.shape

(398, 9)

Data Cleaning Step

In [8]:
print (df[pd.to_numeric(df['horsepower'], errors='coerce').isnull()])

# in place of errors we can use more options, errors : {'ignore', 'raise', 'coerce'}
# default 'raise' -> If 'raise', then invalid parsing will raise an exception. -> 
# If 'coerce', then invalid parsing will be set as NaN. -> If 'ignore', then invalid parsing will return the input.

      mpg  cylinders  displacement  ... model_year  origin                  name
32   25.0          4          98.0  ...         71       1            ford pinto
126  21.0          6         200.0  ...         74       1         ford maverick
330  40.9          4          85.0  ...         80       2  renault lecar deluxe
336  23.6          4         140.0  ...         80       1    ford mustang cobra
354  34.5          4         100.0  ...         81       2           renault 18i
374  23.0          4         151.0  ...         82       1        amc concord dl

[6 rows x 9 columns]


Cleaning, from the particular column

In [10]:
df['horsepower'] = pd.to_numeric(df['horsepower'], errors='coerce')
cols = df.columns

In [11]:
cols

Index(['mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
       'acceleration', 'model_year', 'origin', 'name'],
      dtype='object')

Making all the unwanted value as NaN

In [12]:
df[cols] = df[cols].apply(pd.to_numeric, errors='coerce') 
df.applymap(np.isreal)
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,1,
1,15.0,8,350.0,165.0,3693,11.5,70,1,
2,18.0,8,318.0,150.0,3436,11.0,70,1,
3,16.0,8,304.0,150.0,3433,12.0,70,1,
4,17.0,8,302.0,140.0,3449,10.5,70,1,


Dropping all the unnecessary columns

In [13]:
df = df.drop(['name','origin','model_year'], axis=1)
df = df.replace('?', np.nan)
df = df.dropna()

Separating the dependent variable(y) and independent variable(x)

In [14]:
X = df.drop('mpg', axis=1) 
y = df[['mpg']]

In [15]:
X

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration
0,8,307.0,130.0,3504,12.0
1,8,350.0,165.0,3693,11.5
2,8,318.0,150.0,3436,11.0
3,8,304.0,150.0,3433,12.0
4,8,302.0,140.0,3449,10.5
...,...,...,...,...,...
393,4,140.0,86.0,2790,15.6
394,4,97.0,52.0,2130,24.6
395,4,135.0,84.0,2295,11.6
396,4,120.0,79.0,2625,18.6


In [16]:
y

Unnamed: 0,mpg
0,18.0
1,15.0
2,18.0
3,16.0
4,17.0
...,...
393,27.0
394,44.0
395,32.0
396,28.0


Drop all the Nan values from the dataset

In [17]:
df=df.dropna()

Making training and testing dataset

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

Linear regression and finding the best fit line for only one column

In [19]:
reg = LinearRegression()
reg.fit(X_train[['horsepower']], y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

Predicting the value

In [20]:
y_predicted = reg.predict(X_test[['horsepower']])

In [21]:
y_predicted

array([[24.34050938],
       [27.81451133],
       [27.81451133],
       [21.97187169],
       [29.07778477],
       [26.5512379 ],
       [28.76196641],
       [15.97132287],
       [25.13005528],
       [28.28823887],
       [21.49814415],
       [27.65660215],
       [19.12950646],
       [28.60405723],
       [26.39332872],
       [17.55041467],
       [26.39332872],
       [32.07805918],
       [ 4.91768031],
       [23.86678185],
       [27.81451133],
       [22.28769005],
       [15.97132287],
       [12.81313928],
       [ 8.39168226],
       [15.97132287],
       [28.91987559],
       [23.86678185],
       [22.28769005],
       [26.39332872],
       [16.76086877],
       [25.761692  ],
       [15.18177698],
       [21.97187169],
       [23.07723595],
       [17.55041467],
       [24.65632774],
       [23.07723595],
       [29.07778477],
       [25.761692  ],
       [ 9.6549557 ],
       [12.02359339],
       [20.3927799 ],
       [23.86678185],
       [23.86678185],
       [22

Evalution matrices

In [22]:
print("Mean squared error: %.2f" % mean_squared_error(y_test, y_predicted))

print('R²: %.2f' % r2_score(y_test, y_predicted))

Mean squared error: 28.66
R²: 0.59


Finding the best fit line for more than one column

In [23]:
reg.fit(X_train[['horsepower','weight','cylinders']], y_train)
y_predicted = reg.predict(X_test[['horsepower','weight','cylinders']])

print("Mean squared error: %.2f" % mean_squared_error(y_test, y_predicted))
print('R²: %.2f' % r2_score(y_test, y_predicted))

Mean squared error: 19.12
R²: 0.72
