## Linear Regression

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the Auto MPG data file. It is whitespace-separated with no header row,
# so the column names are supplied explicitly.
# `sep=r"\s+"` is the documented equivalent of `delim_whitespace=True`, which
# is deprecated since pandas 2.2.
df = pd.read_csv(
    "autompg.data",
    sep=r"\s+",
    header=None,
    names=[
        "mpg", "cylinders", "displacement",
        "horsepower", "weight", "acceleration",
        "model_year", "origin", "car_name",
    ],
)
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,car_name
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino


In [3]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(398, 9)

In [4]:
# Per-column dtype and non-null counts; an `object` dtype on a column expected
# to be numeric signals non-numeric entries that need cleaning.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    object 
 4   weight        398 non-null    float64
 5   acceleration  398 non-null    float64
 6   model_year    398 non-null    int64  
 7   origin        398 non-null    int64  
 8   car_name      398 non-null    object 
dtypes: float64(4), int64(3), object(2)
memory usage: 28.1+ KB


In [5]:
# Summary statistics; by default only numeric-dtype columns are included.
df.describe()

Unnamed: 0,mpg,cylinders,displacement,weight,acceleration,model_year,origin
count,398.0,398.0,398.0,398.0,398.0,398.0,398.0
mean,23.514573,5.454774,193.425879,2970.424623,15.56809,76.01005,1.572864
std,7.815984,1.701004,104.269838,846.841774,2.757689,3.697627,0.802055
min,9.0,3.0,68.0,1613.0,8.0,70.0,1.0
25%,17.5,4.0,104.25,2223.75,13.825,73.0,1.0
50%,23.0,4.0,148.5,2803.5,15.5,76.0,1.0
75%,29.0,8.0,262.0,3608.0,17.175,79.0,2.0
max,46.6,8.0,455.0,5140.0,24.8,82.0,3.0


## Data Preprocessing

In [6]:
# Count missing values per column. Placeholder strings such as "?" are not
# NaN, so they are not counted here.
df.isna().sum()

mpg             0
cylinders       0
displacement    0
horsepower      0
weight          0
acceleration    0
model_year      0
origin          0
car_name        0
dtype: int64

In [7]:
# Rows whose horsepower is recorded as the "?" placeholder.
df1 = df.loc[df["horsepower"].eq("?")]
df1.shape

(6, 9)

In [8]:
# Turn the "?" placeholders into NaN, then parse the column as numbers.
df["horsepower"] = pd.to_numeric(df["horsepower"].replace("?", np.nan))

In [9]:
# Impute missing horsepower values with the column median.
hp_median = df["horsepower"].median()
# Assign the result back instead of `df["horsepower"].fillna(..., inplace=True)`:
# the in-place call acts on an intermediate Series, which warns in pandas 2.2
# and leaves `df` unchanged under Copy-on-Write.
df["horsepower"] = df["horsepower"].fillna(hp_median)

In [10]:
# Inspect the raw car names; the first token of each is the manufacturer.
df["car_name"].to_numpy()

array(['chevrolet chevelle malibu', 'buick skylark 320',
       'plymouth satellite', 'amc rebel sst', 'ford torino',
       'ford galaxie 500', 'chevrolet impala', 'plymouth fury iii',
       'pontiac catalina', 'amc ambassador dpl', 'dodge challenger se',
       "plymouth 'cuda 340", 'chevrolet monte carlo',
       'buick estate wagon (sw)', 'toyota corona mark ii',
       'plymouth duster', 'amc hornet', 'ford maverick', 'datsun pl510',
       'volkswagen 1131 deluxe sedan', 'peugeot 504', 'audi 100 ls',
       'saab 99e', 'bmw 2002', 'amc gremlin', 'ford f250', 'chevy c20',
       'dodge d200', 'hi 1200d', 'datsun pl510', 'chevrolet vega 2300',
       'toyota corona', 'ford pinto', 'amc gremlin',
       'plymouth satellite custom', 'chevrolet chevelle malibu',
       'ford torino 500', 'amc matador', 'chevrolet impala',
       'pontiac catalina brougham', 'ford galaxie 500',
       'plymouth fury iii', 'dodge monaco (sw)',
       'ford country squire (sw)', 'pontiac safari (sw)',
 

In [11]:
# The manufacturer is the first whitespace-separated token of the car name.
df["Company_Name"] = [name.split(maxsplit=1)[0] for name in df["car_name"]]

# Frequency of each extracted manufacturer string (reveals misspellings/aliases).
df["Company_Name"].value_counts()

Company_Name
ford             51
chevrolet        43
plymouth         31
amc              28
dodge            28
toyota           25
datsun           23
buick            17
pontiac          16
volkswagen       15
honda            13
mercury          11
mazda            10
oldsmobile       10
fiat              8
peugeot           8
audi              7
chrysler          6
vw                6
volvo             6
renault           5
saab              4
subaru            4
opel              4
chevy             3
bmw               2
cadillac          2
maxda             2
mercedes-benz     2
triumph           1
vokswagen         1
mercedes          1
hi                1
capri             1
chevroelt         1
toyouta           1
nissan            1
Name: count, dtype: int64

In [12]:
df["Company_Name"] = df["Company_Name"].str.lower()

def replace_name(a,b):
    """Replace every exact occurrence of company name `a` with `b` in df["Company_Name"].

    Matching is on whole cell values, not substrings. A call is a no-op when
    `a` does not occur in the column.
    """
    # Assign the result back rather than calling `replace(..., inplace=True)` on
    # the selected column: the in-place form operates on an intermediate Series,
    # which warns in pandas 2.2 and leaves `df` unchanged under Copy-on-Write.
    df["Company_Name"] = df["Company_Name"].replace(a, b)

# Collapse misspellings and aliases onto one canonical manufacturer name.
replace_name("chevy", "chevrolet")
replace_name("chevroelt", "chevrolet")
replace_name("maxda", "mazda")
replace_name("mercedes", "mercedes-benz")
replace_name("nissan", "datsun")
replace_name("porschce", "porsche")
replace_name("toyouta", "toyota")
replace_name("vokswagen", "volkswagen")
replace_name("vw", "volkswagen")


df.Company_Name.unique()

array(['chevrolet', 'buick', 'plymouth', 'amc', 'ford', 'pontiac',
       'dodge', 'toyota', 'datsun', 'volkswagen', 'peugeot', 'audi',
       'saab', 'bmw', 'hi', 'mercury', 'opel', 'fiat', 'oldsmobile',
       'chrysler', 'mazda', 'volvo', 'renault', 'honda', 'subaru',
       'capri', 'mercedes-benz', 'cadillac', 'triumph'], dtype=object)

In [13]:
# car_name is no longer needed once Company_Name has been derived from it.
# Note: this mutates `df`, so re-running the cell raises KeyError.
df.drop(columns=["car_name"], inplace=True)

In [14]:
from sklearn.preprocessing import LabelEncoder

# Map each manufacturer string to an integer code (codes follow sorted class order).
# NOTE(review): scikit-learn documents LabelEncoder as intended for target
# values; the integer codes impose an arbitrary ordering on a nominal feature
# fed to a linear model. Consider one-hot encoding instead.
le = LabelEncoder().fit(df["Company_Name"])
df["Company_Name"] = le.transform(df["Company_Name"])

In [15]:
# Preview the frame after cleaning and encoding.
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,Company_Name
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,6
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,3
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,20
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,0
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,11


## Splitting & Predicting

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression

# Target is fuel efficiency (mpg); every remaining column is a feature.
y = df["mpg"]
X = df.drop(columns="mpg")

In [17]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=100
)

# Standardise the features, then fit ordinary least squares. Keeping the scaler
# inside the pipeline means it is fitted on the training rows only.
pipeline = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("algorithm", LinearRegression()),
    ]
)

pipeline.fit(X_train, y_train)

In [18]:
# The pipeline is already fitted in the previous cell; refitting here would only
# repeat the same work. Display the fitted pipeline instead.
pipeline

In [19]:
# Predict mpg for the held-out test rows using the fitted pipeline.
y_pred = pipeline.predict(X_test)

## Evaluation

In [20]:
from sklearn.metrics import mean_squared_error, r2_score

# Mean squared error on the test set, and its square root (RMSE).
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = pow(mse, 0.5)

In [21]:
# Report test-set error; RMSE is the square root of MSE, so it is in the
# same units as the target.
print("MSE: ", mse)
print("RMSE: ", rmse)

MSE:  9.826468212113237
RMSE:  3.134719798022343


In [22]:
# Coefficient of determination on the held-out test set.
r_squared = r2_score(y_true=y_test, y_pred=y_pred)
print("R-Sqrd: ", r_squared)

R-Sqrd:  0.8356384680722512
