
# Model Development

# 1. Predict Car Price based on highway-mpg

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw data set from a CSV file in the working directory.
# header=None tells read_csv() not to treat the first row as column names,
# so every row is kept as data and the columns get default integer labels.
path = "Auto85.csv"
df = pd.read_csv(path, header = None)  # read_csv() assumes a header row unless told otherwise

In [3]:
# Column names for the data set, in file order (26 names in total).
headers = [
    "symboling", "normalized-losses", "make",
    "fuel-type", "aspiration", "num-of-doors", "body-style",
    "drive-wheels", "engine-location",
    "wheel-base", "length", "width", "height", "curb-weight",
    "engine-type", "num-of-cylinders", "engine-size", "fuel-system",
    "bore", "stroke", "compression-ratio",
    "horsepower", "peak-rpm",
    "city-mpg", "highway-mpg",
    "Price",
]

In [4]:
# Replace the frame's column labels with the descriptive names in `headers`.
# The list must have exactly one name per column, otherwise pandas raises.
df.columns = headers

In [5]:
# Preview the first five rows to confirm the names line up with the data.
df.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,Price
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


# Preprocessing

- Check the `highway-mpg` and `Price` columns
- Both should be numeric
- Neither should contain any missing data

In [6]:
# Consider highway-mpg as the single predictor for price.
# Displaying the column shows its values, length and dtype at a glance.
df["highway-mpg"]

0      27
1      27
2      26
3      30
4      22
       ..
200    28
201    25
202    23
203    27
204    25
Name: highway-mpg, Length: 205, dtype: int64

In [6]:
# Count the missing (NaN) entries in highway-mpg; 0 means no gaps in the predictor.
df["highway-mpg"].isnull().sum()

0

In [8]:
# Inspect the dtype of the Price column.
# An object dtype (shown as 'O') means the column is not stored as numbers yet.
df["Price"].dtype

dtype('O')

In [10]:
# Convert the Price column to numeric.
# The column holds "?" placeholders, which keep it at dtype object. Swap them
# for NaN, convert to numbers, and assign the result back to the frame.
# Assigning back (rather than calling replace(..., inplace=True) on the
# selection df["Price"]) avoids chained assignment: pandas warns about that
# pattern, and with copy-on-write enabled it would leave df unchanged.
df["Price"] = pd.to_numeric(df["Price"].replace("?", np.nan))

In [11]:
# Count the missing (NaN) entries in the Price column after the numeric conversion.
df["Price"].isnull().sum()

4

In [12]:
# Remove, in place, every row whose Price is missing (NaN).
df.dropna(axis="index", subset=["Price"], inplace=True)

In [13]:
# Re-count missing values in Price to confirm none remain after the rows were dropped.
df["Price"].isnull().sum()

0

# Using the scikit-learn Library for Linear Regression

In [14]:
# Import Linear Model from Scikit Learn
from sklearn.linear_model import LinearRegression

In [16]:
# Create a linear regression estimator with default settings (not fitted yet).
linear_model = LinearRegression()

In [21]:
# Feature set X: selecting with a list of names keeps it a one-column DataFrame
# (two-dimensional) rather than a Series.
X = df.loc[:, ["highway-mpg"]]
# Target variable Y: the Price column as a Series.
Y = df.loc[:, "Price"]

In [22]:
# Fit the model: estimate the intercept and slope from X (highway-mpg) and Y (Price).
# fit() returns the estimator itself, which is why the cell displays it.
linear_model.fit(X,Y)

LinearRegression()

In [23]:
# Report the fitted parameters: c_0 is the intercept, c_1 the coefficient array
# (one entry per feature, so a single value here).
fitted_intercept = linear_model.intercept_
fitted_coefficients = linear_model.coef_
print("c_0 = ", fitted_intercept)
print("c_1 = ", fitted_coefficients)

c_0 =  38423.305858157386
c_1 =  [-821.73337832]


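The fitted model has the form

$$\text{Predicted Price} = c_0 + c_1 \times \text{highway-mpg}$$

where $c_0 = 38423.305858157386$ (intercept) and $c_1 = -821.73337832$ (slope).

So, rounding to two decimal places,

$$\text{Predicted Price} = 38423.31 - 821.73 \times \text{highway-mpg}$$

That is, each additional unit of highway-mpg lowers the predicted price by about 822.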


# Predict Price for a New Value of highway-mpg

In [26]:
# Predict Price of a car for highway-mpg = 30.
# The model was fitted on a DataFrame, so scikit-learn recorded the feature
# name "highway-mpg". Passing a bare NumPy array makes recent scikit-learn
# versions warn that X has no valid feature names, so supply a one-row
# DataFrame with the same column name as the training data instead.
linear_model.predict(pd.DataFrame({"highway-mpg": [30]}))



array([13771.3045085])

# Multiple Linear Regression
- Predict the price of a car based on horsepower, curb-weight, engine-size, and highway-mpg
- Make sure to check the types of all columns