In [2]:

import pickle

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics


In [3]:
# Load the car dataset into a DataFrame; the path is relative to the
# directory the notebook kernel is started from.
car_data = pd.read_csv("data/cardata.csv")


In [4]:
# Preview the first five rows to check that the columns parsed as expected.
car_data.head()


Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,ritz,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0
1,sx4,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0
2,ciaz,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0
3,wagon r,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0
4,swift,2014,4.6,6.87,42450,Diesel,Dealer,Manual,0


In [5]:
# (rows, columns) of the loaded frame.
car_data.shape


(301, 9)

In [5]:
# Per-column dtype and non-null count, used to spot missing values and the
# object (string) columns that need encoding before modelling.
car_data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 301 entries, 0 to 300
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Car_Name       301 non-null    object 
 1   Year           301 non-null    int64  
 2   Selling_Price  301 non-null    float64
 3   Present_Price  301 non-null    float64
 4   Kms_Driven     301 non-null    int64  
 5   Fuel_Type      301 non-null    object 
 6   Seller_Type    301 non-null    object 
 7   Transmission   301 non-null    object 
 8   Owner          301 non-null    int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 21.3+ KB


In [6]:
# encoding Columns
# Integer code assigned to every known label of each categorical column.
CATEGORY_CODES = {
    'Fuel_Type': {'Petrol': 0, 'Diesel': 1, 'CNG': 2},
    'Seller_Type': {'Dealer': 0, 'Individual': 1},
    'Transmission': {'Manual': 0, 'Automatic': 1},
}

for _column, _codes in CATEGORY_CODES.items():
    # Values that are not keys of the mapping (e.g. codes produced by an
    # earlier run of this cell) pass through unchanged, so re-running the
    # cell is safe.
    _encoded = car_data[_column].map(
        lambda value, codes=_codes: codes.get(value, value))
    # Cast explicitly instead of relying on pandas downcasting the object
    # column implicitly; an unmapped text label makes the cast raise rather
    # than leaving strings in a feature column.
    car_data[_column] = _encoded.astype('int64')


In [7]:
# Features are every column except the car name and the target column.
X = car_data.drop(columns=['Car_Name', 'Selling_Price'])
# Target to be predicted.
Y = car_data['Selling_Price']
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    random_state=42,
    test_size=0.2,
)


In [8]:
# Distinct values of the Owner column that occur in the training split.
# A cell only renders its final expression, so this is the single result shown.
X_train['Owner'].unique()


array([1, 0, 3])

In [9]:
# Create the linear regression estimator and fit it on the training split
# only; fit() returns the estimator itself, so it can be bound in one step.
lin_reg_model = LinearRegression().fit(X_train, Y_train)

# Final expression so the fitted estimator is still rendered as cell output.
lin_reg_model


In [10]:
# Predict on the same rows the model was fitted on; these in-sample
# predictions feed the training-set score computed afterwards.
training_data_prediction = lin_reg_model.predict(X_train)


In [11]:
# R^2 (coefficient of determination) of the in-sample predictions.
# Despite the "Error" wording in the label and variable name, this is a
# goodness-of-fit score: higher is better.
train_error_score = metrics.r2_score(
    y_true=Y_train,
    y_pred=training_data_prediction,
)
print("R squared Error - Training : ", train_error_score)


R squared Error - Training :  0.883979349675079


In [12]:
# prediction on Test data (the held-out split the model was not fitted on)
Y_pred = lin_reg_model.predict(X_test)


In [13]:
# R^2 (coefficient of determination) of the predictions on the held-out split.
# Despite the "Error" wording in the label and variable name, this is a
# goodness-of-fit score: higher is better.
test_error_score = metrics.r2_score(
    y_true=Y_test,
    y_pred=Y_pred,
)
print("R squared Error - Test: ", test_error_score)


R squared Error - Test:  0.8468053957655798


In [31]:
# Saving the model to a file
# `pickle` is imported in the notebook's import cell. The fitted estimator is
# serialised as-is; only load this file in a trusted environment, because
# unpickling can execute arbitrary code.
# NOTE(review): loading under a different scikit-learn version than the one
# used here may not be supported - confirm before deploying.
filename = 'lin_reg_model.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(lin_reg_model, model_file)


In [14]:
# Predict the target for one hand-entered car. The model was fitted on a
# DataFrame, so the row is wrapped in a frame carrying the training column
# names: this keeps each value tied to its feature and avoids scikit-learn's
# "X does not have valid feature names" warning for bare lists.
single_car = pd.DataFrame(
    [[2014, 5.9, 27000, 1, 0, 0, 0]],
    columns=X_train.columns,
)
lin_reg_model.predict(single_car)




array([5.42407139])

In [15]:
# Sample single-car input written as JSON, kept for reference. Its keys are
# the seven feature columns listed by X_train.info() below, in the same order.
# {
#   "Year": 2014,
#   "Present_Price": 5.9,
#   "Kms_Driven": 27000,
#   "Fuel_Type": 1,
#   "Seller_Type": 0,
#   "Transmission": 0,
#   "Owner": 0
# }
# Column names and dtypes the fitted model expects as input.
X_train.info()


<class 'pandas.core.frame.DataFrame'>
Index: 240 entries, 184 to 102
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Year           240 non-null    int64  
 1   Present_Price  240 non-null    float64
 2   Kms_Driven     240 non-null    int64  
 3   Fuel_Type      240 non-null    int64  
 4   Seller_Type    240 non-null    int64  
 5   Transmission   240 non-null    int64  
 6   Owner          240 non-null    int64  
dtypes: float64(1), int64(6)
memory usage: 15.0 KB


In [17]:
# One hand-entered car: feature names paired positionally with their values.
feature_names = [
    "Year",
    "Present_Price",
    "Kms_Driven",
    "Fuel_Type",
    "Seller_Type",
    "Transmission",
    "Owner",
]
feature_values = [2014, 5.9, 27000, 1, 0, 0, 0]

# Each value is wrapped in a one-element list so the mapping converts to a
# single-row DataFrame whose columns follow the order above.
mydata = {name: [value] for name, value in zip(feature_names, feature_values)}
mydf = pd.DataFrame(data=mydata)
mydf


Unnamed: 0,Year,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,2014,5.9,27000,1,0,0,0


In [18]:
# Predict for the single-row frame; its columns carry the same names, in the
# same order, as the training features.
lin_reg_model.predict(mydf)


array([5.42407139])