In this project, we will learn how to train a machine learning model with PyTorch to predict car prices.

# What is PyTorch?
PyTorch is a library in Python which provides tools to build deep learning models.

# Import the required libraries


In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset, random_split
# import jovian

# Read the data

In [4]:
# Download the car listings CSV into a DataFrame and preview its first rows.
df=pd.read_csv(r'https://raw.githubusercontent.com/amankharwal/Website-data/master/car%20data.csv')
df.head()

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,ritz,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0
1,sx4,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0
2,ciaz,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0
3,wagon r,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0
4,swift,2014,4.6,6.87,42450,Diesel,Dealer,Manual,0


# Data Cleaning
Before using the data, we need to do some cleaning, especially for the string columns, since they would consume too much time during later analysis.

In [6]:
# Seed string passed to customize_dataset to drive sampling, scaling and column dropping.
standard = "standard str" # customize_dataset reads characters 0-3, so at least 4 characters are required
def customize_dataset(df, rand_str):
    """Derive a reproducible, slightly perturbed copy of ``df``.

    The first four characters of ``rand_str`` drive the customization:

    * character 0 seeds the random sample that keeps 95% of the rows
      (rounded down);
    * character 1 scales the ``Year`` column by ``ord(char) / 100``;
    * character 2 scales the ``Selling_Price`` column the same way;
    * character 3 decides whether ``Car_Name`` is dropped (dropped when
      its code point is odd).

    The input frame is left untouched; a new DataFrame is returned.
    """
    working = df.copy()
    # Keep a seeded random 95% of the rows
    keep_count = int(0.95 * len(working))
    working = working.sample(keep_count, random_state=ord(rand_str[0]))
    # Rescale the input column
    working["Year"] = working["Year"] * ord(rand_str[1]) / 100.
    # Rescale the target column
    working["Selling_Price"] = working["Selling_Price"] * ord(rand_str[2]) / 100.
    # Optionally remove the car name column
    if ord(rand_str[3]) % 2:
        working = working.drop(columns=['Car_Name'])
    return working

# Build the customized working copy of the raw data and preview its first rows.
dataframe = customize_dataset(df, standard)
dataframe.head()

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
288,city,2337.4,8.148,13.6,34000,Petrol,Dealer,Manual,0
21,ignis,2339.72,4.753,5.71,2400,Petrol,Dealer,Manual,0
226,grand i10,2337.4,5.0925,5.7,24678,Petrol,Dealer,Manual,0
264,amaze,2337.4,3.88,7.0,40026,Petrol,Dealer,Manual,0
156,TVS Sport,2339.72,0.4656,0.52,15000,Petrol,Individual,Manual,0


In the function above, the variable **'standard'** seeds the random sampling of the rows, which is why they come out in a random order. After that, we can work with the customized dataset: we can create variables containing the number of rows and columns, and variables listing the numeric, categorical and output columns.

In [7]:
# Numeric feature columns that are extracted as model inputs.
inputCols = ["Year","Present_Price","Kms_Driven","Owner"]
# String-valued columns that dataframe_to_arrays converts to integer category codes.
# NOTE(review): none of these appear in inputCols, so the encoded values are
# never part of the extracted inputs — confirm whether that is intended.
categoricalCols = ["Fuel_Type","Seller_Type","Transmission"]
# Regression target column.
outputCols = ["Selling_Price"]

# Data Preparation
To use the data for training, we need to convert it from a DataFrame to PyTorch tensors.

In [9]:
def dataframe_to_arrays(dataframe):
    """Split a DataFrame into NumPy input and target arrays.

    Works on a deep copy, so the caller's frame is not modified. Every
    column named in the module-level ``categoricalCols`` is replaced by its
    integer category codes; the columns named in ``inputCols`` and
    ``outputCols`` are then extracted.

    Returns:
        A ``(inputs_array, targets_array)`` tuple of NumPy arrays.
    """
    encoded = dataframe.copy(deep=True)
    # Replace each categorical column with its integer category codes
    for name in categoricalCols:
        as_category = encoded[name].astype('category')
        encoded[name] = as_category.cat.codes
    # Extract inputs and targets as NumPy arrays
    return encoded[inputCols].to_numpy(), encoded[outputCols].to_numpy()

# Convert the customized DataFrame into NumPy input and target arrays.
inputs_array, targets_array = dataframe_to_arrays(dataframe)

The function above converts the input and output columns to NumPy arrays. Now our task is to create code that performs conversion from NumPy arrays to PyTorch Tensors.

In [10]:
# Convert the NumPy arrays to float32 tensors (the default dtype of nn.Linear parameters).
inputs = torch.tensor(inputs_array, dtype=torch.float32)
targets = torch.tensor(targets_array, dtype=torch.float32)

dataset = TensorDataset(inputs, targets)
# Hold out one fifth of the rows for validation. Both sizes are derived from
# the dataset length so they always sum to len(dataset), which random_split
# requires; for the 285-row dataset this is the same 228/57 split as before.
valSize = len(dataset) // 5
trainDs, valDs = random_split(dataset, [len(dataset) - valSize, valSize])
batchSize = 128

# Only the training batches are shuffled.
trainLoader = DataLoader(trainDs, batchSize, shuffle=True)
valLoader = DataLoader(valDs, batchSize)

Now, we are going to create a linear regression model using PyTorch to predict car prices.

In [11]:
# Layer dimensions: one input feature per input column, one output per target column.
input_size = len(inputCols)
output_size = len(outputCols)

class CarsModel(nn.Module):
    """Linear regression model scored with the L1 (mean absolute error) loss.

    Besides the forward pass, the class bundles the per-batch and per-epoch
    hooks that the training and evaluation loops call.
    """

    def __init__(self, in_features=None, out_features=None):
        """Create the single linear layer.

        Args:
            in_features: Number of input features. Defaults to the
                module-level ``input_size``.
            out_features: Number of predicted values. Defaults to the
                module-level ``output_size``.
        """
        super().__init__()
        # Fall back to the module-level sizes so a bare CarsModel() call
        # builds the same layer as before.
        if in_features is None:
            in_features = input_size
        if out_features is None:
            out_features = output_size
        self.linear = nn.Linear(in_features, out_features)

    def forward(self, xb):
        """Return the linear layer's predictions for the input batch ``xb``."""
        return self.linear(xb)

    def training_step(self, batch):
        """Return the L1 loss for one ``(inputs, targets)`` training batch."""
        inputs, targets = batch
        # Generate predictions
        out = self(inputs)
        # Calculate loss
        return F.l1_loss(out, targets)

    def validation_step(self, batch):
        """Return ``{'val_loss': loss}`` for one validation batch.

        The loss tensor is detached so it carries no autograd history.
        """
        inputs, targets = batch
        # Generate predictions
        out = self(inputs)
        # Calculate loss
        loss = F.l1_loss(out, targets)
        return {'val_loss': loss.detach()}

    def validation_epoch_end(self, outputs):
        """Average the per-batch validation losses into one epoch result.

        Args:
            outputs: List of dicts as returned by ``validation_step``.

        Returns:
            ``{'val_loss': float}`` holding the mean of the batch losses.
        """
        batch_losses = [x['val_loss'] for x in outputs]
        epoch_loss = torch.stack(batch_losses).mean()   # Combine losses
        return {'val_loss': epoch_loss.item()}

    def epoch_end(self, epoch, result, num_epochs):
        """Print the validation loss on every 20th epoch and on the last one.

        Args:
            epoch: Zero-based index of the epoch that just finished.
            result: Dict containing a ``'val_loss'`` entry.
            num_epochs: Total number of epochs in the current run.
        """
        if (epoch+1) % 20 == 0 or epoch == num_epochs-1:
            print("Epoch [{}], val_loss: {:.4f}".format(epoch+1, result['val_loss']))
            
# Instantiate the model and list its initial parameters (weight matrix and bias).
model = CarsModel()
list(model.parameters())

[Parameter containing:
 tensor([[-0.1735,  0.1892,  0.0218,  0.0214]], requires_grad=True),
 Parameter containing:
 tensor([0.1346], requires_grad=True)]

# Training Model to Predict Car Prices

In [12]:

# Eval algorithm
@torch.no_grad()
def evaluate(model, valLoader):
    """Return the model's aggregated validation result over ``valLoader``.

    Gradient tracking is disabled for the duration of the call, because no
    backward pass is performed on validation batches.

    Args:
        model: Object providing ``validation_step(batch)`` and
            ``validation_epoch_end(outputs)``.
        valLoader: Iterable yielding validation batches.

    Returns:
        Whatever ``model.validation_epoch_end`` returns for the list of
        per-batch ``validation_step`` results.
    """
    outputs = [model.validation_step(batch) for batch in valLoader]
    return model.validation_epoch_end(outputs)

# Fitting algorithm
def fit(epochs, lr, model, trainLoader, valLoader, opt_func=torch.optim.SGD):
    """Train ``model`` for ``epochs`` epochs and return the validation history.

    Args:
        epochs: Number of passes over ``trainLoader``.
        lr: Learning rate handed to the optimizer.
        model: Model providing ``parameters``, ``training_step`` and
            ``epoch_end`` plus the hooks used by ``evaluate``.
        trainLoader: Iterable yielding training batches.
        valLoader: Iterable yielding validation batches.
        opt_func: Optimizer factory called as ``opt_func(params, lr)``.

    Returns:
        A list with one ``evaluate`` result per epoch, in order.
    """
    optimizer = opt_func(model.parameters(), lr)

    def run_training_epoch():
        # One optimizer update per batch; gradients are cleared after each step
        for batch in trainLoader:
            model.training_step(batch).backward()
            optimizer.step()
            optimizer.zero_grad()

    history = []
    for epoch in range(epochs):
        # Training phase
        run_training_epoch()
        # Validation phase
        epoch_result = evaluate(model, valLoader)
        model.epoch_end(epoch, epoch_result, epochs)
        history.append(epoch_result)
    return history

# Check the validation loss of the model before any training, as a baseline
result = evaluate(model, valLoader)
result

{'val_loss': 491.13665771484375}

In [14]:
# First training run: 90 epochs with fit's default optimizer (SGD) and a very small learning rate
epochs = 90
lr = 1e-8
history1 = fit(epochs, lr, model, trainLoader, valLoader)

Epoch [20], val_loss: 224.0631
Epoch [40], val_loss: 207.5535
Epoch [60], val_loss: 207.7427
Epoch [80], val_loss: 207.0801
Epoch [90], val_loss: 206.9049


In [15]:
# Train repeatedly until the val_loss is 'good' enough; this run reuses the
# same model object with a smaller learning rate.
# NOTE(review): assigning to history1 again discards the list returned by the
# previous fit call — confirm whether a separate name was intended.
epochs = 20
lr = 1e-9
history1 = fit(epochs, lr, model, trainLoader, valLoader)

Epoch [20], val_loss: 206.8093
