# A Machine Learning Program to predict price of used cars for sale
This is a project to scrape information about fairly used cars and their prices, and to use the resulting dataset to train a machine learning model that predicts the price of a newly listed fairly used car.

## Importing Libraries

In [1]:
import requests
from bs4 import BeautifulSoup
import re
import os
from datetime import datetime
import numpy as np
import pandas as pd
import glob

## Creating a function to get and parse a URL

In [2]:
def getAndParseURL(url, timeout=30):
    """Download ``url`` and return its HTML parsed into a BeautifulSoup tree.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float or tuple, optional
        Passed straight to ``requests.get``. Defaults to 30 seconds so that a
        stalled connection cannot hang a long scraping run indefinitely.

    Returns
    -------
    bs4.BeautifulSoup
        The response body parsed with Python's built-in ``'html.parser'``.

    Raises
    ------
    requests.exceptions.RequestException
        Propagated unchanged from ``requests.get`` (this now includes
        ``Timeout`` when the server does not answer in time).

    Notes
    -----
    The HTTP status code is not checked: an error page is parsed like any
    other response.
    """
    response = requests.get(url, timeout=timeout)
    return BeautifulSoup(response.text, 'html.parser')

In [25]:
# Walk the listing index pages and collect the absolute URL of every car advert.
BASE_URL = "https://www.cheki.com.ng"
# Exclusive upper bound of the `page` query parameter.
# NOTE(review): presumably the number of index pages when the scrape was run — confirm.
LAST_PAGE = 629

carUrls = []
for page in range(0, LAST_PAGE):
    main_url = BASE_URL + "/vehicles?page=" + str(page)
    soup = getAndParseURL(main_url)

    for card in soup.find_all("div", class_="card-main"):
        # A card without an <a> tag, or an <a> without href, cannot be turned
        # into a URL; skip it instead of raising AttributeError/TypeError.
        anchor = card.a
        href = anchor.get('href') if anchor is not None else None
        if not href:
            continue
        carUrls.append(BASE_URL + href)

In [2]:
# Resolve the notebook directory and the directory two levels above it, then
# make the latter the working directory.
#
# `current_dir` is captured only the first time this cell runs: the cell
# changes the working directory below, so re-reading os.curdir on a re-run
# would climb two further levels every time.
if "current_dir" not in globals():
    current_dir = os.path.abspath(os.curdir)

# Derive the target from the remembered starting point (not from the current
# working directory) so the result is the same on every execution.
main_dir = os.path.abspath(os.path.join(current_dir, "..", ".."))
os.chdir(main_dir)

In [3]:
# Locations of the scraped-listings CSV and of the folder that holds it,
# both beneath `main_dir`.
# NOTE(review): the saved output of this cell shows 'used_car_data.csv' —
# confirm which file name is current.
car_data_path = os.path.join(main_dir, 'data', 'raw', 'car_data.csv')
raw_data_path = os.path.dirname(car_data_path)
car_data_path

'/home/danselem/Desktop/opensource/car-price-prediction/car-price-prediction/data/raw/used_car_data.csv'

In [13]:
# Display one collected advert URL (index 6) as a quick sanity check.
carUrls[6]

'https://www.cheki.com.ng/vehicle/lexus-gx-460-5g8zen'

## What we need to get
The script below fetches the following features from each car's page: name, brand, price, mileage, condition, body type, color, drive type, fuel, drive setup, transmission, interior type, door count and year.

It was run in September 2020 to scrape car information from cheki.com.ng, a Nigerian car listing website (now autocheck.africa/ng).

In [None]:
# Scrape every advert in `carUrls` into one record per car, collect the
# records in `df`, and save them to `car_data_path`.
#
# Missing values keep the placeholders the rest of the notebook expects:
# np.nan for year / mileage / price, 'NA' for text fields (lower-cased to
# 'na' for the descriptive columns), and 0 for star / review counts.

COLUMNS = ['year',
           'brand', 'model', 'mileage', 'condition',
           'body_type', 'color', 'drive_type', 'fuel',
           'drive_setup',
           'transmission', 'interior_type', 'door_count',
           'star', 'review_no', 'date_posted', 'price']
MISSING_TEXT = 'NA'
# Number of scraped cars between intermediate saves, so an interrupted run
# still leaves a usable CSV without rewriting the file on every iteration.
CHECKPOINT_EVERY = 200


def _breadcrumb_text(soup, position):
    """Return the text of the breadcrumb <li> at ``position``, or 'NA' if absent."""
    crumbs_box = soup.find('div', {'class': 'bread-crumbs'})
    if crumbs_box is None:
        return MISSING_TEXT
    crumbs = crumbs_box.find_all('li')
    if position >= len(crumbs):
        return MISSING_TEXT
    return crumbs[position].text


def _attribute_text(info, label, default=MISSING_TEXT):
    """Return the <dd> text that follows the <dt> titled ``label``, else ``default``."""
    if info is None:
        return default
    term = info.find("dt", string=label)
    if term is None:
        return default
    value = term.find_next("dd")
    # `.string` is None when the <dd> has no single text child.
    if value is None or value.string is None:
        return default
    # Copy into a plain str: a NavigableString keeps a reference to its whole
    # parse tree, which would keep every scraped page alive in memory.
    return str(value.string)


def _parse_year(info):
    """Return the production year as an int, or np.nan if missing/unparseable."""
    raw = _attribute_text(info, "Year", default=None)
    try:
        return int(raw)
    except (TypeError, ValueError):
        return np.nan


def _parse_price(soup):
    """Return the listed price as an int, or np.nan if missing/unparseable."""
    box = soup.find('div', class_="listing-detail__price")
    if box is None or box.span is None:
        return np.nan
    # Drop the first two characters (presumably the currency prefix — confirm),
    # then the thousands separators.
    digits = re.sub('[,]', '', box.span.text[2:].strip())
    try:
        return int(digits)
    except ValueError:
        return np.nan


def _parse_date(soup):
    """Return the 'sailthru.date' meta value as a date, or 'NA' if unavailable."""
    meta = soup.find("meta", attrs={'name': 'sailthru.date'})
    if meta is None:
        return MISSING_TEXT
    try:
        return datetime.strptime(meta.get("content", ""), '%Y-%m-%d').date()
    except (TypeError, ValueError):
        return MISSING_TEXT


def _parse_reviews(soup):
    """Return ``(star, review_count)`` from the review block, or ``(0, 0)``."""
    block = soup.find('div', {'class': 'review-stars-block'})
    if block is None or block.span is None:
        return 0, 0
    parts = block.span.text.split()
    try:
        stars = int(parts[0])
        reviews = int(re.sub('[()]', '', parts[1]))
    except (IndexError, ValueError):
        return 0, 0
    return stars, reviews


def _scrape_car(url):
    """Fetch one advert page and return its features as a dict keyed by COLUMNS."""
    soup = getAndParseURL(url)
    info = soup.find('div', {'class': 'listing-detail__attributes'})
    star_no, review_no = _parse_reviews(soup)
    return {
        'year': _parse_year(info),
        # Breadcrumb positions 2 and 3 hold the brand and the model.
        'brand': _breadcrumb_text(soup, 2).lower(),
        'model': _breadcrumb_text(soup, 3).lower(),
        'mileage': _attribute_text(info, "Mileage (km)", default=np.nan),
        'condition': _attribute_text(info, "Condition").lower(),
        'body_type': _attribute_text(info, "Body Type").lower(),
        'color': _attribute_text(info, "Colour").lower(),
        'drive_type': _attribute_text(info, "Drive Type").lower(),
        'fuel': _attribute_text(info, "Fuel").lower(),
        'drive_setup': _attribute_text(info, "Drive Setup").lower(),
        'transmission': _attribute_text(info, "Transmission").lower(),
        'interior_type': _attribute_text(info, "Interior Type").lower(),
        'door_count': _attribute_text(info, "Door Count"),
        'star': star_no,
        'review_no': review_no,
        'date_posted': _parse_date(soup),
        'price': price_or_nan if (price_or_nan := _parse_price(soup)) is not None else np.nan,
    }


# Accumulate plain dicts and build the frame from them, instead of growing a
# DataFrame one row at a time.
records = []
for count, car in enumerate(carUrls, start=1):
    records.append(_scrape_car(car))
    if count % CHECKPOINT_EVERY == 0:
        pd.DataFrame(records, columns=COLUMNS).to_csv(car_data_path, index=False)

df = pd.DataFrame(records, columns=COLUMNS)
df.to_csv(car_data_path, index=False)

## Importing Data

In [4]:
# Load the scraped listings back from the CSV written by the scraping cell.
car_data = pd.read_csv(car_data_path)

In [12]:
# Confirm that the loaded object is a pandas DataFrame.
type(car_data)

pandas.core.frame.DataFrame

## Basic Structure

In [13]:
# Summarise column names, non-null counts and dtypes to see where values are missing.
car_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11864 entries, 0 to 11863
Data columns (total 17 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   year           11792 non-null  float64
 1   brand          11864 non-null  object 
 2   model          11864 non-null  object 
 3   mileage        6878 non-null   float64
 4   condition      11864 non-null  object 
 5   body_type      11864 non-null  object 
 6   color          11864 non-null  object 
 7   drive_type     11864 non-null  object 
 8   fuel           11864 non-null  object 
 9   drive_setup    11864 non-null  object 
 10  transmission   11864 non-null  object 
 11  interior_type  11864 non-null  object 
 12  door_count     11677 non-null  object 
 13  star           11864 non-null  int64  
 14  review_no      11864 non-null  int64  
 15  date_posted    11793 non-null  object 
 16  price          11793 non-null  float64
dtypes: float64(3), int64(2), object(12)
memory usage: 

In [15]:
# Preview the first ten rows of the loaded listings.
car_data.head(10)

Unnamed: 0,year,brand,model,mileage,condition,body_type,color,drive_type,fuel,drive_setup,transmission,interior_type,door_count,star,review_no,date_posted,price
0,2014.0,mercedes-benz,s550,26700.0,foreign used,saloons,silver,4 wheel drive,petrol,lefthand drive,automatic,leather,4,0,0,2020-07-16,27000000.0
1,2016.0,bmw,x5,53000.0,foreign used,4 wheel drives & suvs,white,4 wheel drive,petrol,lefthand drive,automatic,leather,4,0,0,2020-06-14,23000000.0
2,2019.0,land-rover,range-rover-vogue,4300.0,foreign used,4 wheel drives & suvs,black,4 wheel drive,petrol,lefthand drive,automatic,leather,4,0,0,2020-06-18,120000000.0
3,2011.0,lexus,gx,82500.0,foreign used,4 wheel drives & suvs,black,4 wheel drive,petrol,lefthand drive,automatic,leather,4,4,1,2020-07-18,12000000.0
4,2018.0,land-rover,range-rover-vogue,22500.0,foreign used,4 wheel drives & suvs,black,4 wheel drive,petrol,lefthand drive,automatic,leather,4,0,0,2020-07-18,50000000.0
5,2014.0,land-rover,range-rover-sport,23000.0,foreign used,4 wheel drives & suvs,gold,4 wheel drive,petrol,lefthand drive,automatic,leather,4,4,1,2020-07-16,21000000.0
6,2020.0,land-rover,range-rover-vogue,150.0,brand new,4 wheel drives & suvs,black,4 wheel drive,petrol,lefthand drive,automatic,leather,4,0,0,2020-06-11,160000000.0
7,2018.0,mercedes-benz,gle-43,7900.0,foreign used,4 wheel drives & suvs,white,4 wheel drive,petrol,lefthand drive,automatic,leather,4,0,0,2020-06-09,34000000.0
8,2018.0,lexus,lx,16800.0,foreign used,4 wheel drives & suvs,white,4 wheel drive,petrol,lefthand drive,automatic,leather,4,5,2,2020-06-11,51000000.0
9,2014.0,land-rover,range-rover-sport,26800.0,foreign used,4 wheel drives & suvs,white,4 wheel drive,petrol,lefthand drive,automatic,leather,4,4,1,2020-06-14,25000000.0
