# Web Scraping from OLX about Car Sales

In [1]:
import pandas as pd
import requests
from time import sleep
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, WebDriverException
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

In [150]:
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}

def scrape_car_data():
    retry_count = 3  # Number of times to retry in case of failure

    for attempt in range(retry_count):
        try:
            # Initialize Selenium with a longer script timeout
            driver = webdriver.Chrome()
            driver.set_script_timeout(40)  # Set the script timeout to 30 seconds

            # Open the target URL
            driver.get("https://www.olx.com.pk/prince-cars_c84?filter=make_eq_prince%2Cmileage_between_1000_to_500000%2Cmodel_eq_prince-1%2Cnew_used_eq_used%2Cpetrol_eq_petrol_and_cng%2Cprice_between_50000_to_5000000%2Cyear_between_2000_to_2024")

            # Scroll down the page until no new items are loaded
            a = 1
            while True:
                try:
                    last_height = driver.execute_script("return document.documentElement.scrollHeight")
                    driver.execute_script("window.scrollTo(0, document.documentElement.scrollHeight);")
                    sleep(40)  # Wait for content to load
                    new_height = driver.execute_script("return document.documentElement.scrollHeight")

                    # Print the current scroll position
                    print(f"{a} --- Scrolled: {new_height - last_height} pixels | Total height: {new_height} pixels")
                    a+=1

                    if last_height == new_height:
                        break
                except TimeoutException as e:
                    print(f"Timed out waiting for page to load: {e}")

            # Get the HTML content before closing the browser window
            html_content = driver.page_source

            # Close the browser window
            driver.quit()

            # Parse the HTML content with BeautifulSoup
            soup = BeautifulSoup(html_content, 'lxml')

            # Extract car data
            card_anchors = soup.find_all('div', class_='_9bea76df')
            anchor_list = []
            for i in card_anchors:
                anchor_tag = i.find("a")
                anchor_list.append(anchor_tag['href'])
            return anchor_list
        
#             if len(anchor_list)==732:
#                 print("")


        except (TimeoutException, WebDriverException) as e:
            print(f"Error: {e}")
            print(f"Retrying ({attempt + 1}/{retry_count}) in 60 seconds...")
            sleep(40)  # Wait for 60 seconds before retrying

    print(f"Failed after {retry_count} attempts. Consider checking your internet connection.")
    return []

# Scrape car data and save to CSV file
card_anchors = scrape_car_data()


1 --- Scrolled: 7944 pixels | Total height: 13458 pixels
2 --- Scrolled: 0 pixels | Total height: 13458 pixels


In [151]:
a = card_anchors

In [152]:
len(card_anchors)

58

In [153]:
card_anchors[0]

'/item/prince-pearl-model-2020-iid-1080767889'

# Save the Links through Pickle

In [2]:
import pickle

pickle.dump(a, open("CollectingData/Daihatsu/Cuore.pkl","wb"))

NameError: name 'a' is not defined

# Try to Fetch 

In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from time import sleep

In [3]:
url = "https://www.olx.com.pk/item/suzuki-wagonr-vxl-ags-2022-model-b-t-b-total-genuine-strachles-7700-km-iid-1080823704"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}

response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.content, "lxml")

In [11]:
soup

<!DOCTYPE html>
<html dir="ltr" itemscope="" itemtype="http://schema.org/WebPage" lang="en"><head><meta charset="utf-8"/><meta content="width=device-width, initial-scale=1.0, user-scalable=0" name="viewport"/><link href="https://ll8iz711cs-dsn.algolia.net" rel="dns-prefetch"/><link href="https://www.googletagmanager.com" rel="dns-prefetch"/><link href="https://www.google-analytics.com" rel="dns-prefetch"/><link href="https://images.bayut.com" rel="dns-prefetch"/><link href="/assets/apple-touch-icon.77341dce2e1fd72b9b227dd2ea0d1930.png" rel="apple-touch-icon" sizes="180x180"/><link href="/assets/favicon-16x16.ec663eb59c14f769c42c1a2beed283ca.png" rel="icon" sizes="16x16" type="image/png"/><link href="/assets/favicon-32x32.00d020d6d9a5e92de35779a72f4cc76a.png" rel="icon" sizes="32x32" type="image/png"/><link href="/assets/5dfa2b548d5a01980e8c6eb4f0dd2a2b.json" rel="manifest"/><link color="#28b16d" href="/assets/safari-pinned-tab.5db66542612ff1e8fcf04a09b4dbf4d0.svg" rel="mask-icon"/><met

In [12]:
response

<Response [200]>

### Fetching Necessary Information

#### 1. AD ID

In [13]:
ad_id = soup.find("div", class_="_171225da").text.split(" ")[-1]
ad_id

'1080823704'

#### 2. Price of Car

In [14]:
car_rupees = soup.find("span", class_="_56dab877").text.split(" ")[-1]
car_rupees

'3,150,000'

#### 3. Name of Car

In [15]:
car_name = soup.find("h1", class_="a38b8112").text
car_name

'Suzuki wagonr VXL AGS 2022 model B T B total genuine strachles 7700 km'

#### 4. Location of Seller

In [16]:
div = soup.find("div", class_="_1075545d")
seller_location = div.find("span", class_="_6d5b4928", attrs={"aria-label":"Location"}).text
seller_location

'PAF Road, Sargodha'

#### 5. Car Details

In [41]:
l = ["Make",
     "Model",
     "Year",
     "KM's driven",
     "Price",
     "Fuel",
     "Registration city",
     "Car documents",
     "Assembly",
     "Transmission",
     "Condition"]

details = soup.find_all("div", "b44ca0b3")
for i in details:
    span1 = i.find_all("span")[0].text
    span2 = i.find_all("span")[1].text
    print(f"{span1} : {span2}")
    print(i)

Make : Mercedes
<div class="b44ca0b3"><span>Make</span><span>Mercedes</span></div>
Model : C Class
<div class="b44ca0b3"><span>Model</span><span>C Class</span></div>
Year : 2004
<div class="b44ca0b3"><span>Year</span><span>2004</span></div>
KM's driven : 111,112
<div class="b44ca0b3"><span>KM's driven</span><span>111,112</span></div>
Price : 2,900,000
<div class="b44ca0b3"><span>Price</span><span>2,900,000</span></div>
Fuel : Petrol
<div class="b44ca0b3"><span>Fuel</span><span>Petrol</span></div>
Registration city : Sindh
<div class="b44ca0b3"><span>Registration city</span><span>Sindh</span></div>
Car documents : Original
<div class="b44ca0b3"><span>Car documents</span><span>Original</span></div>
Assembly : Imported
<div class="b44ca0b3"><span>Assembly</span><span>Imported</span></div>
Transmission : Automatic
<div class="b44ca0b3"><span>Transmission</span><span>Automatic</span></div>
Condition : Used
<div class="b44ca0b3"><span>Condition</span><span>Used</span></div>


#### 6. Car Description

In [12]:
div = soup.find("div", class_="_0f86855a")
description = div.find("span").text
description

'Bumper to bumper total genuine\xa0\n\nSuzuki wagonr VXL AGS\xa0\n\nModel 2022\xa0\n\n1.0 cc engine capacity\n\nSilver colour\n\n7700 km original milage\n\nStill smell like a new car . \n\nPunjab register\n\nLooksine like a zero meter car. \n\n(( Extra work on my car all done ))\n\nNew leather seat covers\n\nLCD screen\n\nBack camera\xa0\n\nNew farshi\xa0\n\nAir press (sunshade )\n\nLocation Sargodha city area . \n\nO3O36OO8582'

#### 7. Features of Car

In [13]:
l = []
features = soup.find_all("span", class_="_66b85548")
for i in features:
    span = i.text
    l.append(span)
l

['ABS',
 'Air Bags',
 'Air Conditioning',
 'Alloy Rims',
 'AM/FM Radio',
 'Immobilizer Key',
 'Keyless Entry',
 'Navigation System',
 'Power Locks',
 'Power Windows',
 'Rear speakers',
 'Rear Camera',
 'USB and Auxillary Cable']

#### 8. Images_URL

In [14]:
l = []
img = soup.find_all("img", class_="_5b8e3f79")
for i in img:
    url = i['src']
    l.append(url)
    print(url)

https://images.olx.com.pk/thumbnails/414412614-800x600.jpeg
https://images.olx.com.pk/thumbnails/414412615-800x600.jpeg
https://images.olx.com.pk/thumbnails/414412616-800x600.jpeg
https://images.olx.com.pk/thumbnails/414412617-800x600.jpeg
https://images.olx.com.pk/thumbnails/414412618-800x600.jpeg
https://images.olx.com.pk/thumbnails/414412619-800x600.jpeg
https://images.olx.com.pk/thumbnails/414412620-800x600.jpeg
https://images.olx.com.pk/thumbnails/414412621-800x600.jpeg
https://images.olx.com.pk/thumbnails/414412623-800x600.jpeg
https://images.olx.com.pk/thumbnails/414412625-800x600.jpeg
https://images.olx.com.pk/thumbnails/414412627-800x600.jpeg
https://images.olx.com.pk/thumbnails/414412629-800x600.jpeg
https://images.olx.com.pk/thumbnails/414412631-800x600.jpeg
https://images.olx.com.pk/thumbnails/414412633-800x600.jpeg
https://images.olx.com.pk/thumbnails/414412634-800x600.jpeg
https://images.olx.com.pk/thumbnails/414412635-800x600.jpeg
https://images.olx.com.pk/thumbnails/414

# Now time to Fetch all car details

In [4]:
import pickle

car_anchors = pickle.load(open("CollectingData/Nissan/AD Van.pkl","rb"))


In [5]:
len(car_anchors)

38

In [6]:
car_anchors[0]

'/item/nissan-haice-van-2009-model-iid-1080793404'

In [7]:
car_anchors[-1]

'/item/nissan-ad-20072014-iid-1079766939'

In [26]:
car_anchors1 = car_anchors[205:]
# car_anchors2 = car_anchors[350:504]

In [8]:
def fetch_url(url):
    max_retries = 3
    for _ in range(max_retries):
        try:
            response = requests.get(url, headers=headers)
            response.raise_for_status()  # Raise an HTTPError for bad responses
            return response
        except requests.RequestException as e:
            print(f"Error fetching URL {url}: {e}")
            sleep(3)  # Wait before retrying

    return None

In [None]:
# Create a list to store all car data
all_car_data = []
no = 1
# Iterate through each card anchor
for card_anchor in car_anchors:

    try:
        #     card_link = card_anchor['href']  # Extract the link from the anchor tag
        page_url = f"https://www.olx.com.pk{card_anchor}"
        response = fetch_url(page_url)
        if response is not None:
            soup = BeautifulSoup(response.content, "lxml")

            # All Cars Data
            details = {}

            # AD Id  --------------------->
            ad_id = soup.find("div", class_="_171225da")
            if ad_id is not None:
                details["Ad ID"] = ad_id.text.split(" ")[-1]
            else:
                details["Ad ID"] = None

            # Car Name  ------------------>
            car_name = soup.find("h1", class_="a38b8112")
            if car_name is not None:
                details["Car Name"] = car_name.text
            else:
                details["Car Name"] = None

            # Car Details ---------------->
            common_list = ["Make", "Model", "Year", "KM's driven", "Price", "Fuel", "Registration city", "Car documents", "Assembly", "Transmission", "Condition"]
            car_details = soup.find_all("div", "b44ca0b3")
            # Initialize a dictionary to store details temporarily
            details_temp = {}
            for detail in car_details:
                spans = detail.find_all("span")
                if len(spans) == 2:
                    span1 = spans[0].text.strip()
                    if span1 in common_list:
                        span2 = spans[1].text.strip()
                        details_temp[span1] = span2
            # Update the main details dictionary with temporary details
            for key in common_list:
                if key in details_temp:
                    details[key] = details_temp[key]
                else:
                    details[key] = None

            # Location of Seller ----------->
            div = soup.find("div", class_="_1075545d")
            if div:
                seller_location = div.find("span", class_="_6d5b4928", attrs={"aria-label":"Location"})
                if seller_location is not None:
                    details["Seller Location"] = seller_location.text
                else:
                    details["Seller Location"] = None

            # Price of Car ----------------->
            car_rupees = soup.find("span", class_="_56dab877")
            if car_rupees is not None:
                details["Price (pkr)"] = car_rupees.text.split(" ")[-1]
            else:
                details["Price (pkr)"] = None

            # description of car ---------->
            div = soup.find("div", class_="_0f86855a")
            if div:
                description = div.find("span")
                if description is not None:
                    details["Description"] = description.text
                else:
                    details["Description"] = None

            # features of car ------------->
            l2 = []
            features = soup.find_all("span", class_="_66b85548")
            for i in features:
                span = i.text
                l2.append(span)

            if len(l2)!=0:
                details["Car Features"] = ", ".join(l2)
            else:
                details["Car Features"] = None

            # images_urls
            l3 = []
            img = soup.find_all("img", class_="_5b8e3f79")
            for i in img:
                url = i['src']
                l3.append(url)

            if len(l3)!=0:
                details["Images URL's"] = l3
            else:
                details["Images URL's"] = None

            # Anchors URL
            details["Page URL"] = page_url


            # Append the details to the list
            if any(details.values()):
                all_car_data.append(details)
                print(f"{no} is Done")
                no += 1

                if no==696:
                    break
    except ValueError as e:
        print(f"Error: {e}")
        

# Create a DataFrame from the list
df = pd.DataFrame(all_car_data)

1 is Done
2 is Done
3 is Done
4 is Done
5 is Done
6 is Done
Error fetching URL https://www.olx.com.pk/item/nissan-ad-iid-1080818330: 404 Client Error: Not Found for url: https://www.olx.com.pk/item/nissan-ad-iid-1080818330
Error fetching URL https://www.olx.com.pk/item/nissan-ad-iid-1080818330: 404 Client Error: Not Found for url: https://www.olx.com.pk/item/nissan-ad-iid-1080818330
Error fetching URL https://www.olx.com.pk/item/nissan-ad-iid-1080818330: 404 Client Error: Not Found for url: https://www.olx.com.pk/item/nissan-ad-iid-1080818330
7 is Done
8 is Done
9 is Done
10 is Done
11 is Done
12 is Done
13 is Done
14 is Done
Error fetching URL https://www.olx.com.pk/item/nissan-ad-van-20062013-iid-1076657049: 404 Client Error: Not Found for url: https://www.olx.com.pk/item/nissan-ad-van-20062013-iid-1076657049
Error fetching URL https://www.olx.com.pk/item/nissan-ad-van-20062013-iid-1076657049: 404 Client Error: Not Found for url: https://www.olx.com.pk/item/nissan-ad-van-20062013-iid

In [None]:
df_n = df.copy()

In [23]:
df_n1 = df_n.copy()

In [28]:
df_n2 = df.copy()

In [161]:
df_n3 = df.copy()

In [171]:
df_n2.dropna(inplace=True)

In [25]:
df_n1.shape

(181, 19)

In [29]:
df_n2.shape

(44, 19)

In [162]:
df_n3.shape

(260, 19)

In [30]:
df_n2.isnull().sum()

Ad ID                11
Car Name             11
Make                 11
Model                11
Year                 11
KM's driven          11
Price                11
Fuel                 11
Registration city    14
Car documents        11
Assembly             11
Transmission         11
Condition            11
Seller Location      11
Price (pkr)          11
Description          11
Car Features         11
Images URL's         11
Page URL              0
dtype: int64

In [24]:
df_n1.isnull().sum()

Ad ID                0
Car Name             0
Make                 0
Model                0
Year                 0
KM's driven          0
Price                0
Fuel                 0
Registration city    0
Car documents        0
Assembly             0
Transmission         0
Condition            0
Seller Location      0
Price (pkr)          0
Description          0
Car Features         0
Images URL's         0
Page URL             0
dtype: int64

In [33]:
df_n = pd.concat([df_n1, df_n2], ignore_index=True)

In [34]:
df_n.shape

(214, 19)

In [35]:
df_n.isnull().sum()

Ad ID                0
Car Name             0
Make                 0
Model                0
Year                 0
KM's driven          0
Price                0
Fuel                 0
Registration city    0
Car documents        0
Assembly             0
Transmission         0
Condition            0
Seller Location      0
Price (pkr)          0
Description          0
Car Features         0
Images URL's         0
Page URL             0
dtype: int64

In [70]:
# car_anchors[0:60]

In [20]:
df_n.iloc[240:300,:]
# 205-

Unnamed: 0,Ad ID,Car Name,Make,Model,Year,KM's driven,Price,Fuel,Registration city,Car documents,Assembly,Transmission,Condition,Seller Location,Price (pkr),Description,Car Features,Images URL's,Page URL
240,,,,,,,,,,,,,,,,,,,https://www.olx.com.pk/item/toyota-yaris-13-at...
241,,,,,,,,,,,,,,,,,,,https://www.olx.com.pk/item/good-condition-iid...


In [101]:
df_n = df_n1.copy()

In [31]:
df_n2["Registration city"] = df_n2["Registration city"].fillna(df_n2["Registration city"].mode()[0])

In [32]:
df_n2.dropna(inplace=True)

In [14]:
df_n["Price"] = df_n["Price"].str.replace(",","")
df_n["KM's driven"] = df_n["KM's driven"].str.replace(",","")
l = ["KM's driven","Price","Year"]
for col in l:
    df_n[col] = df_n[col].astype("int")

ValueError: cannot convert float NaN to integer

In [37]:
df_n.drop("Price (pkr)", axis=1, inplace=True)

In [38]:
df_n.dtypes

Ad ID                object
Car Name             object
Make                 object
Model                object
Year                  int32
KM's driven           int32
Price                 int32
Fuel                 object
Registration city    object
Car documents        object
Assembly             object
Transmission         object
Condition            object
Seller Location      object
Description          object
Car Features         object
Images URL's         object
Page URL             object
dtype: object

In [39]:
df_n.head(2)

Unnamed: 0,Ad ID,Car Name,Make,Model,Year,KM's driven,Price,Fuel,Registration city,Car documents,Assembly,Transmission,Condition,Seller Location,Description,Car Features,Images URL's,Page URL
0,1080928834,TOYOTA YARIS /1.3 2021 / ATIV CVT / Registrati...,Toyota,Yaris,2021,41000,4450000,Petrol,Lahore,Original,Local,Automatic,Used,"Ferozepur Road, Lahore",Toyota Yaris ATIV CVT 2021\nModel 2021\nKM's d...,"ABS, Air Bags, Air Conditioning, Alloy Rims, F...",[https://images.olx.com.pk/thumbnails/41504543...,https://www.olx.com.pk/item/toyota-yaris-ativ-...
1,1080901854,Very Good condition,Toyota,Yaris,2021,38000,4150000,Petrol,Lahore,Original,Local,Automatic,Used,"Cavalry Ground, Lahore","untouch new car, local used","Air Bags, Air Conditioning, AM/FM Radio, CD Pl...",[https://images.olx.com.pk/thumbnails/41488235...,https://www.olx.com.pk/item/very-good-conditio...


In [40]:
df_n.tail(22)

Unnamed: 0,Ad ID,Car Name,Make,Model,Year,KM's driven,Price,Fuel,Registration city,Car documents,Assembly,Transmission,Condition,Seller Location,Description,Car Features,Images URL's,Page URL
192,1080230566,Yaris Toyota,Toyota,Yaris,2022,55000,3890000,Petrol,Punjab,Original,Imported,Manual,Used,"Bagarian, Lahore",sim call / 0312/44/74/923\nYaris Toyota Good d...,"ABS, Air Bags, Air Conditioning, AM/FM Radio, ...",[https://images.olx.com.pk/thumbnails/41087314...,https://www.olx.com.pk/item/yaris-toyota-iid-1...
193,1080229345,Toyota Yaris Atvi 0 3 4 2 4 5 0 0 0 1 7,Toyota,Yaris,2022,54000,4055000,Petrol,Karachi,Original,Local,Manual,Used,"Kassowal, Chichawatni",toyota Yaris atvi total orgnal car non accide...,"ABS, Air Bags, Air Conditioning, Alloy Rims, N...",[https://images.olx.com.pk/thumbnails/41086540...,https://www.olx.com.pk/item/toyota-yaris-atvi-...
194,1080223633,Alto Model 2010 fully Genuine,Toyota,Yaris,2010,100000,1000000,Petrol,Sindh,Original,Local,Manual,Used,"Latifabad, Hyderabad",Alto vxr model 2010 full genuine engine suspen...,"Alloy Rims, AM/FM Radio, CD Player",[https://images.olx.com.pk/thumbnails/41082691...,https://www.olx.com.pk/item/alto-model-2010-fu...
195,1080206092,Toyota Yaris 1.5,Toyota,Yaris,2021,52000,4700000,Petrol,Punjab,Original,Local,Automatic,Used,"Chak 82/6R, Sahiwal",Toyota Yaris,"ABS, Air Bags, Air Conditioning, Alloy Rims",[https://images.olx.com.pk/thumbnails/41072738...,https://www.olx.com.pk/item/toyota-yaris-15-ii...
196,1080195872,Toyota yaris Mt 2020,Toyota,Yaris,2020,35000,3750000,Petrol,Karachi,Original,Local,Manual,Used,"Luddan Road, Burewala",35000 drive. bumper to bamper Junein car. neat...,"ABS, Air Bags, Air Conditioning, CD Player, Po...",[https://images.olx.com.pk/thumbnails/41066503...,https://www.olx.com.pk/item/toyota-yaris-mt-20...
197,1080195710,toyota Yaris Gli,Toyota,Yaris,2020,40000,3575000,Petrol,Islamabad,Original,Local,Manual,Used,"Saddar, Rawalpindi",Full urgent sale cash required \nIslamabad num...,"ABS, Air Bags, Air Conditioning, AM/FM Radio, ...",[https://images.olx.com.pk/thumbnails/41066415...,https://www.olx.com.pk/item/toyota-yaris-gli-i...
198,1080189745,Toyota Yaris Ativ 1.5 X 2021,Toyota,Yaris,2021,26000,4275000,Petrol,Punjab,Original,Local,Automatic,Used,"Garden Town, Lahore",Toyota Yaris Ativ 1.5 X 2021\nFull option\nAll...,"ABS, Air Bags, Air Conditioning, Alloy Rims, A...",[https://images.olx.com.pk/thumbnails/41062716...,https://www.olx.com.pk/item/toyota-yaris-ativ-...
199,1080175651,Toyota Yaris Urgent sale,Toyota,Yaris,2020,47000,4400000,Petrol,Islamabad,Original,Local,Automatic,Used,"Adiala Road, Rawalpindi",For Sale: Toyota Yaris 2020\nImmaculate condit...,"ABS, Air Bags, Air Conditioning, Alloy Rims, A...",[https://images.olx.com.pk/thumbnails/41054757...,https://www.olx.com.pk/item/toyota-yaris-iid-1...
200,1080174836,Toyota Yaris Ativ X 1.5 CVT,Toyota,Yaris,2022,23000,5000000,Petrol,Punjab,Original,Local,Automatic,Used,"Royal Orchard, Multan",toyota yaris top of the line variant \nmodel 2...,"ABS, Air Bags, Air Conditioning, Alloy Rims, A...",[https://images.olx.com.pk/thumbnails/41054285...,https://www.olx.com.pk/item/toyota-yaris-ativ-...
201,1080126666,Toyota Yaris ATIV X CVT 1.5,Toyota,Yaris,2020,28000,4550000,Petrol,Islamabad,Original,Local,Automatic,Used,"G-10, Islamabad",bumper to bumper genuine inside out fully orig...,"ABS, Air Bags, Air Conditioning, Alloy Rims, A...",[https://images.olx.com.pk/thumbnails/41025695...,https://www.olx.com.pk/item/toyota-yaris-ativ-...


In [41]:
df_n.shape

(214, 18)

In [42]:
x = df_n.drop("Images URL's", axis=1)
x.duplicated().sum()

25

In [43]:
df_n.to_csv("CollectingDatasets/Toyota/Yaris.csv", index=False)