In [8]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Cars24 used-car listing URL filtered to make=hyundai for storeCityId=2378.
# NOTE(review): the string contains no `{}` placeholder, so `base_url.format(page_number)`
# returns it unchanged — confirm the site's pagination parameter before relying on multi-page fetches.
base_url = 'https://www.cars24.com/buy-used-car?f=make%3A%3D%3Ahyundai&sort=bestmatch&serveWarrantyCount=true&search=Hyundai&listingSource=Search_HP&storeCityId=2378'


# Function to fetch car details from a single page
def fetch_page(page_number):
    """Download one listing page and return its car cards as a list of dicts.

    Args:
        page_number: Value substituted into ``base_url`` via ``str.format``.

    Returns:
        A list with one dict per matched card. Every dict always carries the
        keys ``'Title'``, ``'Details'``, ``'Price'`` and ``'Location'``; a value
        is ``None`` when the corresponding element is not found in the card.

    Raises:
        requests.exceptions.RequestException: if the HTTP request fails or
            does not complete within the timeout.
    """
    # NOTE(review): base_url as written has no `{}` placeholder, so format() is a
    # no-op and every page_number requests the same URL — confirm how the site paginates.
    url = base_url.format(page_number)
    # A timeout keeps the notebook from hanging forever on a stalled connection.
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.content, 'html.parser')
    # The class names look auto-generated; adjust these selectors if the site's HTML changes.
    results = soup.find_all('div', {'class': '_7jb8Q _1Ey60'})

    # Output field -> (tag name, attribute filter) used to locate it inside a card.
    selectors = {
        'Title': ('h3', {}),
        'Details': ('ul', {'class': '_3jRcd'}),
        'Price': ('strong', {'class': '_37WXy'}),
        'Location': ('p', {'class': '_2rxhF'}),
    }

    cars = []
    for result in results:
        car_details = {}
        for field, (tag, attrs) in selectors.items():
            node = result.find(tag, attrs)
            # Record None under the field's OWN key when the element is absent, so a
            # missing 'Details' or 'Price' no longer writes to a different column.
            car_details[field] = node.get_text(strip=True) if node is not None else None
        cars.append(car_details)
    return cars

# Gather the cards from pages 1 through 5 into one flat list (widen the range to fetch more)
all_cars = []
for page_number in range(1, 6):
    page_cars = fetch_page(page_number)
    all_cars += page_cars

# One row per scraped card
df = pd.DataFrame(all_cars)

# Preview the first five rows
print(df.head())


                                     Title  \
0       2019 Hyundai Elite i20ASTA 1.2 (O)   
1  2016 Hyundai CretaSX PLUS AT 1.6 DIESEL   
2             2016 Hyundai i20 Active1.2 S   
3       2022 Hyundai NEW I20Sportz 1.2 IVT   
4       2021 Hyundai VENUESX 1.0 (O) TURBO   

                                  Details   Price  \
0       Top Model41,049 kmPetrol2nd owner  ₹5.80L   
1       Top Model33,417 kmDiesel2nd owner  ₹8.25L   
2    Alloy wheels66,357 kmPetrol2nd owner  ₹4.65L   
3    Reg. serviced8,330 kmPetrol1st owner  ₹8.76L   
4  Fancy reg. no.23,955 kmPetrol2nd owner  ₹8.87L   

                                       Location  
0     Free Test DriveTomorrowatGoregaon, Mumbai  
1     Free Test DriveTomorrowatGoregaon, Mumbai  
2  Free Test DriveTomorrowatMulund West, Mumbai  
3     Free Test DriveTomorrowatGoregaon, Mumbai  
4              Free Test DriveTomorrowatSeawood  


In [9]:
# Shape of the scraped listings frame as (rows, columns)
df.shape

(100, 4)

In [10]:
# Boolean mask: True where the title mentions "Hyundai" in any letter case;
# rows with a missing title count as False.
hyundai_mask = df['Title'].str.contains('Hyundai', case=False, na=False)
Hyundai_cars = df[hyundai_mask]

# Preview the filtered rows
print(Hyundai_cars.head())



                                     Title  \
0       2019 Hyundai Elite i20ASTA 1.2 (O)   
1  2016 Hyundai CretaSX PLUS AT 1.6 DIESEL   
2             2016 Hyundai i20 Active1.2 S   
3       2022 Hyundai NEW I20Sportz 1.2 IVT   
4       2021 Hyundai VENUESX 1.0 (O) TURBO   

                                  Details   Price  \
0       Top Model41,049 kmPetrol2nd owner  ₹5.80L   
1       Top Model33,417 kmDiesel2nd owner  ₹8.25L   
2    Alloy wheels66,357 kmPetrol2nd owner  ₹4.65L   
3    Reg. serviced8,330 kmPetrol1st owner  ₹8.76L   
4  Fancy reg. no.23,955 kmPetrol2nd owner  ₹8.87L   

                                       Location  
0     Free Test DriveTomorrowatGoregaon, Mumbai  
1     Free Test DriveTomorrowatGoregaon, Mumbai  
2  Free Test DriveTomorrowatMulund West, Mumbai  
3     Free Test DriveTomorrowatGoregaon, Mumbai  
4              Free Test DriveTomorrowatSeawood  


In [11]:
# Write the Hyundai-only rows to CSV; index=False keeps the DataFrame index out of the file
Hyundai_cars.to_csv('Hyundai_cars.csv', index=False)

# Optionally, display a message confirming that the file has been saved
print("Hyundai cars data has been saved to 'Hyundai_cars.csv'.")

Hyundai cars data has been saved to 'Hyundai_cars.csv'.


In [12]:
# Reload the exported CSV to inspect what was written.
# Note: this rebinds `df`, replacing the frame that was built from the scraped pages.
df = pd.read_csv('Hyundai_cars.csv')
df

Unnamed: 0,Title,Details,Price,Location
0,2019 Hyundai Elite i20ASTA 1.2 (O),"Top Model41,049 kmPetrol2nd owner",₹5.80L,"Free Test DriveTomorrowatGoregaon, Mumbai"
1,2016 Hyundai CretaSX PLUS AT 1.6 DIESEL,"Top Model33,417 kmDiesel2nd owner",₹8.25L,"Free Test DriveTomorrowatGoregaon, Mumbai"
2,2016 Hyundai i20 Active1.2 S,"Alloy wheels66,357 kmPetrol2nd owner",₹4.65L,"Free Test DriveTomorrowatMulund West, Mumbai"
3,2022 Hyundai NEW I20Sportz 1.2 IVT,"Reg. serviced8,330 kmPetrol1st owner",₹8.76L,"Free Test DriveTomorrowatGoregaon, Mumbai"
4,2021 Hyundai VENUESX 1.0 (O) TURBO,"Fancy reg. no.23,955 kmPetrol2nd owner",₹8.87L,Free Test DriveTomorrowatSeawood
...,...,...,...,...
95,2017 Hyundai Elite i20SPORTZ 1.2,"Safety specs.19,288 kmPetrol1st owner",₹5.34L,"Free Test DriveTomorrowatGoregaon, Mumbai"
96,2015 Hyundai i20 Active1.4 SX,"Top Model77,631 kmDiesel2nd owner",₹5.01L,"Free Test DriveTomorrowatGoregaon, Mumbai"
97,2016 Hyundai CretaSX PLUS 1.6 PETROL,"Top Model71,684 kmPetrol1st owner",₹6.91L,"Free Test DriveTomorrowatGoregaon, Mumbai"
98,2015 Hyundai XcentS 1.2,"City driven65,269 kmPetrol2nd owner",₹3.38L,"Free Test DriveTomorrowatGoregaon, Mumbai"


In [13]:
# NOTE(review): if the statements below belong to the same cell, this expression is not the
# cell's last line and so displays nothing — confirm whether it should be its own cell.
df.head(10)

import pandas as pd

# CSV written by the earlier export step
file_path = 'Hyundai_cars.csv'

# Columns to load from the CSV; any column not listed here is not read
cols = ['Title', 'Details']

# Function to process each chunk
def process_chunk(chunk):
    """Derive structured columns from the free-text 'Title' and 'Details' columns.

    Args:
        chunk: DataFrame containing string columns 'Title' and 'Details'.
            It is not modified; the work is done on a copy.

    Returns:
        A new DataFrame in which 'Title' has its leading four-digit year removed
        and is renamed to 'Name', with these string columns appended:
        'Year', 'Kilometers Driven', 'Fuel Type' and 'Owner'. A value is NaN
        where the corresponding pattern is not found.
    """
    # Work on a copy so the caller's frame is left untouched (and so that pandas
    # does not warn when the input is a slice of another frame).
    out = chunk.copy()
    title = out['Title']
    details = out['Details']

    # First run of four digits anywhere in the title.
    out['Year'] = title.str.extract(r'(\d{4})', expand=False)

    # Digits with optional comma groups of any width right before "km", so both
    # "41,049 km" and lakh-style "1,00,049 km" are captured in full.
    out['Kilometers Driven'] = details.str.extract(r'(\d+(?:,\d+)*)\s*km', expand=False)

    out['Fuel Type'] = details.str.extract(r'(Petrol|Diesel)', expand=False)

    # Any ordinal ("1st", "2nd", "3rd", "4th", ...) followed by " owner".
    out['Owner'] = details.str.extract(r'(\d+(?:st|nd|rd|th) owner)', expand=False)

    # Strip a leading year from the title, then expose the column as 'Name'.
    out['Title'] = title.str.replace(r'^\d{4}\s*', '', regex=True).str.strip()
    return out.rename(columns={'Title': 'Name'})

# Accumulates the processed pieces of the CSV
processed_chunks = []

# Stream the file in pieces of up to 10000 rows, loading only the selected columns
reader = pd.read_csv(file_path, usecols=cols, chunksize=10000)
for chunk in reader:
    processed_chunk = process_chunk(chunk)
    processed_chunks += [processed_chunk]

# Stitch the pieces back together with a fresh 0..n-1 index
final_df = pd.concat(processed_chunks, ignore_index=True)

# Persist the result without the index column
final_df.to_csv('Hyundai_cars_data.csv', index=False)

print("Data processing complete and saved to 'Hyundai_cars_data.csv'.")



Data processing complete and saved to 'Hyundai_cars_data.csv'.
