In [8]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Base URL of the Kia used-car search results.
# NOTE(review): the string contains no `{}` replacement field, so calling
# `base_url.format(page_number)` returns it unchanged and every page request
# hits the same URL. TODO: confirm the site's paging parameter and add a
# placeholder for it.
base_url = 'https://www.cars24.com/buy-used-car?f=make%3A%3D%3Akia&sort=bestmatch&serveWarrantyCount=true&listingSource=Search_LP&storeCityId=2378'


# Helper: text of the first matching descendant, or None when it is missing
def _text_or_none(node, tag, attrs=None):
    """Return the stripped text of the first `tag` under `node`, or None if absent."""
    match = node.find(tag, attrs) if attrs is not None else node.find(tag)
    return match.get_text(strip=True) if match is not None else None


# Function to fetch car details from a single page
def fetch_page(page_number):
    """Download one results page and extract the listed cars.

    The request URL is ``base_url.format(page_number)``; if ``base_url`` has no
    ``{}`` replacement field, the page number has no effect on the URL.

    Parameters
    ----------
    page_number : int
        Value substituted into ``base_url``.

    Returns
    -------
    list of dict
        One dict per listing card found on the page. Every dict has exactly
        the keys 'Title', 'Details', 'Price' and 'Location'; a value is None
        when the corresponding element is not present in the card.

    Raises
    ------
    requests.exceptions.RequestException
        If the HTTP request fails or times out.
    """
    url = base_url.format(page_number)
    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.content, 'html.parser')
    # Listing cards; the class names are generated by the site and may change.
    results = soup.find_all('div', {'class': '_7jb8Q _1Ey60'})  # Adjust this selector based on actual HTML

    cars = []
    for result in results:
        # Each field falls back to None under its OWN key, so all records
        # share the same four keys even when an element is missing.
        cars.append({
            'Title': _text_or_none(result, 'h3'),
            'Details': _text_or_none(result, 'ul', {'class': '_3jRcd'}),
            'Price': _text_or_none(result, 'strong', {'class': '_37WXy'}),
            'Location': _text_or_none(result, 'p', {'class': '_2rxhF'}),
        })
    return cars

# Accumulate the listings returned for pages 1 through 5
all_cars = []
for page_number in range(1, 6):  # Adjust range as needed
    all_cars += fetch_page(page_number)

# Build a DataFrame with one row per scraped listing
df = pd.DataFrame(all_cars)

# Show the first few rows as plain text
print(df.head())


                            Title                                Details  \
0      2022 KIA SELTOSGTX (O) 1.4  Reg. serviced23,252 kmPetrol1st owner   
1  2020 KIA SONETGTX PLUS 1.0 IMT        Sunroof35,415 kmPetrol1st owner   
2      2022 KIA SONETGTX PLUS 1.5  Reg. serviced28,326 kmDiesel1st owner   
3     2020 KIA SELTOSHTK PLUS 1.5   Alloy wheels48,880 kmPetrol1st owner   
4     2020 KIA SELTOSHTK PLUS 1.5   Alloy wheels15,160 kmPetrol1st owner   

     Price                                      Location  
0  ₹14.60L     Free Test DriveTomorrowatGoregaon, Mumbai  
1   ₹9.94L     Free Test DriveTomorrowatGoregaon, Mumbai  
2  ₹11.99L     Free Test DriveTomorrowatGoregaon, Mumbai  
3   ₹9.90L  Free Test DriveTomorrowatMulund West, Mumbai  
4  ₹10.79L     Free Test DriveTomorrowatGoregaon, Mumbai  


In [9]:
# (rows, columns) of the scraped listings frame
df.shape

(50, 4)

In [10]:
# Rows whose Title mentions "KIA" (case-insensitive); missing titles count as no match
is_kia = df['Title'].str.contains('KIA', case=False, na=False)
Kia_cars = df.loc[is_kia]

# Show the first few filtered rows as plain text
print(Kia_cars.head())



                            Title                                Details  \
0      2022 KIA SELTOSGTX (O) 1.4  Reg. serviced23,252 kmPetrol1st owner   
1  2020 KIA SONETGTX PLUS 1.0 IMT        Sunroof35,415 kmPetrol1st owner   
2      2022 KIA SONETGTX PLUS 1.5  Reg. serviced28,326 kmDiesel1st owner   
3     2020 KIA SELTOSHTK PLUS 1.5   Alloy wheels48,880 kmPetrol1st owner   
4     2020 KIA SELTOSHTK PLUS 1.5   Alloy wheels15,160 kmPetrol1st owner   

     Price                                      Location  
0  ₹14.60L     Free Test DriveTomorrowatGoregaon, Mumbai  
1   ₹9.94L     Free Test DriveTomorrowatGoregaon, Mumbai  
2  ₹11.99L     Free Test DriveTomorrowatGoregaon, Mumbai  
3   ₹9.90L  Free Test DriveTomorrowatMulund West, Mumbai  
4  ₹10.79L     Free Test DriveTomorrowatGoregaon, Mumbai  


In [11]:
# Write the filtered listings to 'Kia_cars.csv' without the index column
Kia_cars.to_csv('Kia_cars.csv', index=False)

# Confirm that the file has been written
print("Kia cars data has been saved to 'Kia_cars.csv'.")

Kia cars data has been saved to 'Kia_cars.csv'.


In [12]:
# Read the saved CSV back in. Note that this rebinds `df`, which previously
# held the scraped listings, to the frame loaded from disk.
df = pd.read_csv('Kia_cars.csv')
df

Unnamed: 0,Title,Details,Price,Location
0,2022 KIA SELTOSGTX (O) 1.4,"Reg. serviced23,252 kmPetrol1st owner",₹14.60L,"Free Test DriveTomorrowatGoregaon, Mumbai"
1,2020 KIA SONETGTX PLUS 1.0 IMT,"Sunroof35,415 kmPetrol1st owner",₹9.94L,"Free Test DriveTomorrowatGoregaon, Mumbai"
2,2022 KIA SONETGTX PLUS 1.5,"Reg. serviced28,326 kmDiesel1st owner",₹11.99L,"Free Test DriveTomorrowatGoregaon, Mumbai"
3,2020 KIA SELTOSHTK PLUS 1.5,"Alloy wheels48,880 kmPetrol1st owner",₹9.90L,"Free Test DriveTomorrowatMulund West, Mumbai"
4,2020 KIA SELTOSHTK PLUS 1.5,"Alloy wheels15,160 kmPetrol1st owner",₹10.79L,"Free Test DriveTomorrowatGoregaon, Mumbai"
5,2020 KIA SELTOSHTX PLUS AT1.5 DIESEL,"Top Model45,979 kmDiesel1st owner",₹13.79L,"Free Test DriveTomorrowatGoregaon, Mumbai"
6,2020 KIA SELTOSGTX PLUS DCT 1.4 PETROL,"Sunroof65,200 kmPetrol1st owner",₹13.21L,"Free Test DriveTomorrowatMulund West, Mumbai"
7,2020 KIA SELTOSGTX PLUS DCT 1.4 PETROL,"Sunroof35,055 kmPetrol1st owner",₹14.44L,"Free Test DriveTomorrowatGoregaon, Mumbai"
8,2021 KIA SELTOSGTX PLUS DCT 1.4 PETROL,"Sunroof33,374 kmPetrol2nd owner",₹14.13L,"Free Test DriveTomorrowatGoregaon, Mumbai"
9,2020 KIA SONETHTX PLUS 1.0 IMT,"Sunroof36,460 kmPetrol1st owner",₹9.95L,"Free Test DriveTomorrowatGoregaon, Mumbai"


In [13]:
# NOTE(review): this expression is not the last statement of the cell, so the
# notebook does not display its result.
df.head(10)

import pandas as pd

# Path of the CSV written earlier in this notebook
file_path = 'Kia_cars.csv'

# Columns to load from the CSV (passed to read_csv as `usecols` below)
cols = ['Title', 'Details', 'Price', 'Location']

# Function to process each chunk
def process_chunk(chunk):
    """Derive structured columns from the raw 'Title' and 'Details' text.

    Parameters
    ----------
    chunk : pandas.DataFrame
        Must contain string columns 'Title' and 'Details'. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame in which 'Title' is replaced by 'Name' (the title with a
        leading four-digit year removed) and the columns 'Year',
        'Kilometers Driven', 'Fuel Type' and 'Owner' are appended. Values that
        cannot be extracted are NaN; extracted values are kept as strings.
    """
    title = chunk['Title']
    details = chunk['Details']

    derived = {
        # First four-digit run in the title
        'Year': title.str.extract(r'(\d{4})', expand=False),
        # Whole digit/comma run before "km": accepts "35,415", lakh-style
        # "1,23,456" and unformatted "1234" without dropping leading digits
        'Kilometers Driven': details.str.extract(r'(\d[\d,]*)\s*km', expand=False),
        'Fuel Type': details.str.extract(r'(Petrol|Diesel)', expand=False),
        # Any ordinal owner count ("1st owner", "2nd owner", "4th owner", ...)
        'Owner': details.str.extract(r'(\d+(?:st|nd|rd|th) owner)', expand=False),
        # Title without its leading year; renamed to 'Name' below
        'Title': title.str.replace(r'^\d{4}\s*', '', regex=True).str.strip(),
    }

    # assign/rename build a new frame, so the caller's chunk is left untouched
    return chunk.assign(**derived).rename(columns={'Title': 'Name'})

# Read the CSV in chunks of 10000 rows and process each chunk as it arrives
processed_chunks = [
    process_chunk(chunk)
    for chunk in pd.read_csv(file_path, usecols=cols, chunksize=10000)
]

# Stitch the processed chunks into one frame with a fresh 0..n-1 index
final_df = pd.concat(processed_chunks, ignore_index=True)

# Write the result to disk without the index column
final_df.to_csv('Kia_cars_data.csv', index=False)

print("Data processing complete and saved to 'Kia_cars_data.csv'.")



Data processing complete and saved to 'Kia_cars_data.csv'.
