In [4]:
%pip install selenium

Collecting selenium
  Downloading selenium-4.34.0-py3-none-any.whl.metadata (7.5 kB)
Collecting urllib3~=2.4.0 (from urllib3[socks]~=2.4.0->selenium)
  Downloading urllib3-2.4.0-py3-none-any.whl.metadata (6.5 kB)
Collecting trio~=0.30.0 (from selenium)
  Downloading trio-0.30.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.12.2 (from selenium)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting certifi>=2025.4.26 (from selenium)
  Downloading certifi-2025.6.15-py3-none-any.whl.metadata (2.4 kB)
Collecting typing_extensions~=4.14.0 (from selenium)
  Downloading typing_extensions-4.14.1-py3-none-any.whl.metadata (3.0 kB)
Collecting attrs>=23.2.0 (from trio~=0.30.0->selenium)
  Downloading attrs-25.3.0-py3-none-any.whl.metadata (10 kB)
Collecting outcome (from trio~=0.30.0->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.12.2->selenium)
  Downloading wsproto-

In [5]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.keys import Keys
import pandas as pd
import time

service = Service()  # assumes chromedriver is in PATH
driver = webdriver.Chrome(service=service)

url = "https://www.cars24.com/buy-used-car/?sort=bestmatch&serveWarrantyCount=true"
driver.get(url)
time.sleep(3)

start_time = time.time()
max_duration = 600   # ⏱️ stop after ~90 seconds if not finished
scroll_pause = 2    # seconds to wait after each scroll
last_height = driver.execute_script("return document.body.scrollHeight")

print("📜 Starting dynamic scroll…")

while True:
    # Scroll to bottom
    driver.find_element(By.TAG_NAME, "body").send_keys(Keys.END)
    time.sleep(scroll_pause)

    # Check if new content loaded
    new_height = driver.execute_script("return document.body.scrollHeight")


    if time.time() - start_time > max_duration:
        print("⏳ Timeout reached.")
        break

print("🔍 Extracting car data…")

cars = driver.find_elements(By.CSS_SELECTOR, "div.styles_normalCardWrapper__qDZjq")

print(f"🚗 Found {len(cars)} cars.")

data = []

for car in cars:
    try:
        brand = car.find_element(By.CSS_SELECTOR, "span.sc-braxZu.kjFjan").text.strip()
        model = car.find_element(By.CSS_SELECTOR, "span.sc-braxZu.fccwQo").text.strip()
        price = car.find_element(By.CSS_SELECTOR, "p.sc-braxZu.cyPhJl").text.strip()
        details = car.find_elements(By.CSS_SELECTOR, "p.sc-braxZu.kvfdZL")
        kms_driven = details[0].text.strip()
        fuel_type= details[1].text.strip()
        transmission=details[2].text.strip()
        ownership=details[3].text.strip()
        rating=details[4].text.strip()
        state = car.find_element(By.CSS_SELECTOR, "p.sc-braxZu.lmmumg").text.strip()

        data.append({
            "Brand": brand,
            "Model": model,
            "Price": price,
            "Kms_driven": kms_driven, "Fuel_type" : fuel_type, "Transmission": transmission, "Ownership": ownership,
            "Rating": rating,
            "State": state,
        })

    except Exception as e:
        print("⚠️ Error extracting car:", e)

driver.quit()

# Save DataFrame
if data:
    df = pd.DataFrame(data)
    df.to_csv("cars24_scraped_data.csv", index=False)
    print(f"✅ Data saved to cars24_scraped_data.csv with {len(df)} rows.")
    print(df.head())
else:
    print("❌ No data scraped!")


📜 Starting dynamic scroll…
⏳ Timeout reached.
🔍 Extracting car data…
🚗 Found 1820 cars.
✅ Data saved to cars24_scraped_data.csv with 1820 rows.
                  Brand            Model        Price Kms_driven Fuel_type  \
0      2011 Maruti Ritz              VXI   ₹1.49 lakh  92.76k km    Petrol   
1  2022 Mahindra XUV700  AX 5 P AT 5 STR  ₹12.70 lakh  16.75k km    Petrol   
2  2014 Maruti Alto 800              VXI   ₹2.30 lakh  98.33k km    Petrol   
3  2018 Maruti Alto K10              VXI   ₹2.87 lakh  42.03k km    Petrol   
4       2020 Tata NEXON        XM PETROL   ₹6.00 lakh  50.50k km    Petrol   

  Transmission  Ownership           Rating State  
0       Manual  1st owner   Verified Owner        
1         Auto  2nd owner                         
2       Manual  2nd owner  Verified Dealer        
3       Manual  1st owner   CARS24 Assured        
4       Manual  1st owner   Verified Owner        


In [None]:
#now come the cleaning part

In [2]:
import pandas as pd
import numpy as np

df=pd.read_csv(r"C:\Users\wwwja\OneDrive\Documents\cars24_scraped_data.csv")
data=df.copy()
data.head()

Unnamed: 0,Brand,Model,Price,Kms_driven,Fuel_type,Transmission,Ownership,Rating,State
0,2011 Maruti Ritz,VXI,₹1.49 lakh,92.76k km,Petrol,Manual,1st owner,Verified Owner,
1,2022 Mahindra XUV700,AX 5 P AT 5 STR,₹12.70 lakh,16.75k km,Petrol,Auto,2nd owner,,
2,2014 Maruti Alto 800,VXI,₹2.30 lakh,98.33k km,Petrol,Manual,2nd owner,Verified Dealer,
3,2018 Maruti Alto K10,VXI,₹2.87 lakh,42.03k km,Petrol,Manual,1st owner,CARS24 Assured,
4,2020 Tata NEXON,XM PETROL,₹6.00 lakh,50.50k km,Petrol,Manual,1st owner,Verified Owner,


In [3]:
a=data["Brand"].str.split(" ")
data["Company"]=a.str[1]
data["Modell"]=a.str[2]
data["Year"]=a.str[0].astype(int)

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1820 entries, 0 to 1819
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Brand         1820 non-null   object
 1   Model         1544 non-null   object
 2   Price         1820 non-null   object
 3   Kms_driven    1820 non-null   object
 4   Fuel_type     1820 non-null   object
 5   Transmission  1820 non-null   object
 6   Ownership     1820 non-null   object
 7   Rating        1814 non-null   object
 8   State         1799 non-null   object
 9   Company       1820 non-null   object
 10  Modell        1820 non-null   object
 11  Year          1820 non-null   int32 
dtypes: int32(1), object(11)
memory usage: 163.6+ KB


In [4]:
data["Model"].unique().shape


(518,)

In [5]:
#now remove the price errors
data["Price"]=data["Price"].str.split(" ").str[0].str.replace("₹", "").str.replace(".", "").astype(int)*1000
data.head()

Unnamed: 0,Brand,Model,Price,Kms_driven,Fuel_type,Transmission,Ownership,Rating,State,Company,Modell,Year
0,2011 Maruti Ritz,VXI,149000,92.76k km,Petrol,Manual,1st owner,Verified Owner,,Maruti,Ritz,2011
1,2022 Mahindra XUV700,AX 5 P AT 5 STR,1270000,16.75k km,Petrol,Auto,2nd owner,,,Mahindra,XUV700,2022
2,2014 Maruti Alto 800,VXI,230000,98.33k km,Petrol,Manual,2nd owner,Verified Dealer,,Maruti,Alto,2014
3,2018 Maruti Alto K10,VXI,287000,42.03k km,Petrol,Manual,1st owner,CARS24 Assured,,Maruti,Alto,2018
4,2020 Tata NEXON,XM PETROL,600000,50.50k km,Petrol,Manual,1st owner,Verified Owner,,Tata,NEXON,2020


In [6]:
data["Kms_driven"]=data["Kms_driven"].str.split(" ").str[0].str.replace("k", "0").str.replace("L", "0000").str.replace(".", "").astype(int)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1820 entries, 0 to 1819
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Brand         1820 non-null   object
 1   Model         1544 non-null   object
 2   Price         1820 non-null   int32 
 3   Kms_driven    1820 non-null   int32 
 4   Fuel_type     1820 non-null   object
 5   Transmission  1820 non-null   object
 6   Ownership     1820 non-null   object
 7   Rating        1814 non-null   object
 8   State         1799 non-null   object
 9   Company       1820 non-null   object
 10  Modell        1820 non-null   object
 11  Year          1820 non-null   int32 
dtypes: int32(3), object(9)
memory usage: 149.4+ KB


In [7]:
# to check the uniqueness of ownership and converting it to label encoding for ml
data["OwnershipLE"]=data["Ownership"].str[0].astype(int)
data.info()
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1820 entries, 0 to 1819
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Brand         1820 non-null   object
 1   Model         1544 non-null   object
 2   Price         1820 non-null   int32 
 3   Kms_driven    1820 non-null   int32 
 4   Fuel_type     1820 non-null   object
 5   Transmission  1820 non-null   object
 6   Ownership     1820 non-null   object
 7   Rating        1814 non-null   object
 8   State         1799 non-null   object
 9   Company       1820 non-null   object
 10  Modell        1820 non-null   object
 11  Year          1820 non-null   int32 
 12  OwnershipLE   1820 non-null   int32 
dtypes: int32(4), object(9)
memory usage: 156.5+ KB


Unnamed: 0,Brand,Model,Price,Kms_driven,Fuel_type,Transmission,Ownership,Rating,State,Company,Modell,Year,OwnershipLE
0,2011 Maruti Ritz,VXI,149000,92760,Petrol,Manual,1st owner,Verified Owner,,Maruti,Ritz,2011,1
1,2022 Mahindra XUV700,AX 5 P AT 5 STR,1270000,16750,Petrol,Auto,2nd owner,,,Mahindra,XUV700,2022,2
2,2014 Maruti Alto 800,VXI,230000,98330,Petrol,Manual,2nd owner,Verified Dealer,,Maruti,Alto,2014,2
3,2018 Maruti Alto K10,VXI,287000,42030,Petrol,Manual,1st owner,CARS24 Assured,,Maruti,Alto,2018,1
4,2020 Tata NEXON,XM PETROL,600000,50500,Petrol,Manual,1st owner,Verified Owner,,Tata,NEXON,2020,1


In [8]:
data["Rating"] = data["Rating"].fillna("Unverified")
data["State"] = data["State"].fillna("Unknown")
data["State"]=data["State"].str.split(",").str[-1].str.split(" ").str[-1]
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1820 entries, 0 to 1819
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Brand         1820 non-null   object
 1   Model         1544 non-null   object
 2   Price         1820 non-null   int32 
 3   Kms_driven    1820 non-null   int32 
 4   Fuel_type     1820 non-null   object
 5   Transmission  1820 non-null   object
 6   Ownership     1820 non-null   object
 7   Rating        1820 non-null   object
 8   State         1820 non-null   object
 9   Company       1820 non-null   object
 10  Modell        1820 non-null   object
 11  Year          1820 non-null   int32 
 12  OwnershipLE   1820 non-null   int32 
dtypes: int32(4), object(9)
memory usage: 156.5+ KB


In [9]:
# 1️⃣ Drop the unwanted columns

# 2️⃣ Reorder the columns
desired_order = [
    'Company', 'Modell', 'Model' ,"Year",
    'Price', 'Kms_driven', 'Fuel_type', 'Transmission','Ownership', 'OwnershipLE', 'Rating', 'State'
]

# if all columns exist in the dataframe:
data = data[desired_order]

# If some columns might be missing, you can do this to avoid errors:
data = data[[col for col in desired_order if col in data.columns]]
data = data.rename(columns={'Model': 'Varient'})

# step 2: rename 'modell' → 'model'
data = data.rename(columns={'Modell': 'Model'})
data.head(10)


Unnamed: 0,Company,Model,Varient,Year,Price,Kms_driven,Fuel_type,Transmission,Ownership,OwnershipLE,Rating,State
0,Maruti,Ritz,VXI,2011,149000,92760,Petrol,Manual,1st owner,1,Verified Owner,Unknown
1,Mahindra,XUV700,AX 5 P AT 5 STR,2022,1270000,16750,Petrol,Auto,2nd owner,2,Unverified,Unknown
2,Maruti,Alto,VXI,2014,230000,98330,Petrol,Manual,2nd owner,2,Verified Dealer,Unknown
3,Maruti,Alto,VXI,2018,287000,42030,Petrol,Manual,1st owner,1,CARS24 Assured,Unknown
4,Tata,NEXON,XM PETROL,2020,600000,50500,Petrol,Manual,1st owner,1,Verified Owner,Unknown
5,Renault,Kwid,RXT 0.8,2018,282000,74760,Petrol,Manual,1st owner,1,Unverified,Unknown
6,Tata,PUNCH,PURE MT,2022,547000,41190,Petrol,Manual,1st owner,1,Verified Owner,Unknown
7,Maruti,Alto,LXI,2016,203000,25780,Petrol,Manual,1st owner,1,CARS24 Assured,Unknown
8,Renault,Kwid,CLIMBER AMT 1.0,2023,499000,5710,Petrol,Auto,1st owner,1,Unverified,Unknown
9,Volkswagen,Vento,COMFORTLINE 1.6,2014,325000,49660,Petrol,Manual,1st owner,1,CARS24 Assured,Unknown


In [11]:
from datetime import datetime

data['Car_Age'] = datetime.now().year - data['Year']

# Bucket kms driven
bins = [0, 20000, 50000, 100000, float('inf')]
labels = ['Low', 'Medium', 'High', 'Very High']
data['Kms_Bucket'] = pd.cut(data['Kms_driven'], bins=bins, labels=labels)

# Log-transform skewed columns
data['Price_log'] = np.log1p(data['Price'])
data['Kms_driven_log'] = np.log1p(data['Kms_driven'])


In [12]:
data.to_csv("cleaned_cars_24data.csv", index=False)
data.describe()

Unnamed: 0,Year,Price,Kms_driven,OwnershipLE,Car_Age,Price_log,Kms_driven_log
count,1820.0,1820.0,1820.0,1820.0,1820.0,1820.0,1820.0
mean,2016.532418,426958.2,77332.67,1.612088,8.467582,12.748575,11.019497
std,3.812041,297954.5,71725.21,0.888549,3.812041,0.669805,0.75378
min,2007.0,35000.0,377.0,1.0,0.0,10.463132,5.934894
25%,2014.0,215000.0,43372.5,1.0,6.0,12.278398,10.677604
50%,2016.0,353000.0,70415.0,1.0,9.0,12.774226,11.162176
75%,2019.0,554250.0,95225.0,2.0,11.0,13.225373,11.464008
max,2025.0,2750000.0,2000000.0,7.0,18.0,14.827112,14.508658


In [13]:
data.head()

Unnamed: 0,Company,Model,Varient,Year,Price,Kms_driven,Fuel_type,Transmission,Ownership,OwnershipLE,Rating,State,Car_Age,Kms_Bucket,Price_log,Kms_driven_log
0,Maruti,Ritz,VXI,2011,149000,92760,Petrol,Manual,1st owner,1,Verified Owner,Unknown,14,High,11.911708,11.437782
1,Mahindra,XUV700,AX 5 P AT 5 STR,2022,1270000,16750,Petrol,Auto,2nd owner,2,Unverified,Unknown,3,Low,14.054528,9.726213
2,Maruti,Alto,VXI,2014,230000,98330,Petrol,Manual,2nd owner,2,Verified Dealer,Unknown,11,High,12.345839,11.496095
3,Maruti,Alto,VXI,2018,287000,42030,Petrol,Manual,1st owner,1,CARS24 Assured,Unknown,7,Medium,12.567241,10.646163
4,Tata,NEXON,XM PETROL,2020,600000,50500,Petrol,Manual,1st owner,1,Verified Owner,Unknown,5,High,13.304687,10.829748


In [14]:
y=data["Price_log"]

x=data.drop(columns={"Price", "Ownership", "Varient", "Year", "Kms_driven", "Rating", "State", "Kms_Bucket", "Price_log"})


In [15]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.preprocessing import OneHotEncoder

x_train, x_test, y_train, y_test= train_test_split(x, y, test_size=0.4)

In [16]:
ohe=OneHotEncoder()
ohe.fit(x[["Company", "Model", "Fuel_type", "Transmission"]])

In [19]:
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

column_trans = make_column_transformer(
    (OneHotEncoder(categories=ohe.categories_), ["Company", "Model", "Fuel_type", "Transmission"]),
    remainder="passthrough"
)


In [20]:
lr=LinearRegression()

pipe=make_pipeline(column_trans, lr)
pipe.fit(x_train, y_train)

In [21]:
y_pred=pipe.predict(x_test)

print(r2_score(y_test, y_pred))



0.8429126685386328


In [22]:
scores=[]
for i in range(1000):
    x_train, x_test, y_train, y_test= train_test_split(x,y, test_size=0.2, random_state=i)
    lr=LinearRegression()
    pipe=make_pipeline(column_trans, lr)
    pipe.fit(x_train, y_train)
    y_pred=pipe.predict(x_test)
    scores.append(r2_score(y_test, y_pred))

np.argmax(scores)

343

In [23]:
scores[343]

0.8915981541744225

In [24]:
x_train, x_test, y_train, y_test= train_test_split(x,y, test_size=0.2, random_state=np.argmax(scores))
lr=LinearRegression()
pipe=make_pipeline(column_trans, lr)
pipe.fit(x_train, y_train)
y_pred=pipe.predict(x_test)
r2_score(y_test, y_pred)

0.8915981541744225

In [25]:
x.head()

Unnamed: 0,Company,Model,Fuel_type,Transmission,OwnershipLE,Car_Age,Kms_driven_log
0,Maruti,Ritz,Petrol,Manual,1,14,11.437782
1,Mahindra,XUV700,Petrol,Auto,2,3,9.726213
2,Maruti,Alto,Petrol,Manual,2,11,11.496095
3,Maruti,Alto,Petrol,Manual,1,7,10.646163
4,Tata,NEXON,Petrol,Manual,1,5,10.829748


In [26]:

from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators=100, random_state=42)

pipe = make_pipeline(column_trans, rf)
pipe.fit(x_train, y_train)

y_pred = pipe.predict(x_test)

r2 = r2_score(y_test, y_pred)
print(r2)


0.8473945333221035
