In [None]:

import time
import requests
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
from webdriver_manager.chrome import ChromeDriverManager
import json
import re


In [None]:

class Cars24TataScraper:
    def __init__(self, headless=True, target_count=215):
        """Prepare scraper state, HTTP headers, Chrome options and an HTTP session.

        No browser is started here: ``self.driver`` stays ``None`` until
        ``init_driver`` is called.

        Args:
            headless: When true, Chrome is configured with ``--headless=new``.
            target_count: Maximum number of car records to collect.
        """
        self.target_count = target_count
        self.cars_data = []
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'application/json, text/plain, */*',
            'Accept-Language': 'en-US,en;q=0.9',
            # 'br' is deliberately not advertised: requests/urllib3 can only
            # decode Brotli when an optional brotli package is installed, and
            # an undecoded body would make response.json() fail.
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'Referer': 'https://www.cars24.com/buy-used-tata-cars-mumbai/',
            'Sec-Fetch-Dest': 'empty',
            'Sec-Fetch-Mode': 'cors',
            'Sec-Fetch-Site': 'same-origin'
        }

        # Chrome options shared by every driver created through init_driver().
        self.options = Options()
        if headless:
            self.options.add_argument("--headless=new")
        self.options.add_argument("--disable-gpu")
        self.options.add_argument("--window-size=1920,1080")
        self.options.add_argument("--no-sandbox")
        self.options.add_argument("--disable-dev-shm-usage")
        self.options.add_experimental_option("excludeSwitches", ["enable-automation"])
        self.options.add_experimental_option('useAutomationExtension', False)
        # Keep the browser's user agent identical to the one the HTTP session sends.
        self.options.add_argument(f"user-agent={self.headers['User-Agent']}")

        # The driver is created lazily; the requests session is ready immediately.
        self.driver = None
        self.session = requests.Session()
        self.session.headers.update(self.headers)
    
    def init_driver(self):
        """Start Chrome through webdriver-manager and store it on ``self.driver``.

        After the browser starts, a script is executed that redefines
        ``navigator.webdriver`` to return ``undefined``.

        Returns:
            bool: True when the browser started and the script ran. False on
            any failure, in which case a partially started browser is shut
            down and ``self.driver`` is reset to ``None``.
        """
        driver = None
        try:
            service = Service(ChromeDriverManager().install())
            driver = webdriver.Chrome(service=service, options=self.options)
            driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
        except Exception as e:
            print(f"❌ Failed to initialize driver: {e}")
            # The caller does not quit the driver when this method returns
            # False, so a browser that did start must be closed here.
            if driver is not None:
                try:
                    driver.quit()
                except Exception as quit_error:
                    print(f"   Could not close the browser cleanly: {quit_error}")
            self.driver = None
            return False

        self.driver = driver
        print("✅ Chrome driver initialized")
        return True
    
    def try_api_approach(self):
        """Try to scrape using Cars24 internal API"""
        print("🚀 Trying API approach...")
        
        # Common Cars24 API endpoints (discovered through network analysis)
        api_endpoints = [
            "https://www.cars24.com/api/v1/search",
            "https://api.cars24.com/search/v1/cars",
            "https://www.cars24.com/search/api/v1/cars"
        ]
        
        params = {
            'brand': 'tata',
            'city': 'mumbai',
            'page': 1,
            'limit': 212,
            'sort': 'bestmatch'
        }
        
        for endpoint in api_endpoints:
            try:
                print(f"   Trying: {endpoint}")
                response = self.session.get(endpoint, params=params, timeout=10)
                
                if response.status_code == 200:
                    data = response.json()
                    if 'data' in data or 'cars' in data or 'results' in data:
                        print(f"✅ API endpoint working: {endpoint}")
                        return self.extract_from_api_response(data)
                        
            except Exception as e:
                print(f"   Failed: {str(e)[:212]}...")
                continue
        
        print("❌ API approach failed")
        return []
    
    def extract_from_api_response(self, data):
        """Extract car data from API response"""
        cars = []
        
        # Try different response structures
        car_list = None
        if 'data' in data:
            if isinstance(data['data'], list):
                car_list = data['data']
            elif 'cars' in data['data']:
                car_list = data['data']['cars']
            elif 'results' in data['data']:
                car_list = data['data']['results']
        elif 'cars' in data:
            car_list = data['cars']
        elif 'results' in data:
            car_list = data['results']
        elif isinstance(data, list):
            car_list = data
        
        if not car_list:
            return []
        
        for car in car_list[:self.target_count]:
            try:
                car_data = {
                    "Car Name": self.safe_get(car, ['name', 'title', 'model', 'displayName']),
                    "Year": self.safe_get(car, ['year', 'modelYear', 'registrationYear']),
                    "Kilometers Driven": self.safe_get(car, ['kmDriven', 'kilometers', 'km', 'odometer']),
                    "Fuel Type": self.safe_get(car, ['fuelType', 'fuel', 'fuelVariant']),
                    "Transmission": self.safe_get(car, ['transmission', 'gearbox', 'transmissionType']),
                    "Price": self.safe_get(car, ['price', 'amount', 'finalPrice', 'displayPrice'])
                }
                
                if car_data["Car Name"]:  # Only add if we have a car name
                    cars.append(car_data)
                    
            except Exception as e:
                continue
        
        print(f"✅ Extracted {len(cars)} cars from API")
        return cars
    
    def safe_get(self, data, keys):
        """Safely get value from nested dict using multiple possible keys"""
        for key in keys:
            if isinstance(data, dict) and key in data:
                value = data[key]
                if value is not None:
                    return str(value)
        return ""
    
    def try_selenium_approach(self):
        """Try scraping with Selenium and updated selectors"""
        print("🚀 Trying Selenium approach...")
        
        if not self.init_driver():
            return []
        
        try:
            url = "https://www.cars24.com/buy-used-tata-cars-mumbai/"
            self.driver.get(url)
            time.sleep(5)
            
            # Close popups
            self.close_popups()
            
            # Wait for content to load
            self.wait_for_content()
            
            # Try multiple scrolling and extraction rounds
            cars = []
            for round_num in range(10):  # Try up to 10 rounds
                print(f"   Round {round_num + 1}: Scrolling and extracting...")
                
                # Scroll to load more content
                self.smart_scroll()
                
                # Try different selector strategies
                new_cars = self.extract_cars_current_page()
                cars.extend(new_cars)
                
                # Remove duplicates
                seen = set()
                unique_cars = []
                for car in cars:
                    key = f"{car['Car Name']}{car['Price']}"
                    if key not in seen:
                        seen.add(key)
                        unique_cars.append(car)
                
                cars = unique_cars
                print(f"   Total unique cars: {len(cars)}")
                
                if len(cars) >= self.target_count:
                    break
                    
                if len(new_cars) == 0:
                    print("   No new cars found, trying different approach...")
                    # Try refreshing and different URL
                    alternative_url = "https://www.cars24.com/buy-used-cars-mumbai/?f=make%3Atata"
                    self.driver.get(alternative_url)
                    time.sleep(3)
                    continue
            
            return cars[:self.target_count]
            
        except Exception as e:
            print(f"❌ Selenium approach failed: {e}")
            return []
        finally:
            if self.driver:
                self.driver.quit()
    
    def close_popups(self):
        """Click every visible popup/modal dismiss control, best effort.

        Waits two seconds, then tries a list of locators. Each visible match
        is clicked through JavaScript, followed by a short pause. Failures for
        one locator (invalid match, stale element, ...) never stop the others,
        and nothing is raised to the caller for them.
        """
        time.sleep(2)

        # Text-based matches need XPath: ":contains()" is jQuery syntax, not
        # CSS, and Selenium rejects it as an invalid selector.
        close_locators = [
            (By.CSS_SELECTOR, "button[aria-label='Close']"),
            (By.CSS_SELECTOR, ".close-btn"),
            (By.CSS_SELECTOR, ".close-button"),
            (By.CSS_SELECTOR, ".modal-close"),
            (By.XPATH, "//button[contains(normalize-space(.), 'Accept')]"),
            (By.XPATH, "//button[contains(normalize-space(.), 'Got it')]"),
            (By.CSS_SELECTOR, "[data-testid='close']"),
            (By.CSS_SELECTOR, ".popup-close"),
        ]

        for by, value in close_locators:
            try:
                for elem in self.driver.find_elements(by, value):
                    if elem.is_displayed():
                        # JS click works even when another layer overlaps the button.
                        self.driver.execute_script("arguments[0].click();", elem)
                        time.sleep(0.5)
            except Exception:
                # Closing one popup can detach the remaining matches; move on.
                continue
    
    def wait_for_content(self):
        """Wait up to 10 seconds for any listing-like element to appear.

        All candidate selectors are polled together inside a single wait, so
        the total delay is bounded by one timeout instead of one timeout per
        selector. Never raises: on timeout or any driver error a warning is
        printed and the caller proceeds.
        """
        selectors = [
            "[data-testid*='car']",
            ".car-card", ".listing-card", ".vehicle-card",
            "article", ".result-item",
            "[class*='CarCard']", "[class*='ListingCard']"
        ]

        def first_present(driver):
            # Return the first selector that currently matches something, or
            # False so WebDriverWait keeps polling.
            for selector in selectors:
                if driver.find_elements(By.CSS_SELECTOR, selector):
                    return selector
            return False

        try:
            matched = WebDriverWait(self.driver, 10).until(first_present)
        except Exception:
            print("⚠️ Content loading timeout, proceeding anyway...")
            return

        print(f"✅ Content loaded (found: {matched})")
    
    def smart_scroll(self):
        """Scroll down the page in five steps, clicking "Load More" when shown.

        The page height is read once and the window is scrolled to 1/5 .. 5/5
        of that height with a one-second pause per step. After each step a
        visible "Load More"/"Show More" button, if any, is clicked. A final
        scroll goes to the then-current bottom of the page. Never raises:
        driver errors are reported and swallowed.
        """
        load_more_xpath = "//button[contains(text(), 'Load More') or contains(text(), 'Show More')]"

        try:
            # Height at the start; later growth is covered by the final scroll.
            last_height = self.driver.execute_script("return document.body.scrollHeight")

            for i in range(5):
                self.driver.execute_script(f"window.scrollTo(0, {(i+1) * last_height // 5});")
                time.sleep(1)

                try:
                    # find_elements returns [] when the button is absent, so a
                    # missing button is not treated as an error.
                    for load_more in self.driver.find_elements(By.XPATH, load_more_xpath)[:1]:
                        if load_more.is_displayed():
                            self.driver.execute_script("arguments[0].click();", load_more)
                            time.sleep(2)
                except Exception:
                    # The button is optional; a failed click must not stop scrolling.
                    pass

            # Final scroll to bottom
            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2)

        except Exception as e:
            print(f"   Scrolling failed: {e}")
    
    def extract_cars_current_page(self):
        """Collect car records from whatever the browser currently shows.

        Strategy 1 gathers elements matching a list of card-like CSS
        selectors; an element matched by several selectors is kept once.
        Strategy 2 runs only when strategy 1 finds nothing and keeps every
        ``div`` whose text mentions "tata", a rupee sign and at least one of
        km/petrol/diesel/manual/automatic. Each kept element is passed to
        ``extract_car_from_element``.

        Returns:
            list[dict]: Records that have a non-empty "Car Name". The same
            car can still appear more than once (e.g. through nested
            elements); callers de-duplicate.
        """
        cars = []

        # Strategy 1: card-like selectors
        selectors = [
            "[data-testid*='car-card']", "[data-testid*='used-car']",
            "[class*='CarCard']", "[class*='ListingCard']", "[class*='VehicleCard']",
            ".car-card", ".listing-card", ".vehicle-card",
            "article", ".result-item",
            "[data-qa*='car']", "[data-cy*='car']"
        ]

        elements_found = []
        seen_elements = set()
        for selector in selectors:
            try:
                elements = self.driver.find_elements(By.CSS_SELECTOR, selector)
            except Exception:
                continue
            if not elements:
                continue

            print(f"   Found {len(elements)} elements with: {selector}")
            for element in elements:
                # Several selectors often hit the same node; processing it once
                # avoids repeated driver round-trips for identical data.
                marker = getattr(element, "id", None) or id(element)
                if marker not in seen_elements:
                    seen_elements.add(marker)
                    elements_found.append(element)

        # Strategy 2: no card-like elements, fall back to divs with car-like text
        if not elements_found:
            all_divs = self.driver.find_elements(By.TAG_NAME, "div")
            for div in all_divs:
                try:
                    text = div.text.lower()
                    if ("tata" in text and "₹" in text and
                        any(word in text for word in ["km", "petrol", "diesel", "manual", "automatic"])):
                        elements_found.append(div)
                except Exception:
                    # Typically an element that went stale while iterating.
                    continue

        print(f"   Processing {len(elements_found)} potential car elements...")

        # Extract data from found elements
        for element in elements_found:
            try:
                car_data = self.extract_car_from_element(element)
                if car_data and car_data.get("Car Name"):
                    cars.append(car_data)
            except Exception:
                continue

        return cars
    
    def extract_car_from_element(self, element):
        """Extract car data from a single element"""
        try:
            # Get all text from element
            full_text = element.text
            html_content = element.get_attribute('innerHTML')
            soup = BeautifulSoup(html_content, 'html.parser')
            
            # Extract car name (look for headings or emphasized text)
            name = ""
            for tag in ['h1', 'h2', 'h3', 'h4', 'strong', 'b']:
                elem = soup.find(tag)
                if elem and elem.get_text(strip=True):
                    name = elem.get_text(strip=True)
                    break
            
            if not name:
                # Try to find Tata model name in text
                match = re.search(r'Tata\s+([A-Za-z]+(?:\s+[A-Za-z]+)?)', full_text, re.IGNORECASE)
                if match:
                    name = match.group(0)
            
            # Extract price
            price_match = re.search(r'₹\s*([\d,]+(?:\.\d+)?)', full_text)
            price = price_match.group(0) if price_match else ""
            
            # Extract year
            year_match = re.search(r'(20\d{2}|19\d{2})', full_text)
            year = year_match.group(0) if year_match else ""
            
            # Extract kilometers
            km_match = re.search(r'([\d,]+)\s*km', full_text, re.IGNORECASE)
            km = km_match.group(1) if km_match else ""
            
            # Extract fuel type
            fuel_match = re.search(r'(Petrol|Diesel|CNG|Electric)', full_text, re.IGNORECASE)
            fuel = fuel_match.group(0) if fuel_match else ""
            
            # Extract transmission
            trans_match = re.search(r'(Manual|Automatic)', full_text, re.IGNORECASE)
            transmission = trans_match.group(0) if trans_match else ""
            
            return {
                "Car Name": name,
                "Year": year,
                "Kilometers Driven": km.replace(',', ''),
                "Fuel Type": fuel,
                "Transmission": transmission,
                "Price": price
            }
            
        except Exception:
            return None
    
    def scrape_cars(self):
        """Main scraping method"""
        print(f"🎯 Target: {self.target_count} Tata cars in Mumbai")
        
        # Try API approach first
        cars = self.try_api_approach()
        
        # If API fails, try Selenium
        if len(cars) < 212:  # If we got very few cars from API
            print("🔄 Switching to Selenium approach...")
            selenium_cars = self.try_selenium_approach()
            cars.extend(selenium_cars)
        
        # Remove duplicates
        seen = set()
        unique_cars = []
        for car in cars:
            key = f"{car['Car Name']}{car['Price']}"
            if key not in seen and car['Car Name']:
                seen.add(key)
                unique_cars.append(car)
        
        self.cars_data = unique_cars[:self.target_count]
        print(f"✅ Successfully extracted {len(self.cars_data)} unique cars")
        
        return self.cars_data
    
    def save_to_csv(self, filename="Tata_Cars_Mumbai_Working.csv"):
        """Save data to CSV"""
        if not self.cars_data:
            print("❌ No data to save")
            return None
        
        df = pd.DataFrame(self.cars_data)
        
        # Clean data
        df["Kilometers Driven"] = pd.to_numeric(df["Kilometers Driven"].str.replace(',', ''), errors='coerce')
        df["Price"] = df["Price"].str.replace('₹', '').str.replace(',', '')
        df["Price"] = pd.to_numeric(df["Price"], errors='coerce')
        
        # Save to CSV
        df.to_csv(filename, index=False, encoding='utf-8-sig')
        print(f"💾 Saved {len(df)} cars to {filename}")
        
        # Show sample
        print("\n📋 Sample data:")
        print(df.head(10).to_string(index=False))
        
        return df


In [None]:

def main():
    """Run one scrape of 215 Tata cars and save it to Tata_Cars_Mumbai_Fixed.csv.

    Prints a success summary when rows were saved, otherwise a hint on how
    to debug with a visible browser. Any exception is reported on stdout
    instead of being raised.
    """
    try:
        tata_scraper = Cars24TataScraper(headless=True, target_count=215)

        # The scraped records are kept on the scraper; save_to_csv reads them.
        tata_scraper.scrape_cars()
        saved = tata_scraper.save_to_csv("Tata_Cars_Mumbai_Fixed.csv")

        if saved is None or len(saved) == 0:
            print("\n❌ No cars were scraped. The website might have changed structure.")
            print("Try running with headless=False to see what's happening:")
            print("scraper = Cars24TataScraper(headless=False, target_count=215)")
            return

        print(f"\n🎉 SUCCESS! {len(saved)} Tata cars scraped from Mumbai")
        print(f"📊 Data includes: {saved.columns.tolist()}")

    except Exception as err:
        print(f"❌ Error: {err}")


if __name__ == "__main__":
    main()
