In [None]:
import logging
import os
import re
import time
from typing import Dict, List, Optional, Tuple

import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

# Configure logging: WARNING and above go to both a log file and the console.
# Messages emitted with logger.info() are therefore filtered out at this level.
_LOG_FILE = '../../log/crawling/crawler.log'
# FileHandler opens the file immediately and raises FileNotFoundError when the
# parent directory is missing, so make sure it exists first.
os.makedirs(os.path.dirname(_LOG_FILE), exist_ok=True)
logging.basicConfig(
    level=logging.WARNING,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        # Log messages contain Korean restaurant names; pin the encoding so the
        # file does not depend on the platform's locale default.
        logging.FileHandler(_LOG_FILE, encoding='utf-8'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

class Config:
    """Static settings shared by the crawler (paths, URLs, limits, delays)."""

    # --- Storage -----------------------------------------------------------
    # CSV file that crawl results are loaded from and appended to.
    CSV_PATH = "../../data/external/google_gangnam_crawling_data.csv"

    # --- Target sites ------------------------------------------------------
    GOOGLE_MAPS_URL = "https://www.google.com/maps"
    # "{}" is filled with the search query via str.format().
    DININGCODE_URL = "https://www.diningcode.com/list.dc?query={}"

    # --- Timing ------------------------------------------------------------
    # Timeout handed to WebDriverWait.
    WAIT_TIMEOUT = 3
    # Pause passed to time.sleep() after clicks and scrolls.
    SLEEP_INTERVAL = 0.3

    # --- Review collection limits -------------------------------------------
    # Upper bound on scroll iterations of the review panel.
    MAX_SCROLL_ATTEMPTS = 50
    # Consecutive scroll passes without a new review before giving up.
    MAX_NO_NEW_REVIEWS = 3
    # Review count at which collection stops.
    MAX_REVIEWS = 100
    
class RestaurantCrawler:
    def __init__(self, headless: bool = False):
        """Create the crawler: load stored results, then start Chrome.

        Args:
            headless: When True, Chrome is started with the ``--headless`` flag.

        Raises:
            Exception: Whatever ``initialize_data`` or ``setup_driver`` raises.
        """
        self.config = Config()
        # Load the CSV before launching the browser: if reading it fails, no
        # Chrome process has been spawned yet, so nothing is left running that
        # the caller (who never receives the instance) could not close.
        self.initialize_data()
        self.setup_driver(headless)
        self.wait = WebDriverWait(self.driver, self.config.WAIT_TIMEOUT)
        
    def setup_driver(self, headless: bool) -> None:
        """Start a Chrome WebDriver and store it on ``self.driver``.

        Args:
            headless: Adds the ``--headless`` argument when True.

        Raises:
            Exception: Re-raised after logging when the driver cannot be created.
        """
        arguments = ["--start-maximized"]
        if headless:
            arguments.append("--headless")

        browser_options = Options()
        for argument in arguments:
            browser_options.add_argument(argument)

        try:
            # ChromeDriverManager().install() yields the value handed to Service.
            driver_service = Service(ChromeDriverManager().install())
            self.driver = webdriver.Chrome(service=driver_service, options=browser_options)
        except Exception as e:
            logger.error(f"Failed to initialize Chrome driver: {e}")
            raise
        else:
            logger.info("Chrome driver initialized successfully")

    def initialize_data(self) -> None:
        """Populate ``self.restaurants_df`` from the configured CSV, or start empty.

        When the CSV at ``Config.CSV_PATH`` exists it is read as-is; otherwise an
        empty DataFrame with the expected result columns is created.
        """
        if not os.path.exists(self.config.CSV_PATH):
            # No previous results: start from an empty frame with the result columns.
            self.restaurants_df = pd.DataFrame(columns=[
                '음식점_이름', '주소', '전화번호', '음식점_태그', '메뉴_정보',
                '카테고리', '음식점_사진', '위도', '경도', '영업시간', '리뷰'
            ])
            logger.info("Created new empty DataFrame")
            return

        self.restaurants_df = pd.read_csv(self.config.CSV_PATH)
        logger.info(f"Loaded existing CSV from {self.config.CSV_PATH}")

    def search_google_maps(self, search_query: str) -> Dict:
        """Look up a restaurant on Google Maps and collect its details.

        Args:
            search_query: Text typed into the Google Maps search box.

        Returns:
            A dict with the keys '음식점_이름', '카테고리', '음식점_사진',
            '영업시간', '위도', '경도' and '리뷰'. If anything raises, the error
            is logged and the result of ``_get_default_info(search_query)`` is
            returned instead.
        """
        try:
            self.driver.get(self.config.GOOGLE_MAPS_URL)
            self._input_search_query(search_query)

            # A URL without "/place/" means a result list is showing rather
            # than a place page, so open the first hit.
            if "/place/" not in self.driver.current_url:
                self._click_first_result()

            # Gather fields in a fixed order; reviews go last because that
            # step switches to the reviews tab.
            name = self._get_restaurant_name(search_query)
            category = self._get_category()
            photo_url = self._get_photo_url()
            business_hours = self._get_business_hours()
            # Read the URL once so latitude and longitude always come from the
            # same snapshot (previously this was called once per coordinate).
            latitude, longitude = self._get_coordinates()
            reviews = self._get_all_reviews()

            return {
                '음식점_이름': name,
                '카테고리': category,
                '음식점_사진': photo_url,
                '영업시간': business_hours,
                '위도': latitude,
                '경도': longitude,
                '리뷰': reviews
            }
        except Exception as e:
            logger.error(f"Google Maps crawling error for {search_query}: {e}")
            return self._get_default_info(search_query)

    def _input_search_query(self, query: str) -> None:
        """Type a cleaned-up query into the Google Maps search box and submit it.

        Args:
            query: Raw search text. Everything up to and including the last
                "(주)" marker is removed, then surrounding whitespace is stripped.
        """
        # Remove the leading part ending in "(주)" and trim unnecessary whitespace.
        cleaned = re.sub(r'^.*\(주\)', '', query).strip()

        box_locator = (By.ID, "searchboxinput")
        box = self.wait.until(EC.presence_of_element_located(box_locator))
        box.clear()
        for keystrokes in (cleaned, Keys.ENTER):
            box.send_keys(keystrokes)

        # Fixed pause so the result panel has time to load.
        time.sleep(1)

    def _click_first_result(self) -> None:
        """Click the first entry of the search-result list, if one can be found.

        Candidate CSS selectors are tried in order. The first one that yields
        elements has its first element clicked through JavaScript. A warning is
        logged when no selector produced a click.
        """
        selectors = [
            "a.hfpxzc", "div.Nv2PK", "div[jsaction*='mouseover']", "div.THOPZb"
        ]
        for selector in selectors:
            try:
                results = self.wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, selector)))
                if results:
                    self.driver.execute_script("arguments[0].click();", results[0])
                    time.sleep(1)  # Let the place page load after the click
                    break
            except Exception:
                # Best effort: this selector did not work, try the next one.
                # (Exception rather than a bare except, so Ctrl+C and
                # SystemExit still propagate.)
                continue
        else:
            # Runs only when the loop finished without a successful click.
            logger.warning("Could not find first result to click")

    def _get_restaurant_name(self, default: str) -> str:
        """Read the place title and strip parenthesised parts from it.

        Args:
            default: Value returned when the title cannot be read or is empty
                after cleaning.

        Returns:
            The title text without any "(...)" segments, or ``default``.
        """
        try:
            name = self.wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, "h1.DUwDvf.lfPIob"))).text
        except Exception:
            # Title element missing or not readable: fall back to the query.
            # (Exception instead of a bare except keeps Ctrl+C working.)
            return default

        cleaned = re.sub(r'\s*\([^)]*\)', '', name).strip()
        # An empty title would make the follow-up DiningCode search meaningless,
        # so prefer the caller's default in that case.
        return cleaned or default

    def _get_category(self) -> str:
        """Return the text of the first category button, or "" when none exists."""
        category_buttons = self.driver.find_elements(By.CSS_SELECTOR, "button.DkEaL")
        return category_buttons[0].text if category_buttons else ""

    def _get_photo_url(self) -> str:
        """Return the ``src`` of the first photo image found, or "".

        Two CSS selectors are tried in order. A selector whose image has no
        ``src`` value is skipped, so the declared ``str`` return type holds.
        """
        selectors = ["button[aria-label*='사진'] img", ".RZ66Rb img[decoding='async']"]
        for selector in selectors:
            try:
                src = self.wait.until(EC.presence_of_element_located(
                    (By.CSS_SELECTOR, selector))).get_attribute("src")
            except Exception:
                # Selector not found within the timeout: try the next one.
                # (Exception instead of a bare except keeps Ctrl+C working.)
                continue
            if src:
                return src
        return ""

    def _get_business_hours(self) -> Dict:
        """Expand the opening-hours section and return its parsed contents.

        Returns:
            The dict produced by ``_parse_business_hours``; an empty dict (with
            a warning logged) when any step raises.
        """
        try:
            # Only a collapsed section (aria-expanded='false') is clicked.
            toggle_locator = (By.CSS_SELECTOR, "div.OMl5r[aria-expanded='false']")
            toggle = self.wait.until(EC.element_to_be_clickable(toggle_locator))
            toggle.click()
            time.sleep(0.5)

            table_locator = (By.CSS_SELECTOR, "table.eK4R0e")
            table = self.wait.until(EC.presence_of_element_located(table_locator))
            return self._parse_business_hours(table)
        except Exception as e:
            logger.warning(f"Failed to extract business hours: {e}")
            return {}

    def _parse_business_hours(self, table) -> Dict:
        """Convert the opening-hours table into ``{day: [hours, ...]}``.

        Args:
            table: Element containing the ``tr.y0skZc`` rows to read.

        Returns:
            A dict mapping each row's day text to the list of its hour entries.
            Rows that cannot be read are skipped.
        """
        hours = {}
        for row in table.find_elements(By.CSS_SELECTOR, "tr.y0skZc"):
            try:
                day = row.find_element(By.CSS_SELECTOR, "td.ylH6lf div").text
                hours_list = [h.text for h in row.find_elements(By.CSS_SELECTOR, "li.G8aQO")]
                hours[day] = hours_list
            except Exception:
                # Skip malformed rows. Exception (not a bare except) lets
                # KeyboardInterrupt/SystemExit propagate.
                continue
        return hours

    def _get_coordinates(self) -> Tuple[Optional[float], Optional[float]]:
        """Extract coordinates from URL"""
        try:
            url = self.driver.current_url
            patterns = [r"!3d([-\d\.]+)!4d([-\d\.]+)", r"@([-\d\.]+),([-\d\.]+)"]
            for pattern in patterns:
                match = re.search(pattern, url)
                if match:
                    return float(match.group(1)), float(match.group(2))
            return None, None
        except Exception as e:
            logger.warning(f"Failed to extract coordinates: {e}")
            return None, None

    def _get_all_reviews(self) -> List[Dict]:
        """Scroll through the reviews panel and collect unique reviews.

        Collection stops when any of these holds:
          * ``Config.MAX_REVIEWS`` reviews have been collected,
          * ``Config.MAX_NO_NEW_REVIEWS`` consecutive passes found nothing new,
          * ``Config.MAX_SCROLL_ATTEMPTS`` scroll passes have been made.

        Returns:
            At most ``Config.MAX_REVIEWS`` review dicts (the 'data' part
            returned by ``_process_review``); an empty list when the scroll
            container is missing or an error occurs.
        """
        try:
            self._navigate_to_reviews_tab()
            scroll_container = self._find_scroll_container()
            if scroll_container is None:
                return []

            max_reviews = self.config.MAX_REVIEWS
            reviews = []
            collected_reviews = set()  # ids already stored, for de-duplication
            no_new_reviews = 0

            for _ in range(self.config.MAX_SCROLL_ATTEMPTS):
                new_reviews_found = False

                for review in self.driver.find_elements(By.CSS_SELECTOR, "div.jftiEf"):
                    # Enforce the cap inside the pass; checking only after a
                    # full pass let the result grow past MAX_REVIEWS.
                    if len(reviews) >= max_reviews:
                        break
                    review_data = self._process_review(review)
                    if review_data and review_data['id'] not in collected_reviews:
                        reviews.append(review_data['data'])
                        collected_reviews.add(review_data['id'])
                        new_reviews_found = True

                if len(reviews) >= max_reviews:
                    break

                if new_reviews_found:
                    no_new_reviews = 0
                else:
                    no_new_reviews += 1
                    if no_new_reviews >= self.config.MAX_NO_NEW_REVIEWS:
                        break

                # Scroll to the bottom so the page can load further reviews.
                self._scroll_container(scroll_container)

            logger.info(f"Collected {len(reviews)} reviews")
            return reviews
        except Exception as e:
            logger.error(f"Review collection error: {e}")
            return []

    def _navigate_to_reviews_tab(self) -> None:
        """Open the reviews tab of the place page.

        The tab is located by an aria-label containing "리뷰" or "Reviews" and
        clicked through JavaScript. Failures are logged as warnings and not
        raised.
        """
        tab_locator = (
            By.CSS_SELECTOR,
            '[role="tab"][aria-label*="리뷰"], [role="tab"][aria-label*="Reviews"]',
        )
        try:
            tab = self.wait.until(EC.element_to_be_clickable(tab_locator))
            self.driver.execute_script("arguments[0].click();", tab)
            time.sleep(1)
        except Exception as e:
            logger.warning(f"Failed to navigate to reviews tab: {e}")

    def _find_scroll_container(self) -> Optional[WebElement]:
        """Locate the scrollable element that holds the reviews.

        Selectors are tried from most to least specific; each attempt waits up
        to the configured timeout.

        Returns:
            The first element found, or ``None`` (with a warning logged) when
            no selector matched. The value is a page element, not a driver,
            which is why the annotation is ``WebElement``.
        """
        selectors = [
            "div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde",
            "div.m6QErb.XiKgde",
            "div.m6QErb",
            "div[role='feed']"
        ]
        for selector in selectors:
            try:
                return self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, selector)))
            except Exception:
                # Not found with this selector: fall through to the next one.
                # (Exception instead of a bare except keeps Ctrl+C working.)
                continue
        logger.warning("Could not find scroll container")
        return None

    def _process_review(self, review) -> Optional[Dict]:
        """Extract reviewer, text and photos from one review element.

        Args:
            review: The review's container element.

        Returns:
            ``{'id': ..., 'data': {'reviewer', 'content', 'photo'}}`` where the
            id is ``"<reviewer>:<first 30 chars of content>"``, or ``None``
            (with a warning logged) when the reviewer or content is missing.
        """
        try:
            try:
                # Expand truncated review text when a "more" button is present.
                more_button = review.find_element(By.CSS_SELECTOR, "button.w8nwRe.kyuRq")
                self.driver.execute_script("arguments[0].click();", more_button)
                time.sleep(self.config.SLEEP_INTERVAL)
            except Exception:
                # No "more" button (or the click failed): use the text as shown.
                # Exception, not a bare except, so Ctrl+C is not swallowed.
                pass

            reviewer = review.find_element(By.CSS_SELECTOR, "div.d4r55").text
            content = review.find_element(By.CSS_SELECTOR, "span.wiI7pd").text
            photos = self._get_review_photos(review)
            # Key used by the caller to detect reviews it has already stored.
            review_id = f"{reviewer}:{content[:30]}"

            return {
                'id': review_id,
                'data': {
                    'reviewer': reviewer,
                    'content': content,
                    'photo': photos
                }
            }
        except Exception as e:
            logger.warning(f"Error processing review: {e}")
            return None

    def _get_review_photos(self, review) -> List[str]:
        """Return up to two photo URLs attached to a review.

        URLs are read from the ``background-image: url("...")`` inline style of
        the first two photo buttons. Buttons without such a style are skipped
        individually rather than discarding every photo.

        Args:
            review: The review's container element.

        Returns:
            A list of URL strings; empty when the review has no photo section.
        """
        try:
            photo_container = review.find_element(By.CSS_SELECTOR, "div.KtCyie")
            photo_buttons = photo_container.find_elements(By.CSS_SELECTOR, "button.Tya61d")[:2]
        except Exception:
            # No photo section on this review.
            return []

        urls = []
        for btn in photo_buttons:
            try:
                # Read the attribute once; it may be None when no style is set.
                style = btn.get_attribute("style") or ""
            except Exception:
                continue
            if "background-image: url(" not in style:
                continue
            match = re.search(r'url\("([^"]+)"\)', style)
            if match:
                urls.append(match.group(1))
        return urls

    def _scroll_container(self, container) -> None:
        """Scroll ``container`` to its bottom so more content can load.

        The scroll is done with JavaScript; if that raises, a PAGE_DOWN key
        press over the element is sent instead. Either way the method then
        sleeps for ``Config.SLEEP_INTERVAL``.

        Args:
            container: The scrollable element.
        """
        try:
            self.driver.execute_script(
                "arguments[0].scrollTop = arguments[0].scrollHeight", container)
            time.sleep(self.config.SLEEP_INTERVAL)
        except Exception:
            # Script execution failed: fall back to keyboard scrolling.
            # Exception (not a bare except) so that Ctrl+C during the sleep is
            # not turned into an extra PAGE_DOWN.
            actions = ActionChains(self.driver)
            actions.move_to_element(container).send_keys(Keys.PAGE_DOWN).perform()
            time.sleep(self.config.SLEEP_INTERVAL)

    def search_diningcode(self, restaurant_name: str) -> Dict:
        """Search DiningCode and scrape the first result's detail page.

        Args:
            restaurant_name: Name to search for; it is passed through
                ``_clean_restaurant_name`` first.

        Returns:
            A dict with '주소', '전화번호', '메뉴_정보' and '음식점_태그'.
            On any error the failure is logged and empty values are returned.
        """
        # Function-scope import keeps this block self-contained.
        from urllib.parse import quote

        try:
            clean_name = self._clean_restaurant_name(restaurant_name)
            # Percent-encode the query: characters such as '&', '#' or '?' in a
            # restaurant name would otherwise truncate or corrupt the URL.
            self.driver.get(self.config.DININGCODE_URL.format(quote(clean_name, safe='')))

            first_result = self.wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, "a[id^='block']")))
            self.driver.get(first_result.get_attribute('href'))

            return {
                '주소': self._get_diningcode_address(),
                '전화번호': self._get_diningcode_phone(),
                '메뉴_정보': self._get_diningcode_menu(),
                '음식점_태그': self._get_diningcode_tags()
            }
        except Exception as e:
            logger.error(f"DiningCode crawling error for {restaurant_name}: {e}")
            return {'주소': "", '전화번호': "", '메뉴_정보': [], '음식점_태그': []}

    def _clean_restaurant_name(self, name: str) -> str:
        """Clean restaurant name for search"""
        return name.rstrip()[:name.rfind("점")] if "점" in name else name.rstrip()

    def _get_diningcode_address(self) -> str:
        """Build the address string from the element with class ``locat``.

        The texts of its ``<a>`` children and of its first ``<span>`` (when
        present) are joined with single spaces; empty parts are dropped.

        Returns:
            The joined address, or "" when the element cannot be read.
        """
        try:
            locat = self.wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'locat')))
            address_parts = [a.text for a in locat.find_elements(By.TAG_NAME, 'a')]
            # find_elements returns [] instead of raising, so a missing <span>
            # no longer throws away the link parts already collected.
            address_parts.extend(
                span.text.strip() for span in locat.find_elements(By.TAG_NAME, 'span')[:1])
            return ' '.join(filter(None, address_parts))
        except Exception:
            # Best effort. Exception (not a bare except) lets Ctrl+C through.
            return ""

    def _get_diningcode_phone(self) -> str:
        """Return the stripped text of the element with class ``tel``, or ""."""
        try:
            return self.wait.until(EC.presence_of_element_located(
                (By.CLASS_NAME, 'tel'))).text.strip()
        except Exception:
            # No phone element within the timeout. Exception instead of a bare
            # except so KeyboardInterrupt/SystemExit are not swallowed.
            return ""

    def _get_diningcode_menu(self) -> List[Dict]:
        """Collect menu entries from the DiningCode detail page.

        Returns:
            A list of ``{'menu_name': ..., 'menu_price': ...}`` dicts. Items
            missing a name or price are skipped; if the menu section itself
            cannot be found, a warning is logged and the entries gathered so
            far (possibly none) are returned.
        """
        menu_list = []
        try:
            menu_section = self.wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, "div.menu-info")))
            self.driver.execute_script("arguments[0].scrollIntoView();", menu_section)

            # Click the second "a.more-btn" match (index 1) when it is shown.
            self._click_more_button("a.more-btn", 1)

            for item in self.driver.find_elements(By.CSS_SELECTOR, "ul.Restaurant_MenuList li"):
                try:
                    name = item.find_element(By.CSS_SELECTOR, "span.Restaurant_Menu").text
                    price = item.find_element(By.CSS_SELECTOR, "p.r-txt").text
                    menu_list.append({'menu_name': name, 'menu_price': price})
                except Exception:
                    # Skip list items that are not complete menu rows.
                    # Exception (not a bare except) lets Ctrl+C through.
                    continue
        except Exception as e:
            logger.warning(f"Menu extraction error: {e}")
        return menu_list

    def _get_diningcode_tags(self) -> List[Dict]:
        """Collect the '분위기' tags and their counts from DiningCode.

        Tag texts of the form ``"name (count)"`` are split into name and
        integer count. Tags whose parenthesised part is not a number are
        skipped instead of aborting the whole extraction.

        Returns:
            A list of ``{'tags': name, 'count': int}`` dicts; whatever was
            gathered so far when an error occurs (a warning is logged).
        """
        tags_list = []
        try:
            tag_section = self.wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, "ul.app-arti")))
            self.driver.execute_script("arguments[0].scrollIntoView();", tag_section)

            mood_xpath = "li[contains(span[@class='btxt'], '분위기')]"
            # An XPath starting with "//" is evaluated against the whole
            # document even when called on an element, so search relative to
            # the tag section first (".//"). The document-wide lookup is kept
            # as a fallback so pages that matched before still match.
            candidates = (
                tag_section.find_elements(By.XPATH, ".//" + mood_xpath)
                or self.driver.find_elements(By.XPATH, "//" + mood_xpath)
            )
            if not candidates:
                logger.warning("Tags extraction error: mood category not found")
                return tags_list
            mood_category = candidates[0]
            self._click_more_button("span.more-btn.button", 0, mood_category)

            for tag in mood_category.find_elements(By.CSS_SELECTOR, "span.icon"):
                # The "more" toggle shares the span.icon selector; ignore it.
                if "more-btn" in (tag.get_attribute("class") or ""):
                    continue
                tag_text = tag.text.strip()
                open_idx = tag_text.rfind("(")
                close_idx = tag_text.rfind(")")
                if open_idx == -1 or close_idx < open_idx:
                    continue
                raw_count = tag_text[open_idx + 1:close_idx].strip().replace(",", "")
                if raw_count and not raw_count.isdecimal():
                    # Parenthesised text that is not a number: not a counted tag.
                    continue
                tags_list.append({
                    'tags': tag_text[:open_idx].strip(),
                    'count': int(raw_count or 0)
                })
        except Exception as e:
            logger.warning(f"Tags extraction error: {e}")
        return tags_list

    def _click_more_button(self, selector: str, index: int, parent=None) -> None:
        """Click the ``index``-th element matching ``selector`` if it is displayed.

        Args:
            selector: CSS selector of the button(s).
            index: Position within the matches of the element to click.
            parent: Element to search within; the whole page when ``None``.

        The click is best effort: any failure is logged at debug level and
        otherwise ignored.
        """
        # Explicit None check rather than relying on the element's truthiness.
        search_root = parent if parent is not None else self.driver
        try:
            elements = search_root.find_elements(By.CSS_SELECTOR, selector)
            if len(elements) > index and elements[index].is_displayed():
                self.driver.execute_script("arguments[0].click();", elements[index])
                time.sleep(self.config.SLEEP_INTERVAL)
        except Exception as e:
            # Exception (not a bare except) so Ctrl+C/SystemExit still propagate.
            logger.debug(f"Could not click more button '{selector}': {e}")

    def _get_default_info(self, name: str) -> Dict:
        """Return default empty information"""
        return {
            '음식점_이름': name, '카테고리': '', '음식점_사진': '', '영업시간': {},
            '위도': None, '경도': None, '리뷰': [], '주소': '', '전화번호': '',
            '메뉴_정보': [], '음식점_태그': []
        }

    def crawl_restaurant(self, search_query: str) -> Optional[Dict]:
        """Crawl one restaurant from both sites, store it and append it to the CSV.

        Args:
            search_query: Restaurant name; " 강남구" is appended for both searches.

        Returns:
            The merged record, or ``None`` (with the error logged) when
            merging or saving fails.
        """
        try:
            local = " 강남구"
            google_info = self.search_google_maps(search_query + local)
            dining_info = self.search_diningcode(google_info['음식점_이름'] + local)
            # When the Google step fails it returns _get_default_info(), which
            # also carries empty '주소'/'전화번호'/'메뉴_정보'/'음식점_태그'
            # entries. Merging Google last let those blanks overwrite the real
            # DiningCode values, so DiningCode is applied again at the end.
            # Spreading dining_info first as well keeps the original key order.
            restaurant_info = {**dining_info, **google_info, **dining_info}

            self.restaurants_df = pd.concat([
                self.restaurants_df,
                pd.DataFrame([restaurant_info])
            ], ignore_index=True)

            self.save_to_csv()
            logger.info(f"Successfully crawled and saved: {search_query}")
            return restaurant_info
        except Exception as e:
            logger.error(f"Failed to crawl {search_query}: {e}")
            return None

    def save_to_csv(self, filename: Optional[str] = None) -> None:
        """Append the most recent row of ``self.restaurants_df`` to a CSV file.

        Only the last row is written; earlier rows are assumed to be in the
        file already from previous calls.

        Args:
            filename: Target path; defaults to ``Config.CSV_PATH``.
        """
        save_path = filename or self.config.CSV_PATH

        # iloc[[-1]] raises IndexError on a frame without rows.
        if len(self.restaurants_df) == 0:
            logger.warning(f"No rows to save to {save_path}")
            return

        # Make sure the target directory exists so a crawled row is not lost.
        os.makedirs(os.path.dirname(save_path) or '.', exist_ok=True)

        # Write the header for a new file and for an existing but empty one;
        # append mode creates the file when it is missing.
        needs_header = not os.path.exists(save_path) or os.path.getsize(save_path) == 0

        self.restaurants_df.iloc[[-1]].to_csv(
            save_path, mode='a', header=needs_header, index=False, encoding='utf-8-sig'
        )
        logger.info(f"Data saved to {save_path}")

    def close(self) -> None:
        """Quit the browser session; failures are logged rather than raised."""
        try:
            self.driver.quit()
        except Exception as e:
            logger.error(f"Error closing WebDriver: {e}")
        else:
            logger.info("WebDriver closed successfully")

def main(start_index: int = 4150,
         input_path: str = '../../data/interim/gangnam_restaurants_cleaned.csv') -> None:
    """Crawl every restaurant listed in the input CSV, starting at an offset.

    Args:
        start_index: Positional row offset to start from (lets an interrupted
            run be resumed). Defaults to the previously hard-coded 4150.
        input_path: CSV file with a '음식점명' column of restaurant names.

    Errors for a single restaurant are logged and the loop continues; any other
    error is logged and ends the run. The browser is closed in all cases where
    it was started.
    """
    # Bind the name before the try block: if reading the CSV or starting the
    # browser fails, the finally clause must not hit an unbound variable.
    crawler = None
    try:
        restaurants_df = pd.read_csv(input_path)
        crawler = RestaurantCrawler(headless=False)

        for idx, row in restaurants_df.iloc[start_index:].iterrows():
            name = row['음식점명']
            logger.info(f"Processing restaurant {idx}: {name}")

            try:
                crawler.crawl_restaurant(name)
            except Exception as e:
                logger.error(f"Error processing {name}: {e}")
                continue

        logger.info("Crawling completed successfully")
    except Exception as e:
        logger.error(f"Main execution error: {e}")
    finally:
        if crawler is not None:
            crawler.close()

# Start the crawl only when this file is run as a script, not when imported.
if __name__ == "__main__":
    main()