In [15]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import TimeoutException
import pandas as pd
from datetime import datetime
import time
import logging
import os

# Output locations: raw patent CSVs and the collector's log file.
DATA_DIR = '/Users/adamgeorghiou/Desktop/GIM/Project/data/raw/patents'
LOGS_DIR = '/Users/adamgeorghiou/Desktop/GIM/Project/logs'

# Ensure both folders exist up front; exist_ok makes re-running the cell harmless.
for _required_dir in (DATA_DIR, LOGS_DIR):
    os.makedirs(_required_dir, exist_ok=True)

class GooglePatentCollector:
    """Collect search results from Google Patents using headless Chrome.

    Records are accumulated in ``self.data`` as a list of dicts with the keys
    ``title``, ``patent_id``, ``link``, ``metadata``, ``dates``, ``abstract``
    and ``collection_date``. Use :meth:`search_patents` to fill the list and
    :meth:`save_data` to write it to CSV under ``DATA_DIR``.
    """

    def __init__(self):
        """Prepare Chrome options and the file logger; no browser is started here."""
        self.data = []

        # Set up Chrome options
        self.options = webdriver.ChromeOptions()
        for flag in (
            '--headless',
            '--no-sandbox',
            '--disable-dev-shm-usage',
            '--disable-gpu',
            '--window-size=1920,1080',
        ):
            self.options.add_argument(flag)

        # Set up logger
        self.logger = logging.getLogger('patent_collector')
        self.logger.setLevel(logging.INFO)
        self.logger.propagate = False

        # logging.getLogger returns the same object for the same name, so a
        # second instance (or a re-run notebook cell) must not attach another
        # handler for the same file, otherwise every record is written twice.
        log_path = os.path.abspath(os.path.join(LOGS_DIR, 'patent_collection.log'))
        already_attached = any(
            isinstance(handler, logging.FileHandler)
            and handler.baseFilename == log_path
            for handler in self.logger.handlers
        )
        if not already_attached:
            fh = logging.FileHandler(log_path)
            fh.setLevel(logging.INFO)
            fh.setFormatter(
                logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
            )
            self.logger.addHandler(fh)

    def setup_driver(self):
        """Start a Chrome driver configured with ``self.options``.

        Returns:
            The driver instance, or ``None`` if start-up failed (the error is
            printed and logged rather than raised).
        """
        try:
            service = Service(ChromeDriverManager().install())
            return webdriver.Chrome(service=service, options=self.options)
        except Exception as e:
            error_msg = f"Error setting up Chrome driver: {str(e)}"
            print(error_msg)
            self.logger.error(error_msg)
            return None

    @staticmethod
    def _extract_patent_id(link):
        """Return the patent ID embedded in a result URL, or ``"No ID"``.

        Prefers the path segment that follows ``/patent/`` so that links with
        or without a trailing language segment, and links carrying a query
        string, all resolve to the same ID. Falls back to the second-to-last
        ``/``-separated piece for any other URL shape.
        """
        if not link:
            return "No ID"

        # Ignore query string / fragment: they may themselves contain "/".
        path = link.split("?", 1)[0].split("#", 1)[0]
        segments = [segment for segment in path.split("/") if segment]
        if "patent" in segments:
            position = segments.index("patent")
            if position + 1 < len(segments):
                return segments[position + 1]

        pieces = link.split("/")
        return pieces[-2] if len(pieces) >= 2 else "No ID"

    @staticmethod
    def _summary_filename(filename):
        """Derive the summary CSV name from *filename* (``x.csv`` -> ``x_summary.csv``)."""
        stem, extension = os.path.splitext(filename)
        return f"{stem}_summary{extension}"

    def _parse_article(self, article):
        """Build one patent record from a search-result ``<article>`` element.

        Raises whatever the driver raises when the ``h3``, the first ``a`` or
        the ``abstract`` element is missing; the caller treats that as
        "skip this result".
        """
        title = article.find_element(By.TAG_NAME, "h3").text.strip()

        # Get link and extract patent ID from URL
        link = article.find_element(By.TAG_NAME, "a").get_attribute("href")

        # Get metadata and publication dates (first and second h4, when present)
        metadata = article.find_elements(By.TAG_NAME, "h4")
        metadata_text = metadata[0].text if len(metadata) > 0 else "No metadata"
        dates_text = metadata[1].text if len(metadata) > 1 else "No dates"

        # Get abstract
        abstract = article.find_element(By.CLASS_NAME, "abstract").text.strip()

        return {
            'title': title,
            'patent_id': self._extract_patent_id(link),
            'link': link,
            'metadata': metadata_text,
            'dates': dates_text,
            'abstract': abstract,
            'collection_date': datetime.now().isoformat()
        }

    def _find_next_button(self, driver):
        """Return the "Next" pagination link, or ``None`` when there is none.

        Uses ``find_elements`` (empty list on no match) so that a missing
        button is ordinary control flow rather than a caught exception.
        """
        # First try using the aria-label "Next"
        by_label = driver.find_elements(By.XPATH, '//a[@aria-label="Next"]')
        if by_label:
            return by_label[0]

        print("Next button not found using aria-label 'Next'. Trying alternative locator...")
        # Alternatively, try to locate by visible text containing "Next"
        by_text = driver.find_elements(By.XPATH, '//a[contains(text(), "Next")]')
        return by_text[0] if by_text else None

    def search_patents(self, num_results=50):
        """Search Google Patents for graphene-related patents across multiple pages.

        Appends up to ``num_results`` new records to ``self.data``. Results
        already seen during this call (same link, or same title when there is
        no link) are skipped, and pagination stops when a page yields only
        such repeats, when no "Next" link is found, or when navigation fails.

        Args:
            num_results: Maximum number of patents to collect in this call.

        Returns:
            None. Errors are printed and logged, not raised.
        """
        self.logger.info(f"Starting search for {num_results} patents")
        driver = None

        try:
            driver = self.setup_driver()
            if not driver:
                return

            wait = WebDriverWait(driver, 20)
            url = "https://patents.google.com/?q=graphene+applications"
            print(f"Navigating to: {url}")
            driver.get(url)

            # Wait for the search results to load
            try:
                wait.until(EC.presence_of_element_located((By.TAG_NAME, "article")))
                print("Search results loaded successfully")
            except TimeoutException:
                print("Timeout waiting for search results to load")
                self.logger.error("Timeout waiting for search results to load")
                return

            patents_collected = 0
            current_page = 1
            seen_keys = set()

            while patents_collected < num_results:
                articles = driver.find_elements(By.TAG_NAME, "article")
                print(f"Found {len(articles)} articles on page {current_page}")

                collected_before_page = patents_collected
                duplicates_on_page = 0

                # Process each article on the current page
                for article in articles:
                    if patents_collected >= num_results:
                        break
                    try:
                        patent_data = self._parse_article(article)
                    except Exception as e:
                        print(f"Error processing patent: {str(e)}")
                        self.logger.warning("Error processing patent: %s", e)
                        continue

                    key = patent_data['link'] or patent_data['title']
                    if key in seen_keys:
                        duplicates_on_page += 1
                        continue
                    seen_keys.add(key)

                    self.data.append(patent_data)
                    patents_collected += 1
                    print(
                        f"Collected patent {patents_collected}/{num_results}: "
                        f"{patent_data['title'][:100]}..."
                    )

                if patents_collected >= num_results:
                    break

                # A page made up solely of repeats means "Next" did not move
                # us forward; stop instead of looping on the same results.
                if duplicates_on_page and patents_collected == collected_before_page:
                    stalled_msg = (
                        f"Page {current_page} contained only already-collected "
                        "patents; stopping pagination"
                    )
                    print(stalled_msg)
                    self.logger.warning(stalled_msg)
                    break

                # Attempt to go to the next page if available
                next_button = self._find_next_button(driver)
                if next_button is None:
                    print("No next page found")
                    self.logger.info(
                        "No next page found after page %s", current_page
                    )
                    break  # No next page found; exit the loop

                try:
                    next_button.click()
                    current_page += 1
                    wait.until(EC.presence_of_element_located((By.TAG_NAME, "article")))
                    time.sleep(2)  # Allow time for the new page to load
                except Exception as e:
                    print(f"Error after clicking next page: {str(e)}")
                    self.logger.error("Error after clicking next page: %s", e)
                    break

            self.logger.info(
                "Search finished: collected %s of %s requested patents",
                patents_collected, num_results,
            )

        except Exception as e:
            print(f"Error in search: {str(e)}")
            self.logger.error(f"Error in search: {str(e)}")

        finally:
            # Always release the browser, including on early return or error.
            if driver:
                driver.quit()

    def save_data(self, filename='google_patents.csv'):
        """Save collected data to CSV under ``DATA_DIR``.

        Writes the full table to ``filename`` and a four-column summary
        (title, patent_id, dates, link) to a sibling file whose name is
        ``filename`` with ``_summary`` inserted before the extension, so
        saves under different names do not overwrite each other's summary.

        Args:
            filename: Name of the full CSV file inside ``DATA_DIR``.

        Returns:
            The full ``DataFrame``, or ``None`` when there is nothing to save
            or writing failed (the error is printed and logged).
        """
        if not self.data:
            print("No data to save!")
            return None

        try:
            df = pd.DataFrame(self.data)
            output_path = os.path.join(DATA_DIR, filename)
            df.to_csv(output_path, index=False)
            print(f"Successfully saved {len(self.data)} patents to {output_path}")

            # Save a summary version
            summary_df = df[['title', 'patent_id', 'dates', 'link']]
            summary_path = os.path.join(DATA_DIR, self._summary_filename(filename))
            summary_df.to_csv(summary_path, index=False)
            print(f"Saved summary version to {summary_path}")

            return df

        except Exception as e:
            print(f"Error saving data: {str(e)}")
            self.logger.error("Error saving data: %s", e)
            return None

def main():
    """Run one full collection pass: scrape 50 patents, save them, print a summary."""
    patent_collector = GooglePatentCollector()

    print("Starting patent collection...")
    patent_collector.search_patents(num_results=50)

    print("\nSaving data...")
    saved_frame = patent_collector.save_data()

    # save_data returns None when nothing was collected or writing failed;
    # in that case there is nothing to summarise.
    if saved_frame is None:
        return

    print("\nCollection Summary:")
    print(f"Total patents collected: {len(saved_frame)}")
    print("\nDate distribution:")
    date_counts = saved_frame['dates'].value_counts()
    print(date_counts.head())

# Run the collection only when this file/cell is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


Starting patent collection...
Navigating to: https://patents.google.com/?q=graphene+applications
Search results loaded successfully
Found 10 articles on page 1
Collected patent 1/50: Patterned nano graphene platelet-based conductive inks...
Collected patent 2/50: Supercritical fluid process for producing nano graphene platelets...
Collected patent 3/50: Graphene oxide gel bonded graphene composite films and processes for producing …...
Collected patent 4/50: Pristine and functionalized graphene materials...
Collected patent 5/50: One-step production of graphene materials...
Collected patent 6/50: Process for producing dispersible and conductive nano graphene platelets from …...
Collected patent 7/50: Nano graphene-modified lubricant...
Collected patent 8/50: Mass production of pristine nano graphene materials...
Collected patent 9/50: Dispersible and conductive nano graphene platelets...
Collected patent 10/50: Graphite nanoplatelets for thermal and electrical applications...
Next butt