In [1]:
import urllib.request
import urllib.parse
import json
import pandas as pd
from datetime import datetime
import time
import logging
import os
import xml.etree.ElementTree as ET

# Output locations: raw arXiv CSV exports and the collector's log file.
DATA_DIR = '/Users/adamgeorghiou/Desktop/GIM/Project/data/raw/arxiv'
LOGS_DIR = '/Users/adamgeorghiou/Desktop/GIM/Project/logs'

# Make sure both directories exist before anything tries to write to them;
# exist_ok=True makes this a no-op on subsequent runs.
for _directory in (DATA_DIR, LOGS_DIR):
    os.makedirs(_directory, exist_ok=True)

class GrapheneResearchCollector:
    """Collect paper metadata from the arXiv API and summarise it.

    Typical use is ``search_papers()`` followed by ``save_data()`` and
    ``analyze_initial_data()``. Collected papers accumulate in ``self.data``
    as a list of flat dictionaries (one per paper).
    """

    # XML namespaces used when querying the Atom feed returned by arXiv.
    _NAMESPACES = {
        'atom': 'http://www.w3.org/2005/Atom',
        'arxiv': 'http://arxiv.org/schemas/atom',
    }

    def __init__(self):
        """Initialise an empty collector and its file logger."""
        self.data = []
        self.base_url = "http://export.arxiv.org/api/query"

        # Set up logger
        self.logger = logging.getLogger('arxiv_collector')
        self.logger.setLevel(logging.INFO)

        # logging.getLogger() hands back the same logger object for the same
        # name, so attach the file handler only once; otherwise every new
        # collector instance would duplicate each log line.
        if not self.logger.handlers:
            fh = logging.FileHandler(os.path.join(LOGS_DIR, 'arxiv_collection.log'))
            fh.setLevel(logging.INFO)
            fh.setFormatter(
                logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
            )
            self.logger.addHandler(fh)

        # Prevent logging from being passed to the root logger
        self.logger.propagate = False

    def search_papers(self, num_results=100, query="graphene applications", timeout=30):
        """Search arXiv and append the parsed papers to ``self.data``.

        Args:
            num_results: Value sent as ``max_results`` in the API request.
            query: Free-text search phrase; it is URL-quoted and searched
                in all fields (``all:``).
            timeout: Seconds to wait on the HTTP request before giving up.

        Any failure (network, timeout, malformed XML) is printed and logged
        rather than raised, so callers always get control back.
        """
        self.logger.info(f"Starting search for {num_results} papers")
        quoted_query = urllib.parse.quote(query)
        print(f"Searching arXiv for: {query}")

        try:
            query_url = (
                f"{self.base_url}?search_query=all:{quoted_query}"
                f"&start=0&max_results={num_results}"
            )

            print("Fetching data from arXiv...")
            # The timeout keeps a stalled connection from hanging the run.
            with urllib.request.urlopen(query_url, timeout=timeout) as response:
                response_data = response.read()

            self._parse_feed(response_data)

        except Exception as e:
            error_msg = f"Error in search: {str(e)}"
            print(error_msg)
            self.logger.error(error_msg)

    def _parse_feed(self, response_data):
        """Parse an Atom feed and append every readable entry to ``self.data``.

        Args:
            response_data: The raw XML document (bytes or str).

        Raises:
            xml.etree.ElementTree.ParseError: If the document is not
                well-formed XML.

        Entries that cannot be parsed are reported and skipped; they do not
        stop the remaining entries from being collected.
        """
        root = ET.fromstring(response_data)
        entries = root.findall('atom:entry', self._NAMESPACES)
        print(f"Found {len(entries)} papers")
        self.logger.info(f"Found {len(entries)} papers")

        # No per-entry sleep here: all network I/O is already done, so a
        # delay while parsing local XML would only slow the run down.
        for entry in entries:
            try:
                paper_data = self._parse_entry(entry)
            except Exception as e:
                error_msg = f"Error processing paper: {str(e)}"
                print(error_msg)
                self.logger.error(error_msg)
                continue

            self.data.append(paper_data)
            title = paper_data['title']
            self.logger.info(f"Collected paper: {title[:100]}")
            print(f"Successfully collected paper: {title[:100]}...")

    @classmethod
    def _parse_entry(cls, entry):
        """Turn one Atom ``<entry>`` element into a flat paper dictionary.

        Raises:
            AttributeError: If a required child element (title, summary,
                published, id, author name) is missing or has no text.
        """
        ns = cls._NAMESPACES

        # str.split() with no argument splits on any whitespace run, so
        # wrapped titles/abstracts collapse to single-spaced text instead of
        # keeping the indentation that followed each line break.
        title = ' '.join(entry.find('atom:title', ns).text.split())
        abstract = ' '.join(entry.find('atom:summary', ns).text.split())
        published = entry.find('atom:published', ns).text

        authors = [
            author.find('atom:name', ns).text
            for author in entry.findall('atom:author', ns)
        ]
        categories = [
            cat.get('term') for cat in entry.findall('atom:category', ns)
        ]

        return {
            'title': title,
            'authors': ', '.join(authors),
            'abstract': abstract,
            'published_date': published,
            'url': entry.find('atom:id', ns).text,
            'categories': ', '.join(categories),
            'collection_date': datetime.now().isoformat(),
            'source': 'arXiv',
        }

    def save_data(self, filename='arxiv_papers.csv'):
        """Write the collected papers to CSV files inside ``DATA_DIR``.

        Two files are written: ``filename`` with every column, and a
        reduced summary named after it (``<stem>_summary<ext>``; for the
        default this is ``arxiv_papers_summary.csv``).

        Args:
            filename: Name of the full CSV file, relative to ``DATA_DIR``.

        Errors while writing are printed and logged, not raised.
        """
        if not self.data:
            msg = "No data to save!"
            print(msg)
            self.logger.warning(msg)
            return

        try:
            df = pd.DataFrame(self.data)
            output_path = os.path.join(DATA_DIR, filename)
            df.to_csv(output_path, index=False)
            msg = f"Successfully saved {len(self.data)} papers to {output_path}"
            print(msg)
            self.logger.info(msg)

            # Save a more readable summary version. Its name is derived
            # from `filename` so saves under different names do not
            # overwrite each other's summary.
            summary_df = df[['title', 'authors', 'published_date', 'categories', 'url']]
            stem, ext = os.path.splitext(filename)
            summary_path = os.path.join(DATA_DIR, f"{stem}_summary{ext}")
            summary_df.to_csv(summary_path, index=False)
            msg = f"Saved summary version to {summary_path}"
            print(msg)
            self.logger.info(msg)

        except Exception as e:
            error_msg = f"Error saving data: {str(e)}"
            print(error_msg)
            self.logger.error(error_msg)

    @staticmethod
    def _split_joined(values):
        """Flatten an iterable of ``', '``-joined strings into single items.

        Empty items are dropped, so a paper with no categories/authors
        (an empty string) contributes nothing instead of a bogus ``''``.
        """
        return [item for joined in values for item in joined.split(', ') if item]

    def analyze_initial_data(self):
        """Compute basic statistics over the collected papers.

        Returns:
            dict: ``{"status": "No data collected"}`` when nothing has been
            collected; ``{"error": <message>}`` if the analysis fails;
            otherwise a summary with the keys ``total_papers``,
            ``date_range``, ``unique_categories``, ``unique_authors`` and
            ``top_categories`` (the five most frequent categories mapped to
            their counts).
        """
        if not self.data:
            return {"status": "No data collected"}

        try:
            df = pd.DataFrame(self.data)

            # Convert published_date to datetime
            df['published_date'] = pd.to_datetime(df['published_date'])

            # Split once and reuse for both the unique count and the ranking.
            all_categories = self._split_joined(df['categories'])
            all_authors = self._split_joined(df['authors'])

            summary = {
                'total_papers': len(df),
                'date_range': f"{df['published_date'].min().date()} - {df['published_date'].max().date()}",
                'unique_categories': len(set(all_categories)),
                'unique_authors': len(set(all_authors)),
            }

            # Explicit dtype keeps an empty category list from tripping
            # pandas' empty-Series dtype inference.
            category_counts = pd.Series(all_categories, dtype='object').value_counts()
            summary['top_categories'] = category_counts.head(5).to_dict()

            self.logger.info(f"Analysis complete: {summary}")
            return summary

        except Exception as e:
            error_msg = f"Error in analysis: {str(e)}"
            print(error_msg)
            self.logger.error(error_msg)
            return {"error": str(e)}

def main():
    """Run the full pipeline: collect papers, save them, print a summary."""
    arxiv_collector = GrapheneResearchCollector()

    # Step 1: fetch an initial batch of 50 papers.
    print("Starting data collection...")
    arxiv_collector.search_papers(num_results=50)

    # Step 2: persist whatever was collected.
    print("\nSaving data...")
    arxiv_collector.save_data()

    # Step 3: compute the summary, then report it.
    print("\nAnalyzing results...")
    print("Data collection summary:", arxiv_collector.analyze_initial_data())

if __name__ == "__main__":
    main()

Starting data collection...
Searching arXiv for: graphene applications
Fetching data from arXiv...
Found 50 papers
Successfully collected paper: Hydrogenated-Graphene encapsulated Graphene: A versatile material for   device applications...
Successfully collected paper: Determining Graphene Adhesion via Substrate-regulated Morphology of   Graphene...
Successfully collected paper: Graphene-plasmon polaritons: From fundamental properties to potential   applications...
Successfully collected paper: Crystal orientation relation and macroscopic surface roughness in   hetero-epitaxially grown graphen...
Successfully collected paper: Quantifying Mn diffusion through transferred versus directly-grown   graphene barriers...
Successfully collected paper: Graphene Field Effect Transistors: A Review...
Successfully collected paper: Growth and electronic structure of graphene on semiconducting Ge(110)...
Successfully collected paper: Graphene MEMS and NEMS...
Successfully collected paper: Three-dime