In [3]:
# ieee_collector.py
import requests
import pandas as pd
from datetime import datetime
import time
import logging
import os
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Create necessary directories
DATA_DIR = '/Users/adamgeorghiou/Desktop/GIM/Project/data/raw'
os.makedirs(DATA_DIR, exist_ok=True)

# Set up logging
logging.basicConfig(
    filename='ieee_collection.log',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

class IEEECollector:
    def __init__(self):
        self.data = []
        self.api_key = os.getenv('IEEE_API_KEY')
        self.base_url = "http://ieeexploreapi.ieee.org/api/v1/search/articles"
        self.calls_made = 0
        self.last_call_time = None
        
    def _wait_for_rate_limit(self):
        """Implement rate limiting"""
        current_time = time.time()
        
        # If this is not the first call, check the time since last call
        if self.last_call_time is not None:
            elapsed = current_time - self.last_call_time
            if elapsed < 0.1:  # Ensure at least 100ms between calls (10 calls per second limit)
                time.sleep(0.1 - elapsed)
        
        self.last_call_time = time.time()
        self.calls_made += 1
        
        if self.calls_made >= 190:  # Conservative limit
            print("Approaching daily rate limit, stopping collection")
            return False
        return True
        
    def search_papers(self, num_results=50):
        """
        Search IEEE Xplore for graphene-related papers
        """
        if not self.api_key:
            print("IEEE API key not found. Please set IEEE_API_KEY environment variable.")
            return
            
        print(f"Searching IEEE Xplore for: graphene applications")
        print(f"Using API key: {self.api_key[:5]}...")  # Print first 5 chars for verification
        
        try:
            # Split into smaller batches to handle rate limits
            batch_size = 10
            for start in range(0, num_results, batch_size):
                if not self._wait_for_rate_limit():
                    break
                    
                params = {
                    'apikey': self.api_key,
                    'querytext': 'graphene applications',
                    'max_records': min(batch_size, num_results - start),
                    'start_record': start + 1,
                    'sort_order': 'desc',
                    'sort_field': 'publication_year',
                    'abstract': 'true'
                }
                
                print(f"\nFetching batch {start//batch_size + 1} (records {start+1} to {start+batch_size})...")
                
                response = requests.get(self.base_url, params=params)
                
                if response.status_code == 200:
                    data = response.json()
                    articles = data.get('articles', [])
                    total_found = data.get('total_records', 0)
                    
                    print(f"Found {len(articles)} papers in this batch")
                    
                    for article in articles:
                        try:
                            authors = article.get('authors', {'authors': []}).get('authors', [])
                            author_names = [f"{author.get('full_name', '')}" for author in authors]
                            
                            paper_data = {
                                'title': article.get('title', ''),
                                'authors': '; '.join(author_names),
                                'abstract': article.get('abstract', ''),
                                'published_date': article.get('publication_date', ''),
                                'doi': article.get('doi', ''),
                                'publisher': article.get('publisher', ''),
                                'document_type': article.get('content_type', ''),
                                'conference': article.get('conference_location', ''),
                                'citations': article.get('citing_paper_count', 0),
                                'keywords': '; '.join(article.get('index_terms', {}).get('ieee_terms', {}).get('terms', [])),
                                'url': f"https://doi.org/{article.get('doi', '')}" if article.get('doi') else '',
                                'collection_date': datetime.now().isoformat(),
                                'source': 'IEEE'
                            }
                            
                            self.data.append(paper_data)
                            print(f"Collected: {paper_data['title'][:100]}...")
                            
                        except Exception as e:
                            print(f"Error processing paper: {str(e)}")
                            continue
                            
                elif response.status_code == 403:
                    print(f"Authentication Error (403). Full response:")
                    print(f"URL: {response.url}")
                    print(f"Response: {response.text}")
                    print("Please verify your API key is correct and active")
                    break
                else:
                    print(f"Error: API returned status code {response.status_code}")
                    print(f"Response: {response.text}")
                    break
                    
                # Wait between batches
                time.sleep(1)
                
        except Exception as e:
            print(f"Error in search: {str(e)}")
            logging.error(f"Error in search: {str(e)}")
            
    def save_data(self, filename='ieee_papers.csv'):
        """
        Save collected data to CSV
        """
        if not self.data:
            print("No data to save!")
            return None
            
        try:
            df = pd.DataFrame(self.data)
            output_path = os.path.join(DATA_DIR, filename)
            df.to_csv(output_path, index=False)
            print(f"Successfully saved {len(self.data)} papers to {output_path}")
            return df
            
        except Exception as e:
            print(f"Error saving data: {str(e)}")
            return None

def main():
    collector = IEEECollector()
    
    print("Starting IEEE data collection...")
    collector.search_papers(num_results=50)
    
    print("\nSaving data...")
    df = collector.save_data()
    
    if df is not None:
        print("\nCollection Summary:")
        print(f"Total papers collected: {len(df)}")
        if len(df) > 0:
            print(f"Date range: {df['published_date'].min()} to {df['published_date'].max()}")
            print(f"Document types: {df['document_type'].value_counts().to_dict()}")

if __name__ == "__main__":
    main()

Starting IEEE data collection...
Searching IEEE Xplore for: graphene applications
Using API key: z3fen...

Fetching batch 1 (records 1 to 10)...
Authentication Error (403). Full response:
URL: http://ieeexploreapi.ieee.org/api/v1/search/articles?apikey=z3fencvfv77wmnwxp9m74ju8&querytext=graphene+applications&max_records=10&start_record=1&sort_order=desc&sort_field=publication_year&abstract=true
Response: <h1>Developer Inactive</h1>
Please verify your API key is correct and active

Saving data...
No data to save!


In [2]:

load_dotenv()
print(f"API Key: {os.getenv('IEEE_API_KEY', 'Not found')}")

API Key: z3fencvfv77wmnwxp9m74ju8
