In [1]:
# scopus_collector.py
import requests
import pandas as pd
from datetime import datetime
import time
import logging
import os
from dotenv import load_dotenv

# Load environment variables (python-dotenv); SCOPUS_API_KEY is read from the
# environment later via os.getenv.
load_dotenv()

# Directory that receives the raw CSV output. It can be overridden with the
# SCOPUS_DATA_DIR environment variable so the script is not tied to a single
# machine; the default is the original hard-coded location.
DATA_DIR = os.getenv(
    'SCOPUS_DATA_DIR',
    '/Users/adamgeorghiou/Desktop/GIM/Project/data/raw',
)
os.makedirs(DATA_DIR, exist_ok=True)

# Set up logging: INFO and above go to scopus_collection.log in the current
# working directory.
logging.basicConfig(
    filename='scopus_collection.log',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

class ScopusCollector:
    """Collect graphene-related paper metadata from the Scopus Search API.

    Records accumulate in ``self.data`` (a list of dicts) across calls to
    ``search_papers``; ``save_data`` writes them to CSV under ``DATA_DIR`` and
    ``analyze_data`` prints summary statistics for a DataFrame of records.
    """

    # Page size requested per call (the original code notes that Scopus
    # allows up to 25 results per query).
    BATCH_SIZE = 25
    # Seconds to wait for the API before giving up, so a stalled connection
    # cannot hang the collection forever.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        """Read the API key from SCOPUS_API_KEY and prepare request headers."""
        self.data = []
        self.api_key = os.getenv('SCOPUS_API_KEY')
        self.base_url = "https://api.elsevier.com/content/search/scopus"
        self.headers = {
            'X-ELS-APIKey': self.api_key,
            'Accept': 'application/json'
        }

    @staticmethod
    def _parse_entry(entry):
        """Convert one search-result entry (a dict) into a flat paper record.

        Missing or ``None`` fields become empty strings, and a missing or
        non-numeric citation count becomes 0, so one sparse entry does not
        cause the whole paper to be dropped.
        """
        author_names = []
        for author in entry.get('author') or []:
            given = author.get('given-name') or ''
            surname = author.get('surname') or ''
            # NOTE(review): 'authname' is assumed to be the API's combined
            # name field; it is used only when both name parts are absent.
            full_name = f"{given} {surname}".strip() or (author.get('authname') or '')
            if full_name:
                author_names.append(full_name)

        try:
            citations = int(entry.get('citedby-count') or 0)
        except (TypeError, ValueError):
            citations = 0

        return {
            'title': entry.get('dc:title') or '',
            'authors': '; '.join(author_names),
            'abstract': entry.get('dc:description') or '',
            'published_date': entry.get('prism:coverDate') or '',
            'doi': entry.get('prism:doi') or '',
            'journal': entry.get('prism:publicationName') or '',
            'volume': entry.get('prism:volume') or '',
            'issue': entry.get('prism:issueIdentifier') or '',
            'pages': entry.get('prism:pageRange') or '',
            'citations': citations,
            'keywords': entry.get('authkeywords') or '',
            'url': entry.get('prism:url') or '',
            'collection_date': datetime.now().isoformat(),
            'source': 'Scopus'
        }

    def search_papers(self, num_results=50):
        """
        Search Scopus for graphene-related papers

        Fetches up to ``num_results`` records in pages of ``BATCH_SIZE`` and
        appends one dict per paper to ``self.data``. Returns ``None``.
        Paging stops early on a non-200 response, on an empty page, or once
        the reported total number of results has been reached. Errors are
        printed and logged rather than raised.
        """
        if not self.api_key:
            print("Scopus API key not found. Please set SCOPUS_API_KEY environment variable.")
            return

        print(f"Searching Scopus for: graphene applications")
        # Only a short prefix of the key is shown, never the full secret.
        print(f"Using API key: {self.api_key[:5]}...")

        try:
            for start in range(0, num_results, self.BATCH_SIZE):
                count = min(self.BATCH_SIZE, num_results - start)
                params = {
                    'query': 'TITLE-ABS-KEY(graphene applications)',
                    'count': count,
                    'start': start,
                    'sort': '-coverDate',
                    'view': 'COMPLETE'  # Get complete record information
                }

                # Report the range actually requested (the last batch may be
                # smaller than BATCH_SIZE).
                batch_number = start // self.BATCH_SIZE + 1
                print(f"\nFetching batch {batch_number} (records {start + 1} to {start + count})...")

                response = requests.get(
                    self.base_url,
                    headers=self.headers,
                    params=params,
                    timeout=self.REQUEST_TIMEOUT
                )

                if response.status_code == 401:
                    print("Authentication Error (401). Please verify your API key is correct and active")
                    print(f"Response: {response.text}")
                    logging.error("Authentication error (401) from Scopus API")
                    break
                if response.status_code != 200:
                    print(f"Error: API returned status code {response.status_code}")
                    print(f"Response: {response.text}")
                    logging.error(f"Scopus API returned status code {response.status_code}")
                    break

                results = response.json().get('search-results', {})
                # NOTE(review): an empty result set appears to come back as a
                # single placeholder entry carrying an 'error' key; such
                # entries are skipped so they are not stored as blank papers.
                entries = [
                    entry for entry in results.get('entry') or []
                    if 'error' not in entry
                ]
                total_results = int(results.get('opensearch:totalResults') or 0)

                print(f"Found {len(entries)} papers in this batch (Total available: {total_results})")

                if not entries:
                    # Nothing more to page through.
                    break

                for entry in entries:
                    try:
                        paper_data = self._parse_entry(entry)
                    except (AttributeError, TypeError, ValueError) as e:
                        print(f"Error processing paper: {str(e)}")
                        logging.warning(f"Error processing paper: {str(e)}")
                        continue

                    self.data.append(paper_data)
                    print(f"Collected: {paper_data['title'][:100]}...")

                if start + count >= total_results:
                    # The API has no further records for this query.
                    break

                # Rate limiting - Scopus allows 6 requests per second
                time.sleep(0.2)

        except Exception as e:
            # Top-level boundary: network failures, timeouts and malformed
            # JSON are reported and logged instead of crashing the caller.
            print(f"Error in search: {str(e)}")
            logging.exception(f"Error in search: {str(e)}")

    def save_data(self, filename='scopus_papers.csv'):
        """
        Save collected data to CSV

        Writes every record in ``self.data`` to ``DATA_DIR/filename`` and a
        reduced set of columns to ``DATA_DIR/scopus_papers_summary.csv``.
        Returns the full DataFrame, or ``None`` when there is nothing to
        save or the save fails.
        """
        if not self.data:
            print("No data to save!")
            return None

        try:
            df = pd.DataFrame(self.data)
            output_path = os.path.join(DATA_DIR, filename)
            df.to_csv(output_path, index=False)
            print(f"Successfully saved {len(self.data)} papers to {output_path}")

            # Save a summary version with key fields
            summary_df = df[['title', 'authors', 'published_date', 'journal', 'citations', 'url']]
            summary_path = os.path.join(DATA_DIR, 'scopus_papers_summary.csv')
            summary_df.to_csv(summary_path, index=False)
            print(f"Saved summary version to {summary_path}")

            return df

        except Exception as e:
            print(f"Error saving data: {str(e)}")
            logging.exception(f"Error saving data: {str(e)}")
            return None

    def analyze_data(self, df):
        """
        Analyze the collected data

        Prints the record count, publication date range, citation
        statistics, most frequent journals and most frequent authors.
        Works on a copy, so the caller's DataFrame is left unmodified.
        Returns ``None``; does nothing when ``df`` is ``None`` or empty.
        """
        if df is None or df.empty:
            return

        # Work on a copy so the type conversions below do not overwrite
        # columns of the DataFrame owned by the caller.
        df = df.copy()

        print("\nCollection Summary:")
        print(f"Total papers collected: {len(df)}")

        # Date range: unparseable or blank dates are ignored rather than
        # aborting the whole analysis.
        dates = pd.to_datetime(df['published_date'], errors='coerce').dropna()
        if dates.empty:
            print("Date range: unavailable")
        else:
            print(f"Date range: {dates.min().date()} to {dates.max().date()}")

        # Citation statistics with error handling
        print(f"\nCitation Statistics:")
        try:
            # Clean citations data - non-numeric values count as 0
            df['citations'] = pd.to_numeric(df['citations'], errors='coerce')
            df['citations'] = df['citations'].fillna(0).astype(int)

            total_citations = int(df['citations'].sum())
            avg_citations = df['citations'].mean()

            print(f"Total citations: {total_citations:,}")
            print(f"Average citations per paper: {avg_citations:.2f}")

            most_cited_idx = df['citations'].idxmax()
            most_cited_title = df.loc[most_cited_idx, 'title']
            most_cited_count = int(df.loc[most_cited_idx, 'citations'])
            print(f"Most cited paper: {most_cited_title} ({most_cited_count:,} citations)")
        except (KeyError, TypeError, ValueError) as e:
            print(f"Error processing citation statistics: {str(e)}")

        # Journal statistics
        print(f"\nTop Journals:")
        print(df['journal'].value_counts().head())

        # Author statistics: missing author cells are treated as empty so
        # the split below never sees a non-string value.
        author_lists = df['authors'].fillna('').astype(str).str.split(';')
        all_authors = [
            name.strip()
            for names in author_lists
            for name in names
            if name.strip()
        ]
        top_authors = pd.Series(all_authors, dtype=object).value_counts().head()
        print(f"\nTop Authors:")
        print(top_authors)

def main():
    """Run the full pipeline: collect papers, write the CSVs, print a summary."""
    scopus = ScopusCollector()

    print("Starting Scopus data collection...")
    scopus.search_papers(num_results=50)

    print("\nSaving data...")
    papers = scopus.save_data()

    # save_data returns None when nothing was collected or the save failed;
    # in that case there is nothing to analyse.
    if papers is None:
        return
    scopus.analyze_data(papers)


if __name__ == "__main__":
    main()

Starting Scopus data collection...
Searching Scopus for: graphene applications
Using API key: 4bbbd...

Fetching batch 1 (records 1 to 25)...
Found 25 papers in this batch (Total available: 94883)
Collected: Three different methods for ZnO-RGO nanocomposite synthesis and its adsorption capacity for methylen...
Collected: Zinc oxide nanoparticles decorated nitrogen doped porous reduced graphene oxide-based hybrid to sens...
Collected: Two-dimensional anion-rich NaCl<inf>2</inf> crystal under ambient conditions...
Collected: Antimicrobial properties of Graphene sheets embedded with Titanium Oxide and Calcium Oxide nanoparti...
Collected: Fabrication of promising competitive graphene nanocomposite transducer to determine Prucalopride suc...
Collected: Retraction Note: Functionalized graphene oxide nanosheets with folic acid and silk fibroin as a nove...
Collected: Significance of Marangoni convection in ethylene glycol base hybrid nanofluid flow with viscous diss...
Collected: Amelioratio