In [1]:
import requests
import pandas as pd
from datetime import datetime
import time
import logging
import os

# Create necessary directories
DATA_DIR = '/Users/adamgeorghiou/Desktop/GIM/Project/data/raw/semantic_scholar'
os.makedirs(DATA_DIR, exist_ok=True)

# Set up logging
logging.basicConfig(
    filename='data_collection.log',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

class GrapheneResearchCollector:
    def __init__(self):
        self.data = []
        self.base_url = "https://api.semanticscholar.org/graph/v1/paper/search"
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        
    def search_papers(self, num_results=100):
        """
        Search Semantic Scholar for graphene-related papers
        """
        query = "graphene applications"
        print(f"Searching for: {query}")
        
        try:
            # Parameters for the API request
            params = {
                'query': query,
                'limit': min(num_results, 100),  # API limit is 100 per request
                'fields': 'title,abstract,year,citationCount,authors,url'
            }
            
            response = requests.get(
                self.base_url,
                params=params,
                headers=self.headers
            )
            
            if response.status_code == 200:
                papers = response.json().get('data', [])
                print(f"Found {len(papers)} papers")
                
                for paper in papers:
                    try:
                        # Extract author names
                        authors = [author.get('name', '') for author in paper.get('authors', [])]
                        
                        paper_data = {
                            'title': paper.get('title', 'No title'),
                            'authors': ', '.join(authors),
                            'year': paper.get('year'),
                            'abstract': paper.get('abstract', 'No abstract'),
                            'citations': paper.get('citationCount', 0),
                            'url': paper.get('url', 'No URL'),
                            'collection_date': datetime.now().isoformat(),
                            'source': 'Semantic Scholar'
                        }
                        
                        self.data.append(paper_data)
                        print(f"Successfully collected paper: {paper_data['title'][:100]}...")
                        
                    except Exception as e:
                        print(f"Error processing paper: {str(e)}")
                        continue
                        
            else:
                print(f"Error: API returned status code {response.status_code}")
                print(f"Response: {response.text}")
                
        except Exception as e:
            print(f"Error in search: {str(e)}")
            logging.error(f"Error in search: {str(e)}")
            
    def save_data(self, filename='graphene_papers.csv'):
        """
        Save collected data to CSV
        """
        if not self.data:
            print("No data to save!")
            return
            
        try:
            df = pd.DataFrame(self.data)
            output_path = os.path.join(DATA_DIR, filename)
            df.to_csv(output_path, index=False)
            print(f"Successfully saved {len(self.data)} papers to {output_path}")
        except Exception as e:
            print(f"Error saving data: {str(e)}")
        
    def analyze_initial_data(self):
        """
        Basic analysis of collected data
        """
        if not self.data:
            return {"status": "No data collected"}
            
        try:
            df = pd.DataFrame(self.data)
            summary = {
                'total_papers': len(df),
                'citations_available': df['citations'].notna().sum(),
                'total_citations': df['citations'].fillna(0).sum(),
                'years_available': df['year'].notna().sum()
            }
            
            # Add year range if available
            years = df['year'].dropna()
            if not years.empty:
                summary['year_range'] = f"{int(years.min())} - {int(years.max())}"
            
            return summary
            
        except Exception as e:
            print(f"Error in analysis: {str(e)}")
            return {"error": str(e)}

def main():
    collector = GrapheneResearchCollector()
    
    print("Starting data collection...")
    collector.search_papers(num_results=10)  # Start with just 10 papers for testing
    
    print("\nSaving data...")
    collector.save_data()
    
    print("\nAnalyzing results...")
    summary = collector.analyze_initial_data()
    print("Data collection summary:", summary)

if __name__ == "__main__":
    main()

Starting data collection...
Searching for: graphene applications
Error: API returned status code 429
Response: {"message": "Too Many Requests. Please wait and try again or apply for a key for higher rate limits. https://www.semanticscholar.org/product/api#api-key-form", "code": "429"}

Saving data...
No data to save!

Analyzing results...
Data collection summary: {'status': 'No data collected'}
