In [2]:
import os
import traceback
import requests
from bs4 import BeautifulSoup
from pybliometrics.scopus import ScopusSearch, AbstractRetrieval
import json
from tqdm import tqdm
import pandas as pd
from datetime import datetime, timedelta
import pybliometrics
pybliometrics.scopus.init()
import arxiv
import pandas as pd
from openai import OpenAI
from habanero import Crossref
import warnings
warnings.filterwarnings("ignore")

# Reporting window: the 30 days ending today, as ISO 'YYYY-MM-DD' strings.
# Both bounds are derived from a single clock reading so the window is always
# exactly 30 days wide, even if this cell happens to run across midnight.
_today = datetime.today()
end_date = _today.strftime('%Y-%m-%d')
start_date = (_today - timedelta(days=30)).strftime('%Y-%m-%d')


def retrieve_scopus_data():
    """Fetch recent Africa-related journal articles from Scopus, with abstracts.

    Runs a Scopus search limited to journal articles whose ORIG-LOAD-DATE lies
    between the module-level ``start_date`` and ``end_date``, keeps the records
    whose cover date also falls inside that window, and then looks each DOI up
    on Crossref to obtain an abstract.

    Returns:
        pandas.DataFrame: columns ``Title``, ``Author`` (the Scopus ``creator``
        field), ``Publication_Year`` (despite the name this holds the full
        Scopus cover date), ``Link`` and ``Abstract``. Records without a DOI,
        or for which no abstract could be obtained, are dropped. The frame is
        empty, but still has these columns, when nothing matches.
    """
    no_doi = "No DOI available"
    no_abstract = "No abstract available"
    columns = ["Title", "Author", "Publication_Year", "Link"]

    # Define the query
    # NOTE(review): `South Africa` is not quoted here, unlike
    # "Sub-Saharan Africa" -- confirm whether a phrase match was intended.
    # NOTE(review): confirm that ORIG-LOAD-DATE accepts 'YYYY-MM-DD' values.
    query = f'''(TITLE-ABS-KEY(Africa OR Nigeria OR Kenya OR Ghana OR South Africa OR Liberia OR Egypt OR Lagos OR Abuja OR Morocco OR Rwanda OR Senegal OR "Sub-Saharan Africa")) 
            AND (DOCTYPE(AR)) 
            AND (SRCTYPE(j)) 
            AND (ORIG-LOAD-DATE > {start_date} AND ORIG-LOAD-DATE < {end_date})'''

    # Perform Scopus search
    search = ScopusSearch(query=query, view="STANDARD", cursor=None, verbose=True)

    # `results` can be None rather than an empty list when the search matches
    # nothing, so fall back to an empty list before iterating.
    records = [
        {
            "Title": result.title,
            "Author": result.creator,
            "Publication_Year": result.coverDate,
            "Link": f"http://dx.doi.org/{result.doi}" if result.doi else no_doi,
        }
        for result in (search.results or [])
    ]

    # Passing `columns` keeps the expected schema even when there are no
    # records; otherwise the column lookups below would raise KeyError.
    scopus_df = pd.DataFrame(records, columns=columns)

    # Ensure only results within the date range (ISO date strings compare
    # chronologically as plain strings).
    scopus_df = scopus_df[
        (scopus_df["Publication_Year"] >= start_date) &
        (scopus_df["Publication_Year"] <= end_date)
    ].reset_index(drop=True)

    # Fetch abstracts using Crossref from Habanero
    cr = Crossref()
    abstracts = []

    for link in scopus_df["Link"]:
        if no_doi in link:
            abstracts.append(no_abstract)
            continue

        doi = link.split("doi.org/", 1)[1]
        try:
            paper = cr.works(ids=doi)
            abstract_raw = paper["message"].get("abstract", no_abstract)
            # Crossref abstracts carry markup; keep only the text content.
            text = BeautifulSoup(abstract_raw, "html.parser").get_text().strip()
            abstracts.append(text or no_abstract)
        except Exception:
            # Deliberate best-effort: a failed lookup for one DOI must not
            # abort the whole run; the row is simply dropped below.
            abstracts.append(no_abstract)

    scopus_df["Abstract"] = abstracts

    # Remove rows with no abstracts and renumber the remaining rows
    return scopus_df[scopus_df["Abstract"] != no_abstract].reset_index(drop=True)

def retrieve_arxiv_data(max_results=30):
    """Fetch the most recently submitted Africa-related papers from arXiv.

    Args:
        max_results: Maximum number of results requested from arXiv, sorted by
            submission date. The date filter is applied afterwards, so the
            returned frame can contain fewer rows.

    Returns:
        pandas.DataFrame: columns ``Title``, ``Author`` (author names joined
        with ", " so the column is a plain string like the Scopus one),
        ``Publication_Year`` (the ISO publication date as a string), ``Link``
        (the PDF URL) and ``Abstract``. Only papers published between the
        module-level ``start_date`` and ``end_date`` are kept. The frame is
        empty, but still has these columns, when nothing matches.
    """
    # Define the list of keywords to search for
    keywords = ["Africa", "Nigeria", "Kenya", "Ghana", "South Africa", "Liberia", "Egypt", "Lagos", "Abuja", "Morocco", "Rwanda", "Senegal", "Sub-Saharan Africa"]

    # Construct the OR-based search query. Multi-word keywords are wrapped in
    # double quotes so they are sent as phrases instead of loose words.
    query = " OR ".join(f'"{kw}"' if " " in kw else kw for kw in keywords)

    search = arxiv.Search(
        query=query,
        max_results=max_results,
        sort_by=arxiv.SortCriterion.SubmittedDate
    )

    records = [
        {
            "Title": result.title,
            "Author": ", ".join(author.name for author in result.authors),
            "Publication_Year": str(result.published.date()),
            "Link": result.pdf_url,
            "Abstract": result.summary,
        }
        for result in search.results()
    ]

    # Passing `columns` keeps the schema when arXiv returns nothing; otherwise
    # the column lookup below would raise KeyError on an empty frame.
    arxiv_df = pd.DataFrame(
        records,
        columns=["Title", "Author", "Publication_Year", "Link", "Abstract"],
    )

    # Keep only papers published inside the window (ISO date strings compare
    # chronologically as plain strings).
    in_window = (
        (arxiv_df["Publication_Year"] >= start_date)
        & (arxiv_df["Publication_Year"] <= end_date)
    )

    # Return the filtered frame with a fresh 0..n-1 index
    return arxiv_df[in_window].reset_index(drop=True)

# scopus_df = retrieve_scopus_data()
# arxiv_df = retrieve_arxiv_data()
def merge_dataframes():
    """Retrieve the Scopus and arXiv results and stack them into one frame.

    Returns:
        pandas.DataFrame: the Scopus rows followed by the arXiv rows, with a
        fresh 0..n-1 index.
    """
    scopus_df = retrieve_scopus_data()
    arxiv_df = retrieve_arxiv_data()

    # ignore_index=True renumbers the rows. The previous code called
    # reset_index() without keeping its result, so the returned frame still
    # carried duplicate index labels from the two sources.
    return pd.concat([scopus_df, arxiv_df], axis=0, ignore_index=True)

def categorize_topics():
    """Label every retrieved paper with a topic category chosen by OpenAI.

    Builds the merged Scopus/arXiv frame via ``merge_dataframes`` and sends one
    chat-completion request per title. The API key is read from the
    ``OPENAI_API_KEY`` environment variable.

    Returns:
        pandas.DataFrame: the merged frame plus a ``Topic_Category`` column,
        with a fresh 0..n-1 index. A title for which the model returns no text
        is labelled ``"Uncategorized"``.
    """
    df = merge_dataframes()
    # Set up OpenAI API client
    client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

    topic_category = []
    for title in df["Title"]:
        # Query OpenAI API for topic categorization
        completion = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {
                    "role": "system",
                    "content": "You are an academic assistant. Given a research paper title, classify it into a relevant academic category. Your response should only contain the category name."
                },
                {
                    "role": "user",
                    "content": f"Categorize the research paper titled: '{title}'. Only return the topic category."
                }
            ],
            temperature=0  # Ensures consistency in responses
        )

        # The message content is optional in the API response, so guard
        # against None before stripping, and never store an empty label.
        raw_category = completion.choices[0].message.content
        topic_category.append((raw_category or "").strip() or "Uncategorized")

    # Attach the topic categories to the dataframe (assigned by position)
    df["Topic_Category"] = topic_category

    # Return the dataframe with topic categories and a clean index
    return df.reset_index(drop=True)

def generate_newsletter():
    """Build, print and return a plain-text newsletter summarizing the papers.

    Calls ``categorize_topics`` to get the categorized papers, then asks OpenAI
    for a two-paragraph summary of each topic category, in order of first
    appearance. The API key is read from the ``OPENAI_API_KEY`` environment
    variable.

    Returns:
        str: the full newsletter text. It is also printed, as before;
        returning it lets the caller save it. In a notebook, end the call with
        ``;`` to avoid the text being echoed a second time as cell output.
    """
    # Load the dataframe
    df = categorize_topics()

    # Format newsletter content
    client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

    # Collect the pieces in a list and join once at the end.
    sections = [
        "Research Newsletter\n\n",
        "Welcome to this edition of our research newsletter, where we summarize notable research articles across various fields.\n\n",
    ]

    for count, topic in enumerate(df["Topic_Category"].unique(), start=1):
        categorized_df = df[df["Topic_Category"] == topic]

        # Extract information as lists
        titles = categorized_df["Title"].tolist()
        authors = categorized_df["Author"].tolist()
        abstracts = categorized_df["Abstract"].tolist()
        links = categorized_df["Link"].tolist()

        # Format input for OpenAI
        research_papers = "\n".join([f"- **{t}** by {a} ([Link]({l}))" for t, a, l in zip(titles, authors, links)])
        abstracts_text = "\n".join([f"{i+1}. {ab}" for i, ab in enumerate(abstracts)])

        completion = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {
                    "role": "system",
                    "content": """
                        You are an academic assistant summarizing research papers for a newsletter. 
                        Given a set of research titles, authors, abstracts, and links, 
                        create a concise yet comprehensive summary for each topic category.
                        
                        - The summary should be **two paragraphs long**.
                        - Incorporate key insights from all research papers under the category.
                        - Ensure clarity and engagement while maintaining academic rigor.
                        - Provide a smooth transition between key ideas.
                        - Use simple, professional language.
                    """
                },
                {
                    "role": "user",
                    "content": f"""
                        **Topic Category:** {topic}

                        **Research Papers:**
                        {research_papers}

                        **Abstracts:**
                        {abstracts_text}

                        Generate a well-structured summary that highlights the key insights from these research papers in two paragraphs.
                    """
                }
            ],
            temperature=0  # Ensures consistency
        )

        # The message content is optional in the API response; treat a missing
        # body as an empty summary instead of failing on None.strip().
        section_summary = (completion.choices[0].message.content or "").strip()
        sections.append(f"{count}) {topic}\n\n{section_summary}\n\n{'==' * 100}\n\n")

    newsletter_content = "".join(sections)

    # Print the newsletter and hand it back so the caller can save it
    print(newsletter_content)
    return newsletter_content


In [2]:
# Runs the whole pipeline: categorize_topics() retrieves the Scopus and arXiv
# data via merge_dataframes() and issues one OpenAI request per title. The
# resulting DataFrame is bound to `xx`.
xx = categorize_topics()

In [None]:
# if __name__ == "__main__":
#     # Call the function to categorize topics
#     df = categorize_topics()

#     # Export dataframe
#     df.to_csv("africa_research_papers.csv")