In [38]:
!pip install scholarly fuzzywuzzy python-docx pandas requests python-Levenshtein lxml openpyxl --quiet

# 1. Import Libraries
import pandas as pd
import requests
import xml.etree.ElementTree as ET
from scholarly import scholarly
from fuzzywuzzy import process
from docx import Document
import time
import logging
from typing import List, Dict
from google.colab import files
import io
import concurrent.futures
import warnings
warnings.filterwarnings("ignore")

In [39]:
# 2. Configuration & Logging
# Root logging is set to INFO so the per-faculty progress messages emitted
# through `logger` are visible in the notebook output.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Shared tunables, looked up by key from the functions in this file.
CONFIG = dict(
    # Not referenced by the code visible in this file.
    MAX_RETRIES=3,
    # Not referenced by the code visible in this file.
    REQUEST_DELAY=2.5,
    # Endpoint queried by fetch_dblp_publications.
    DBLP_API="https://dblp.org/search/publ/api",
    # Minimum fuzzy-match score for an author name to count as a match.
    FUZZY_THRESHOLD=85,
    # Number of worker threads used to process faculty rows.
    THREAD_POOL_SIZE=4,
)

# 3. Data Fetching Modules
def fetch_dblp_publications(name: str, start_year: int, end_year: int) -> List[Dict]:
    """Query the DBLP publication search API and keep hits inside a year range.

    Args:
        name: Search string sent as the ``q`` parameter (the faculty name).
        start_year: First year to keep (inclusive).
        end_year: Last year to keep (inclusive).

    Returns:
        A list of dicts with the keys ``title`` (str), ``authors`` (list of
        str), ``year`` (int), ``venue`` (str) and ``source`` (always
        ``"DBLP"``). Text fields are never ``None``.

    This function is best-effort: any failure (network, HTTP status, XML
    parsing, ...) is logged and whatever was collected so far is returned,
    so a single faculty lookup cannot take down the calling worker thread.
    """
    publications = []
    try:
        params = {"q": name, "format": "xml", "h": 1000}
        response = requests.get(CONFIG["DBLP_API"], params=params, timeout=10)
        response.raise_for_status()

        root = ET.fromstring(response.content)
        for hit in root.findall(".//hit"):
            # A missing, empty or non-numeric <year> only disqualifies this
            # hit; it must not abort the loop and drop every later hit.
            try:
                year = int(hit.findtext(".//year") or "")
            except ValueError:
                continue
            if not start_year <= year <= end_year:
                continue

            # itertext() also collects text nested inside child markup, and
            # yields "" (not None) for an element without any text.
            title_elem = hit.find(".//title")
            title = "".join(title_elem.itertext()) if title_elem is not None else ""

            publications.append({
                "title": title,
                # Drop empty <author/> elements so downstream joins and
                # fuzzy matching only ever see real strings.
                "authors": [a.text for a in hit.findall(".//author") if a.text],
                "year": year,
                # findtext returns "" for an empty element, never None.
                "venue": hit.findtext(".//venue", default=""),
                "source": "DBLP",
            })
        logger.info(f"DBLP: Found {len(publications)} publications for {name}")
    except Exception as e:
        # Deliberately broad: see the best-effort note in the docstring.
        logger.error(f"DBLP Error for {name}: {str(e)}")
    return publications

# 4. Data Processing

def fuzzy_match_author(target_name: str, authors: List[str]) -> bool:
    """Tell whether any entry of *authors* fuzzily matches *target_name*.

    The best candidate is picked with ``process.extractOne``; it counts as a
    match when its score reaches ``CONFIG["FUZZY_THRESHOLD"]``. When no
    candidate is returned the result is ``False``.
    """
    candidate = process.extractOne(target_name, authors)
    if not candidate:
        return False
    # Index 1 of the returned tuple is the similarity score.
    score = candidate[1]
    return score >= CONFIG["FUZZY_THRESHOLD"]

def process_publications(raw_data: List[Dict], faculty_name: str) -> pd.DataFrame:
    """Filter raw publication records to one faculty member and tabulate them.

    Args:
        raw_data: Records with the keys ``title``, ``authors``, ``year``,
            ``venue`` and ``source``.
        faculty_name: Name that must fuzzily match one of a record's authors
            for the record to be kept.

    Returns:
        A DataFrame with the columns Title, Authors, Year, Venue, Type and
        Source. The columns are present even when no record matched, so
        callers can index them on an empty result.
    """
    columns = ["Title", "Authors", "Year", "Venue", "Type", "Source"]
    # Substrings of the lower-cased venue that mark a record as "Conference";
    # everything else is labelled "Journal".
    conference_markers = ("conf", "workshop", "symposium")

    processed = []
    for pub in raw_data:
        # Missing author names (None / "") would break both the fuzzy
        # matcher and the join below, so discard them up front.
        authors = [author for author in pub["authors"] if author]
        if not fuzzy_match_author(faculty_name, authors):
            continue

        # A record may carry venue/title as None; normalise to "" so that
        # .lower() cannot raise and the table never shows a None value.
        venue = pub["venue"] or ""
        venue_lower = venue.lower()
        is_conference = any(marker in venue_lower for marker in conference_markers)

        processed.append({
            "Title": pub["title"] or "",
            "Authors": ", ".join(authors),
            "Year": pub["year"],
            "Venue": venue,
            "Type": "Conference" if is_conference else "Journal",
            "Source": pub["source"],
        })
    return pd.DataFrame(processed, columns=columns)

# 5. Export Modules

# Characters that are not allowed in a worksheet title: \ / ? * [ ] :
_INVALID_SHEET_CHARS = str.maketrans("", "", "\\/?*[]:")
# Excel's maximum worksheet title length.
_MAX_SHEET_NAME_LEN = 31


def _unique_sheet_name(raw_name, used):
    """Return a valid worksheet title for *raw_name* not yet present in *used*.

    *used* is a set of lower-cased titles already handed out; the chosen
    title is added to it.
    """
    cleaned = str(raw_name).translate(_INVALID_SHEET_CHARS)
    # Titles may not start or end with an apostrophe; surrounding whitespace
    # is dropped as well. Fall back to a generic name if nothing is left.
    base = cleaned[:_MAX_SHEET_NAME_LEN].strip().strip("'") or "Sheet"

    candidate = base
    counter = 2
    # Sheet titles are compared case-insensitively, hence the lower().
    while candidate.lower() in used:
        suffix = f" ({counter})"
        candidate = base[:_MAX_SHEET_NAME_LEN - len(suffix)] + suffix
        counter += 1
    used.add(candidate.lower())
    return candidate


def export_to_excel(df_dict: Dict[str, pd.DataFrame], filename: str):
    """Write each faculty's DataFrame to its own sheet of one Excel workbook.

    Args:
        df_dict: Mapping of faculty name to that faculty's publications.
        filename: Path of the ``.xlsx`` file to create.

    Sheet titles are derived from the faculty names: invalid characters are
    removed, the title is limited to 31 characters, and a numeric suffix is
    appended when two names would otherwise end up on the same sheet (which
    would make the later one overwrite the earlier one).
    """
    used_names = set()
    with pd.ExcelWriter(filename, engine='openpyxl') as writer:
        for faculty, df in df_dict.items():
            sheet_name = _unique_sheet_name(faculty, used_names)
            df.to_excel(writer, sheet_name=sheet_name, index=False)

def export_to_word(df_dict: Dict[str, pd.DataFrame], filename: str):
    """Build a Word report with one section per faculty and save it.

    For every entry of *df_dict* the document gets a level-1 heading with the
    faculty name, a short summary paragraph (publication count and the
    min/max of the ``Year`` column), and a five-column table listing Title,
    Year, Venue, Type and Source of each publication. The finished document
    is written to *filename*.
    """
    column_titles = ["Title", "Year", "Venue", "Type", "Source"]
    report = Document()

    for faculty, df in df_dict.items():
        report.add_heading(faculty, level=1)

        # Summary paragraph: a bold caption followed by two plain lines.
        overview = report.add_paragraph()
        caption_run = overview.add_run("Publication Summary\n")
        caption_run.bold = True
        overview.add_run(f"Total Publications: {len(df)}\n")
        overview.add_run(f"Time Period: {df['Year'].min()} - {df['Year'].max()}\n")

        # The table starts with a single row that serves as the header.
        table = report.add_table(rows=1, cols=len(column_titles), style="Light Shading")
        for header_cell, caption in zip(table.rows[0].cells, column_titles):
            header_cell.text = caption

        # One appended row per publication.
        for _, record in df.iterrows():
            values = (
                str(record["Title"]),
                str(record["Year"]),
                str(record["Venue"]),
                record["Type"],
                record["Source"],
            )
            for cell, value in zip(table.add_row().cells, values):
                cell.text = value

    report.save(filename)

# 6. Main Workflow

def process_faculty_row(row):
    """Fetch and tabulate the publications for one row of the input sheet.

    *row* must provide the entries ``"Faculty Name"``, ``"Start Year"`` and
    ``"End Year"``. Returns a ``(faculty_name, DataFrame)`` pair where the
    DataFrame is the output of ``process_publications`` for the DBLP records
    fetched for that name and year range.
    """
    name, first_year, last_year = (
        row[column] for column in ("Faculty Name", "Start Year", "End Year")
    )

    logger.info(f"Processing: {name}")
    records = fetch_dblp_publications(name, first_year, last_year)
    return name, process_publications(records, name)


def main_workflow():
    """Run the interactive end-to-end workflow in Google Colab.

    Steps: prompt for an Excel upload, read it, ask for the output format,
    process every faculty row on a thread pool, export the non-empty results
    and trigger a browser download of the generated file.

    All user input (uploaded file, required columns, output format) is
    validated *before* any publication data is fetched, so a typo in the
    format no longer wastes a full round of network requests.

    Raises:
        Exception: Any unexpected error is logged as fatal and re-raised.
    """
    required_columns = ("Faculty Name", "Start Year", "End Year")
    # Output format -> (generated file name, export function).
    exporters = {
        "excel": ("Faculty_Publications.xlsx", export_to_excel),
        "word": ("Faculty_Report.docx", export_to_word),
    }

    try:
        print("\n📁 Please upload the Excel file with columns: Faculty Name, Start Year, End Year")
        uploaded = files.upload()
        if not uploaded:
            # Upload dialog was cancelled; nothing to process.
            logger.error("No file was uploaded.")
            return
        filename = next(iter(uploaded))
        df_input = pd.read_excel(io.BytesIO(uploaded[filename]))

        missing = [col for col in required_columns if col not in df_input.columns]
        if missing:
            logger.error(f"Input file is missing required column(s): {', '.join(missing)}")
            return

        output_format = input("Enter output format (Excel/Word): ").strip().lower()
        if output_format not in exporters:
            logger.error("Invalid output format. Choose either 'Excel' or 'Word'.")
            return
        output_file, exporter = exporters[output_format]

        result_dfs = {}
        with concurrent.futures.ThreadPoolExecutor(max_workers=CONFIG["THREAD_POOL_SIZE"]) as executor:
            future_to_name = {
                executor.submit(process_faculty_row, row): row["Faculty Name"]
                for _, row in df_input.iterrows()
            }
            # Collect in submission order (dicts keep insertion order) so the
            # report follows the input sheet instead of thread timing.
            for future, faculty_name in future_to_name.items():
                try:
                    faculty, df = future.result()
                except Exception as e:
                    # One bad row must not discard everyone else's results.
                    logger.error(f"Failed to process {faculty_name}: {e}")
                    continue
                if not df.empty:
                    result_dfs[faculty] = df

        if not result_dfs:
            logger.warning("No publication data was found for any faculty.")
            return

        exporter(result_dfs, output_file)

        print("\n✅ Processing completed. File ready for download.")
        files.download(output_file)

    except Exception as e:
        logger.error(f"Fatal error: {str(e)}")
        raise

# 7. Execution

# Start the interactive workflow only when this code runs as the main module
# (script or notebook cell), not when it is imported.
if __name__ == "__main__":
    main_workflow()


📁 Please upload the Excel file with columns: Faculty Name, Start Year, End Year


Saving Faculty Details.xlsx to Faculty Details.xlsx
Enter output format (Excel/Word): excel

✅ Processing completed. File ready for download.


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>