In [52]:
import numpy as np 
import pandas as pd
import requests
from PyPDF2 import PdfReader
from bs4 import BeautifulSoup
from urllib.parse import unquote
import getpass
import re
import tqdm
import os
from fpdf import FPDF
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score

In [2]:
df = pd.read_csv('data.csv')
print(df.head)

<bound method NDFrame.head of                                                  Name  \
0   Recent advances in ammonia synthesis technolog...   
1   Co doping regulating electronic structure of B...   
2   A Prompt Decarbonization Pathway for Shipping:...   
3   Utilization of green ammonia as a hydrogen ene...   
4   Defective TiO2-x for High-Performance Electroc...   
..                                                ...   
77  A comprehensive study of renewable energy sour...   
78  Data-driven probabilistic machine learning in ...   
79  Semi-supervised adversarial discriminative lea...   
80  Model Predictive Current Control of Nine-Phase...   
81  Risk-constrained stochastic scheduling for ene...   

                                                 DOI  
0   http://dx.doi.org/10.1016/j.ijhydene.2022.09.061  
1     http://dx.doi.org/10.1016/j.apcatb.2023.123057  
2            http://dx.doi.org/10.3390/atmos14030584  
3   http://dx.doi.org/10.1016/j.ijhydene.2023.04.073  
4         

In [3]:
# def fetch_abstract(doi):
#     url = f"https://api.crossref.org/works/{doi}"
#     response = requests.get(url)
#     if response.status_code == 200:
#         data = response.json()
#         return data['message'].get('abstract', 'fail')

In [4]:
import csv
import time

ELSEVIER_API_KEY

def get_abstract_from_crossref(doi):
    doi = doi[18:]
    url = f"https://api.crossref.org/works/{doi}"
    response = requests.get(url)
    if response.status_code == 200:
        data = response.json()
        return data['message'].get('abstract')
    return None

def get_abstract_from_elsevier(doi):
    doi = doi[18:]
    url = f"https://api.elsevier.com/content/article/doi/{doi}?httpAccept=application/json"
    headers = {
        "X-ELS-APIKey": ELSEVIER_API_KEY,
        "Accept": "application/json"
    }
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        data = response.json()
        return data.get('full-text-retrieval-response', {}).get('coredata', {}).get('dc:description')
    return None

def get_abstract(doi):
    abstract = get_abstract_from_crossref(doi)
    if abstract:
        return abstract
    else:
        abstract = get_abstract_from_elsevier(doi)
        if abstract:
            return abstract
    return None

papers = []
with open('data.csv', 'r') as file:
    csv_reader = csv.reader(file)
    next(csv_reader)
    for row in csv_reader:
        papers.append({'Name': row[0], 'DOI': row[1]})

for paper in papers:
    paper['abstract'] = get_abstract(paper['DOI'])
    print(f"Fetched abstract for: {paper['Name']}")
    time.sleep(2) 

with open('papers_with_abstracts.csv', 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(['Name','abstract'])
    for paper in papers:
        writer.writerow([paper['Name'], paper['abstract']])

print("Done! Check papers_with_abstracts.csv for results.")

Fetched abstract for: Recent advances in ammonia synthesis technologies: Toward future zero carbon emissions
Fetched abstract for: Co doping regulating electronic structure of Bi2MoO6 to construct dual active sites for photocatalytic nitrogen fixation
Fetched abstract for: A Prompt Decarbonization Pathway for Shipping: Green Hydrogen, Ammonia, and Methanol Production and Utilization in Marine Engines
Fetched abstract for: Utilization of green ammonia as a hydrogen energy carrier for decarbonization in spark ignition engines
Fetched abstract for: Defective TiO2-x for High-Performance Electrocatalytic NO Reduction toward Ambient NH3 Production
Fetched abstract for: Techno-economic assessment of green hydrogen and ammonia production from wind and solar energy in Iran
Fetched abstract for: Estimating global production and supply costs for green hydrogen and hydrogen-based green energy commodities
Fetched abstract for: A Perspective on the Overarching Role of Hydrogen, Ammonia, and Methanol

In [5]:
df2 = pd.read_csv('papers_with_abstracts.csv')

In [6]:
df2.loc[df2['Name'] == 'Recent advances in ammonia synthesis technologies: Toward future zero carbon emissions']['abstract']

0    \n                  As a carbon-free molecule,...
Name: abstract, dtype: object

In [7]:
df2['abstract'].value_counts()

abstract
Photovoltaic (PV) technology, as a low-carbon energy technology, is crucial to mitigating climate change and achieving sustainable development. China has the largest total number of PV technology patents in the world, but the lack of core technologies has restricted the further innovative development of China's PV industry. Therefore, it is necessary to clarify China's current PV technology accumulation to better catch up with key technology areas. To clearly describe the structural characteristics of China's PV technology innovation network, this study uses China's patent PV technology data over the past 20 years from the Incopat global patent database and analyses the structural characteristics of the network from the perspectives of one-mode and two-mode networks, using method of social network analysis (SNA). The results show that 1) the leading PV enterprises have basically formed relatively stable internal collaborations and that the scale of innovation network developme

In [8]:
df2 = df2.dropna(subset=['abstract'])
df2 = df2.reset_index(drop=True)

In [230]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/arpanmukhopadhyay/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/arpanmukhopadhyay/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/arpanmukhopadhyay/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [468]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))


def preprocess_text(text):
    if pd.isna(text):
        return None

    text = text.lower()
    
    text = re.sub(r'<jats:[^>]+>|<\/jats:[^>]+>', '', text)
    text = re.sub(r'[/\-]', ' ', text)
    text = re.sub(r'abstract',' ',text)
    text = re.sub(r'\S*@\S*\s?', '', text)
    text = re.sub(r'\b(?!\d{4}\b)\d+\b', '', text)
    text = re.sub(r'[^\w\s\[\]]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()

    tokens = word_tokenize(text)
    
    tokens = [token for token in tokens if token not in stop_words]
    
    processed_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    return ' '.join(processed_tokens)

df2['processed abstracts'] = df2['abstract'].apply(preprocess_text)

In [469]:
df2.iloc[4]['abstract']

'<jats:title>Abstract</jats:title><jats:p>Synthesis of green ammonia (NH<jats:sub>3</jats:sub>) via electrolysis of nitric oxide (NO) is extraordinarily sustainable, but multielectron/proton‐involved hydrogenation steps as well as low concentrations of NO can lead to poor activities and selectivities of electrocatalysts. Herein, it is reported that oxygen‐defective TiO<jats:sub>2</jats:sub> nanoarray supported on Ti plate (TiO<jats:sub>2−</jats:sub><jats:italic><jats:sub>x</jats:sub></jats:italic>/TP) behaves as an efficient catalyst for NO reduction to NH<jats:sub>3</jats:sub>. In 0.2\xa0<jats:sc>m</jats:sc> phosphate‐buffered electrolyte, such TiO<jats:sub>2−</jats:sub><jats:italic><jats:sub>x</jats:sub></jats:italic>/TP shows competitive electrocatalytic NH<jats:sub>3</jats:sub> synthesis activity with a maximum NH<jats:sub>3</jats:sub> yield of 1233.2\xa0µg\xa0h<jats:sup>−1</jats:sup>\xa0cm<jats:sup>−2</jats:sup> and Faradaic efficiency of 92.5%. Density functional theory calculati

In [470]:
df2.iloc[4]['processed abstracts']

'synthesis green ammonia nh3 via electrolysis nitric oxide extraordinarily sustainable multielectron protoninvolved hydrogenation step well low concentration lead poor activity selectivity electrocatalysts herein reported oxygendefective tio2 nanoarray supported ti plate tio2x tp behaves efficient catalyst reduction nh3 phosphatebuffered electrolyte tio2x tp show competitive electrocatalytic nh3 synthesis activity maximum nh3 yield 1233 µg h cm faradaic efficiency density functional theory calculation thermodynamically faster deoxygenation protonation process tio2x compared perfect tio2 low energy barrier ev tio2x potentialdetermining step highlight greatly improved intrinsic activity addition znno battery fabricated tio2x tp zn plate obtain nh3 yield µg h cm providing peak power density mw cm'

In [471]:
df2['processed abstracts']

0     carbon free molecule ammonia gained great glob...
1     although photocatalytic nitrogen reduction rea...
2     shipping industry reached higher level maturit...
3     rising concern dependence modern energy system...
4     synthesis green ammonia nh3 via electrolysis n...
                            ...                        
61    unpredictable nature renewable energy source p...
62    renewable energy key element sustainable envir...
63    current trend indicates energy demand supply e...
64    wind turbine play crucial role renewable energ...
65    research introduces stochastic scheduling appr...
Name: processed abstracts, Length: 66, dtype: object

In [472]:
from sklearn.feature_extraction.text import TfidfVectorizer

def vectorize_texts(texts):
    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform(texts)
    terms = vectorizer.get_feature_names_out()
    return vectors,terms

vectors, terms = vectorize_texts(df2['processed abstracts'])

print(vectors.shape)

(66, 2476)
