# 1. Data Collection

### Import Required Libraries
Import the libraries used throughout the notebook: pandas for tabular data, json for parsing the raw records, and os for file-system access.

In [None]:
# Import the necessary libraries, including pandas and json
import pandas as pd
import json
import os

### Set the year to process

In [54]:
# Year to process. It selects the input folder (Project/<year>), the file-name
# prefix used to pick files, and the name of the exported CSV (output_<year>.csv).
year = 2021

In [55]:
# Define the path to the folder containing files
folder_path = f'Project/{year}'

# Append '.json' to every regular file whose name starts with the year and
# does not already carry that extension. The listing is sorted so the cell
# behaves the same way on every run (os.listdir order is arbitrary).
for filename in sorted(os.listdir(folder_path)):
    if not filename.startswith(str(year)) or filename.endswith('.json'):
        continue

    old_file_path = os.path.join(folder_path, filename)
    new_file_path = old_file_path + '.json'

    # Leave sub-directories (and anything else that is not a plain file) alone.
    if not os.path.isfile(old_file_path):
        continue

    # os.rename silently replaces an existing target on POSIX and raises on
    # Windows; skip instead so no data is lost and the cell never crashes here.
    if os.path.exists(new_file_path):
        print(f"Skipped {old_file_path}: {new_file_path} already exists")
        continue

    # Rename the file
    os.rename(old_file_path, new_file_path)

print("Renaming completed.")

Renaming completed.


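### Load JSON Files
Load every JSON file for the selected year from the project folder, extract the fields of interest from each record, and save the combined table to a CSV file.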

In [62]:
# Flattened field names to copy from each record. The order of this list is
# the column order of the resulting DataFrame, so it must not be shuffled.
columns_to_keep = [
    'coredata.srctype', 'coredata.eid',
    'coredata.dc:description', 'coredata.pubmed-id',
    'coredata.prism:coverDate', 'coredata.prism:aggregationType',
    'coredata.prism:url', 'coredata.source-id',
    'coredata.pii', 'coredata.citedby-count',
    'coredata.prism:volume', 'coredata.subtype',
    'coredata.dc:title', 'coredata.openaccess',
    'coredata.prism:issn', 'coredata.publishercopyright',
    'coredata.article-number', 'coredata.subtypeDescription',
    'coredata.prism:publicationName', 'coredata.prism:doi',
    'coredata.dc:identifier', 'coredata.dc:publisher',
    'item.bibrecord.head.citation-info.citation-language.@language',
]

# One dict per successfully parsed file; reset here so re-running the cell
# does not accumulate duplicate rows.
data_rows = []

# Folder holding the JSON files for the selected year
folder_path = f'Project/{year}'

# Helper function to safely extract a value from a nested dictionary
def safe_get(d, keys, default=None):
    """Walk ``d`` along ``keys`` and return the value found there.

    Args:
        d: The outermost mapping (anything exposing ``.get``).
        keys: Iterable of keys to follow, outermost first.
        default: Value returned when the path cannot be resolved.

    Returns:
        The value at the end of the path. ``default`` is returned when a key
        is missing, when an intermediate value has no ``.get`` method, or when
        the final value is ``None`` or an empty string/list/tuple/dict/set.
        Falsy scalars such as ``0`` and ``False`` are real data and are
        returned unchanged.
    """
    missing = object()  # sentinel: distinguishes "absent" from stored falsy values
    current = d
    for key in keys:
        try:
            current = current.get(key, missing)
        except AttributeError:
            # Hit a list, string, None, ... before the path was exhausted.
            return default
        if current is missing:
            return default

    if current is None:
        return default
    # Empty containers/strings carry no information; callers expect the default.
    if isinstance(current, (str, bytes, list, tuple, dict, set)) and len(current) == 0:
        return default
    return current

def _as_list(value):
    """Return ``value`` as a list: lists pass through, a lone dict is wrapped, anything else gives []."""
    if isinstance(value, list):
        return value
    if isinstance(value, dict):
        return [value]
    return []


# Iterate through each file in the folder. The listing is sorted so that the
# row order of the DataFrame is reproducible (os.listdir order is arbitrary).
for file_name in sorted(os.listdir(folder_path)):
    if file_name.startswith(str(year)) and file_name.endswith(".json"):
        file_path = os.path.join(folder_path, file_name)
        with open(file_path, 'r', encoding='utf-8') as file:
            try:
                data = json.load(file)

                # Extract countries from author-group.
                # NOTE(review): a record with a single author group appears to
                # store it as one object rather than a one-element list; without
                # _as_list the loop would walk the dict's keys and emit bogus
                # 'Unknown' entries. Confirm against the API response schema.
                author_groups = _as_list(
                    safe_get(data, ['abstracts-retrieval-response', 'item', 'bibrecord', 'head', 'author-group'], [])
                )
                countries = []
                for author in author_groups:
                    country = safe_get(author, ['affiliation', 'country'], 'Unknown')
                    # A non-string value would make ','.join raise and drop the whole file.
                    countries.append(country if isinstance(country, str) else 'Unknown')
                countries_string = ','.join(countries)

                # Extract keywords (same single-object caveat as author-group)
                auth_keywords = _as_list(
                    safe_get(data, ['abstracts-retrieval-response', 'authkeywords', 'author-keyword'], [])
                )
                keywords = [keyword.get('$', '') for keyword in auth_keywords if isinstance(keyword, dict)]
                # 'null' is the sentinel the cleansing step filters on.
                keywords_string = ','.join(keywords) if keywords else 'null'

                # Normalize JSON data and filter columns
                row_data = pd.json_normalize(data.get('abstracts-retrieval-response', {}))
                row = {col: row_data[col].iloc[0] if col in row_data else pd.NA for col in columns_to_keep}

                # Add processed fields
                row['item.bibrecord.head.author-group.affiliation.country'] = countries_string
                row['authkeywords.author-keyword'] = keywords_string

                # Append the row to data_rows
                data_rows.append(row)

            except Exception as e:
                # Best effort: report the broken file and keep going with the rest.
                print(f"Error processing file {file_name}: {e}")

# Convert the list of rows into a DataFrame
df = pd.DataFrame(data_rows)

# Save the DataFrame to a CSV file
output_path = f'output_{year}.csv'
df.to_csv(output_path, index=False, encoding='utf-8')



In [63]:
# Per-column summary of the extracted frame (count / unique / top / freq for the object-typed columns)
df.describe()

Unnamed: 0,coredata.srctype,coredata.eid,coredata.dc:description,coredata.pubmed-id,coredata.prism:coverDate,coredata.prism:aggregationType,coredata.prism:url,coredata.source-id,coredata.pii,coredata.citedby-count,...,coredata.publishercopyright,coredata.article-number,coredata.subtypeDescription,coredata.prism:publicationName,coredata.prism:doi,coredata.dc:identifier,coredata.dc:publisher,item.bibrecord.head.citation-info.citation-language.@language,item.bibrecord.head.author-group.affiliation.country,authkeywords.author-keyword
count,3815,3815,3692,1406,3815,3815,3815,3815,818,3815,...,3683,1685,3815,3815,3623,3815,3815,3814,3815,3815.0
unique,5,3815,3690,1406,255,5,3815,1821,818,99,...,1223,1604,11,1828,3621,3815,454,4,1359,3052.0
top,j,2-s2.0-85128088236,The efficacy of the public health measures to ...,35061442,2021-01-01,Journal,https://api.elsevier.com/content/abstract/scop...,21100200805,S0959652621037604,0,...,"© 2021 by the authors. Licensee MDPI, Basel, S...",114,Article,Scientific Reports,10.3390/s21051880,SCOPUS_ID:85128088236,Elsevier Ltd,English,"Thailand,Thailand",
freq,3519,1,2,1,882,3519,1,111,1,680,...,273,5,3197,109,2,1,300,3805,476,757.0


In [64]:
# Column dtypes and non-null counts, to see which fields are sparsely populated
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3815 entries, 0 to 3814
Data columns (total 25 columns):
 #   Column                                                         Non-Null Count  Dtype 
---  ------                                                         --------------  ----- 
 0   coredata.srctype                                               3815 non-null   object
 1   coredata.eid                                                   3815 non-null   object
 2   coredata.dc:description                                        3692 non-null   object
 3   coredata.pubmed-id                                             1406 non-null   object
 4   coredata.prism:coverDate                                       3815 non-null   object
 5   coredata.prism:aggregationType                                 3815 non-null   object
 6   coredata.prism:url                                             3815 non-null   object
 7   coredata.source-id                                             3815 n

In [65]:
# Preview the first three extracted records
df.head(3)

Unnamed: 0,coredata.srctype,coredata.eid,coredata.dc:description,coredata.pubmed-id,coredata.prism:coverDate,coredata.prism:aggregationType,coredata.prism:url,coredata.source-id,coredata.pii,coredata.citedby-count,...,coredata.publishercopyright,coredata.article-number,coredata.subtypeDescription,coredata.prism:publicationName,coredata.prism:doi,coredata.dc:identifier,coredata.dc:publisher,item.bibrecord.head.citation-info.citation-language.@language,item.bibrecord.head.author-group.affiliation.country,authkeywords.author-keyword
0,b,2-s2.0-85128088236,,,2021-12-31,Book,https://api.elsevier.com/content/abstract/scop...,21101084653,,0,...,,,Book Chapter,Catheter Ablation of Cardiac Arrhythmias in Ch...,10.1201/9781003082101-25,SCOPUS_ID:85128088236,CRC Press,English,"Spain,Spain,Thailand",
1,j,2-s2.0-85122493175,We construct a Lagrangian for general nonlinea...,35061442.0,2021-12-31,Journal,https://api.elsevier.com/content/abstract/scop...,29150,,9,...,© 2021 authors. Published by the American Phys...,271601.0,Article,Physical Review Letters,10.1103/PhysRevLett.127.271601,SCOPUS_ID:85122493175,American Physical Society,English,"United States,Russian Federation,Thailand,Belg...",
2,j,2-s2.0-85127926459,,35390937.0,2021-12-30,Journal,https://api.elsevier.com/content/abstract/scop...,4700151916,,0,...,,,Article,Zootaxa,10.11646/zootaxa.5086.1.2,SCOPUS_ID:85127926459,NLM (Medline),English,"Thailand,New Zealand,Unknown",


# 2. Data Engineering

### Data Cleansing

In [105]:
# Select the columns to keep and rename them
selected_columns = {
    'coredata.prism:coverDate': 'Year',
    'coredata.dc:title': 'Title',
    'coredata.prism:publicationName': 'PublicationName',
    'item.bibrecord.head.citation-info.citation-language.@language': 'Language',
    'coredata.citedby-count': 'CitedByCount',
    'item.bibrecord.head.author-group.affiliation.country': 'AffiliationCountry',
    'authkeywords.author-keyword': 'AuthorKeywords'
}


def _unique_in_order(joined):
    """Split a comma-joined string and drop duplicates, keeping first-seen order.

    A plain ``set`` would also de-duplicate, but its iteration order for
    strings changes between interpreter runs, making the output unreproducible.
    """
    return list(dict.fromkeys(joined.split(',')))


# Built as a single chain from `df`, so re-running the cell always gives the
# same result and no column is assigned on a filtered slice
# (the pattern that triggers SettingWithCopyWarning).
df_selected = (
    df[list(selected_columns)]
    .rename(columns=selected_columns)
    # Drop records with any missing field
    .dropna()
    # 'null' is the sentinel written during extraction for "no author keywords"
    .loc[lambda frame: frame['AuthorKeywords'] != 'null']
    # Drop records where at least one affiliation country could not be resolved
    .loc[lambda frame: ~frame['AffiliationCountry'].str.contains('Unknown', regex=False)]
    .assign(
        # Cover date is an ISO date string; keep only the four-digit year
        Year=lambda frame: frame['Year'].str[:4],
        AffiliationCountry=lambda frame: frame['AffiliationCountry'].map(_unique_in_order),
        AuthorKeywords=lambda frame: frame['AuthorKeywords'].str.split(','),
        CitedByCount=lambda frame: frame['CitedByCount'].astype(int),
    )
)

df_selected.head(10)

Unnamed: 0,Year,Title,PublicationName,Language,CitedByCount,AffiliationCountry,AuthorKeywords
6,2021,Does proactive logistics management enhance bu...,Polish Journal of Management Studies,English,0,"[Myanmar, Thailand]","[Business management, Firm size, Logistics awa..."
11,2021,"Will There Ever Be Cure for Chronic, Life-Chan...",Frontiers in Medicine,English,3,"[India, United Kingdom, Thailand, Australia, J...","[biofilm infections, chronic infection, chroni..."
12,2021,Bacterial diversity and potential risk factors...,PeerJ,English,6,"[United States, Thailand]","[Escherichia coli, Risk factors, Salmonella en..."
13,2021,Global Perspectives on Immunization Against SA...,Frontiers in Immunology,English,9,"[Brazil, Spain, Switzerland, United Kingdom, G...","[COVID-19, maternal immunization, maternal vac..."
18,2021,Deep Learning Enables Prostate MRI Segmentatio...,Frontiers in Oncology,English,4,"[China, United States, Thailand, United Kingdom]","[deep attentive neural network, large cohort e..."
19,2021,The Expansion of Lignocellulose Biomass Conver...,Frontiers in Nanotechnology,English,10,"[Thailand, India, United Arab Emirates, Malaysia]","[bioconversion, bioenergy, lignocellulose, nan..."
20,2021,Occupational exposure to hazards and volatile ...,Journal of Cleaner Production,English,5,[Thailand],"[Health risk assessment, Life cycle assessment..."
21,2021,Organic carbon stock and composition in 3.5-m ...,Science of the Total Environment,English,8,"[Thailand, Japan]","[Blue carbon, Carbon pool, Coastal vegetated e..."
22,2021,Shared metabolic and neuroimmune mechanisms un...,Progress in Neuro-Psychopharmacology and Biolo...,English,17,"[Brazil, Australia, Thailand, Bulgaria]","[Major Depressive Disorder, Pathogenesis, Ther..."
23,2021,Translational evidence for the Inflammatory Re...,Progress in Neuro-Psychopharmacology and Biolo...,English,19,"[Bulgaria, Thailand, Australia, India]","[Depression, Immune regulatory, Immune system,..."


In [96]:
# (rows, columns) remaining after cleansing
df_selected.shape

(2586, 7)

In [81]:
# Dtypes and non-null counts of the cleaned frame
df_selected.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3058 entries, 4 to 3814
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Year                3058 non-null   object
 1   Title               3058 non-null   object
 2   PublicationName     3058 non-null   object
 3   Language            3057 non-null   object
 4   CitedByCount        3058 non-null   int32 
 5   AffiliationCountry  3058 non-null   object
 6   AuthorKeywords      3058 non-null   object
dtypes: int32(1), object(6)
memory usage: 179.2+ KB
