# Import Required Libraries
Import the libraries used below: pandas for tabular data, json for parsing the input files, and os for file-system access.

In [1]:
# Import the necessary libraries, including pandas and json
import pandas as pd
import json
import os

## Set `year` to the year you want to process

In [2]:
# Year to process. It selects the input folder (Project/<year>), the file-name
# prefix used to pick files, and the name of the output CSV (output_<year>.csv).
year: int = 2021

In [3]:
# Folder holding the raw files for the selected year
folder_path = f'Project/{year}'

# Append a '.json' extension to every raw file whose name starts with the year
for filename in os.listdir(folder_path):
    # Only names that start with the year and lack the extension need renaming
    if not filename.startswith(str(year)) or filename.endswith('.json'):
        continue

    old_file_path = os.path.join(folder_path, filename)
    new_file_path = os.path.join(folder_path, filename + '.json')

    # Only regular files are data records; leave sub-directories untouched
    if not os.path.isfile(old_file_path):
        continue

    # os.rename() can silently replace an existing destination (it does on
    # POSIX), so never rename onto a file that is already there
    if os.path.exists(new_file_path):
        print(f"Skipped {old_file_path}: {new_file_path} already exists")
        continue

    os.rename(old_file_path, new_file_path)

print("Renaming completed.")

Renaming completed.


# Load JSON Files
Load every JSON file in the selected year's folder, extract the chosen fields from each one, and save the combined table to a CSV file.

In [4]:
# Directory that holds the JSON files for the selected year
folder_path = f'Project/{year}'

# One dict per processed file is collected here and later turned into a DataFrame
data_rows = []

# Flattened (dot-separated) field names to copy into the output, one per line.
# The order below is the column order of the resulting table.
columns_to_keep = """
    coredata.srctype
    coredata.eid
    coredata.dc:description
    coredata.pubmed-id
    coredata.prism:coverDate
    coredata.prism:aggregationType
    coredata.prism:url
    coredata.source-id
    coredata.pii
    coredata.citedby-count
    coredata.prism:volume
    coredata.subtype
    coredata.dc:title
    coredata.openaccess
    coredata.prism:issn
    coredata.publishercopyright
    coredata.article-number
    coredata.subtypeDescription
    coredata.prism:publicationName
    coredata.prism:doi
    coredata.dc:identifier
    coredata.dc:publisher
    item.bibrecord.head.citation-info.citation-language.@language
""".split()

def safe_get(d, keys, default=None):
    """Follow ``keys`` through nested mappings and return the value found.

    Args:
        d: Outermost mapping (any object exposing ``.get``).
        keys: Iterable of keys, applied one nesting level at a time.
        default: Value returned when the path cannot be resolved.

    Returns:
        The value at the end of the path. ``default`` is returned when a key
        is missing, when an intermediate value has no ``.get`` method, or when
        the final value is ``None`` or empty (``''``, ``[]``, ``{}``).
        Numeric zero and ``False`` are genuine values and are returned as-is.
    """
    current = d
    try:
        for key in keys:
            # A missing key yields {} so the walk continues and ends up empty
            current = current.get(key, {})
    except AttributeError:
        # An intermediate value (None, list, str, number...) is not a mapping
        return default

    # Numbers (bool is an int subclass) are real data even when falsy, so do
    # not let the emptiness check below replace 0 / 0.0 / False with default
    if isinstance(current, (int, float)):
        return current
    return current if current else default

def _as_list(value):
    """Wrap a lone dict in a list so it can be iterated as a list of records."""
    return [value] if isinstance(value, dict) else value


# Iterate through each JSON file in the folder.
# os.listdir() gives no ordering guarantee, so sort the names to keep the row
# order of the CSV reproducible across runs and machines.
for file_name in sorted(os.listdir(folder_path)):
    if not (file_name.startswith(str(year)) and file_name.endswith(".json")):
        continue

    file_path = os.path.join(folder_path, file_name)

    # open() sits inside the try so that one unreadable entry is reported and
    # skipped like any other bad file instead of aborting the whole run.
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)

        # Extract countries from author-group.
        # NOTE(review): assumes a single author group may be stored as a bare
        # dict instead of a one-element list; iterating that dict directly
        # would walk its keys and yield only 'Unknown'. Confirm against the data.
        author_groups = _as_list(
            safe_get(data, ['abstracts-retrieval-response', 'item', 'bibrecord', 'head', 'author-group'], [])
        )
        countries = [
            safe_get(author, ['affiliation', 'country'], 'Unknown')
            for author in author_groups
        ]
        countries_string = ','.join(countries)

        # Extract keywords (same single-dict assumption as above; the text of
        # each keyword is read from its '$' entry).
        auth_keywords = _as_list(
            safe_get(data, ['abstracts-retrieval-response', 'authkeywords', 'author-keyword'], [])
        )
        keywords = [keyword.get('$', '') for keyword in auth_keywords if isinstance(keyword, dict)]
        keywords_string = ','.join(keywords)

        # Flatten the record into dotted column names and keep the wanted ones;
        # columns absent from this record are filled with pd.NA.
        row_data = pd.json_normalize(data.get('abstracts-retrieval-response', {}))
        row = {col: row_data[col].iloc[0] if col in row_data else pd.NA for col in columns_to_keep}

        # Add processed fields
        row['item.bibrecord.head.author-group.affiliation.country'] = countries_string
        row['authkeywords.author-keyword'] = keywords_string

        # Append the row to data_rows
        data_rows.append(row)

    except Exception as e:
        # Best effort: report the offending file and carry on with the rest
        print(f"Error processing file {file_name}: {e}")

# Convert the list of rows into a DataFrame
df = pd.DataFrame(data_rows)

# Save the DataFrame to a CSV file
output_path = f'output_{year}.csv'
df.to_csv(output_path, index=False, encoding='utf-8')



In [5]:
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() would only add a stray "None" line after the summary.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3815 entries, 0 to 3814
Data columns (total 25 columns):
 #   Column                                                         Non-Null Count  Dtype 
---  ------                                                         --------------  ----- 
 0   coredata.srctype                                               3815 non-null   object
 1   coredata.eid                                                   3815 non-null   object
 2   coredata.dc:description                                        3692 non-null   object
 3   coredata.pubmed-id                                             1406 non-null   object
 4   coredata.prism:coverDate                                       3815 non-null   object
 5   coredata.prism:aggregationType                                 3815 non-null   object
 6   coredata.prism:url                                             3815 non-null   object
 7   coredata.source-id                                             3815 n

In [6]:
# Leave the frame as the cell's last expression so the notebook renders it as
# a table instead of the wrapped plain-text form produced by print().
df.head(5)

  coredata.srctype        coredata.eid  \
0                b  2-s2.0-85128088236   
1                j  2-s2.0-85122493175   
2                j  2-s2.0-85127926459   
3                j  2-s2.0-85127811920   
4                j  2-s2.0-85126121321   

                             coredata.dc:description coredata.pubmed-id  \
0                                               <NA>               <NA>   
1  We construct a Lagrangian for general nonlinea...           35061442   
2                                               <NA>           35390937   
3                                               <NA>           35390938   
4  Developing complex algorithms on 8-bit process...               <NA>   

  coredata.prism:coverDate coredata.prism:aggregationType  \
0               2021-12-31                           Book   
1               2021-12-31                        Journal   
2               2021-12-30                        Journal   
3               2021-12-30                        Jo