In [2]:
import os
import pandas as pd
import json

In [9]:
directories = ['Project/2018','Project/2019','Project/2020' , 'Project/2021','Project/2022', 'Project/2023']

# Initialize a list to store the rows of the DataFrame
data_rows = []

# Columns to extract
columns_to_keep = [
    'coredata.prism:coverDate',
    'coredata.dc:title',
    'coredata.prism:publicationName',
    'coredata.citedby-count'
]

# Helper function to safely extract a value from a dictionary
def safe_get(d, keys, default=None):
    try:
        for key in keys:
            d = d.get(key, {})
        return d if d else default
    except AttributeError:
        return default

# Iterate through each directory
for folder_path in directories:
    # Iterate through each file in the folder
    for file_name in os.listdir(folder_path):
        if file_name.endswith(".json"):
            file_path = os.path.join(folder_path, file_name)
            with open(file_path, 'r', encoding='utf-8') as file:
                try:
                    data = json.load(file)

                    # Extract countries from author-group
                    author_groups = safe_get(data, ['abstracts-retrieval-response', 'item', 'bibrecord', 'head', 'author-group'], [])
                    countries = [
                        safe_get(author, ['affiliation', 'country'], 'Unknown')
                        for author in author_groups
                    ]
                    countries_string = ','.join(countries)

                    # Extract keywords
                    auth_keywords = safe_get(data, ['abstracts-retrieval-response', 'authkeywords', 'author-keyword'], [])
                    keywords = [keyword.get('$', '') for keyword in auth_keywords if isinstance(keyword, dict)]
                    keywords_string = ','.join(keywords) if keywords else 'null'

                    # Normalize JSON data and filter columns
                    row_data = pd.json_normalize(data.get('abstracts-retrieval-response', {}))
                    row = {col: row_data[col].iloc[0] if col in row_data else pd.NA for col in columns_to_keep}

                    # Add processed fields
                    row['item.bibrecord.head.author-group.affiliation.country'] = countries_string
                    row['authkeywords.author-keyword'] = keywords_string

                    # Append the row to data_rows
                    data_rows.append(row)

                except Exception as e:
                    print(f"Error processing file {file_name} in {folder_path}: {e}")

# Convert the list of rows into a DataFrame
df = pd.DataFrame(data_rows)
# Save the DataFrame to a CSV file
output_path = 'outputAllYear.csv'
df.to_csv(output_path, index=False, encoding='utf-8')

print(f"Data saved to {output_path}")

Data saved to outputAllYear.csv


In [13]:
df

Unnamed: 0,coredata.prism:coverDate,coredata.dc:title,coredata.prism:publicationName,coredata.citedby-count,item.bibrecord.head.author-group.affiliation.country,authkeywords.author-keyword
0,2021-12-31,Role of interventional procedures in channelop...,Catheter Ablation of Cardiac Arrhythmias in Ch...,0,"Spain,Spain,Thailand",
1,2021-12-31,Democratic Lagrangians for Nonlinear Electrody...,Physical Review Letters,9,"United States,Russian Federation,Thailand,Belg...",
2,2021-12-30,Ontogeny and morphological diversity in immatu...,Zootaxa,0,"Thailand,New Zealand,Unknown",
3,2021-12-30,Ontogeny and morphological diversity in immatu...,Zootaxa,4,"Thailand,New Zealand,Unknown",
4,2021-12-30,Improved Development Cycle for 8-bit FPGA-Base...,Engineering Journal,0,"Unknown,Unknown","8-bit soft microprocessor,FPGA,PicoBlaze,Softw..."
...,...,...,...,...,...,...
10944,2023-01-01,Long-chain bio-olefins production via oxidativ...,Catalysis Today,3,"Thailand,Thailand,Thailand,Thailand,Thailand","Long-chain olefins,Mesoporous KIT-6,Oleic acid..."
10945,2023-01-01,Recent Developments and Applications of Microf...,Critical Reviews in Analytical Chemistry,11,"Unknown,Unknown","Biological hazards,chemical hazards,food conta..."
10946,2023-01-01,"Social justice, education and peacebuilding: c...",Compare,5,"United Kingdom,Thailand","conflict,Education,peacebuilding,social justic..."
10947,2023-01-01,Effects of black soldier fly (Hermetia illucen...,Journal of Applied Aquaculture,6,"Thailand,Thailand","Anabas testudineus,Black soldier fly,fish meal..."


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10949 entries, 0 to 10948
Data columns (total 6 columns):
 #   Column                                                Non-Null Count  Dtype 
---  ------                                                --------------  ----- 
 0   coredata.prism:coverDate                              10949 non-null  object
 1   coredata.dc:title                                     10949 non-null  object
 2   coredata.prism:publicationName                        10949 non-null  object
 3   coredata.citedby-count                                10947 non-null  object
 4   item.bibrecord.head.author-group.affiliation.country  10949 non-null  object
 5   authkeywords.author-keyword                           10949 non-null  object
dtypes: object(6)
memory usage: 513.4+ KB


In [12]:
rename_columns = {
    'coredata.prism:coverDate': 'Year',
    'coredata.dc:title': 'Title',
    'coredata.prism:publicationName': 'PublicationName',
    'coredata.citedby-count': 'CitedByCount',
    'item.bibrecord.head.author-group.affiliation.country': 'AffiliationCountry',
    'authkeywords.author-keyword': 'AuthorKeywords'
}
df_renamed = df[list(rename_columns.keys())].rename(columns=rename_columns)

df_renamed['Year'] = df_renamed['Year'].str[:4]
df_renamed = df_renamed.dropna()
df_renamed = df_renamed[df_renamed['AuthorKeywords'] != 'null']
df_renamed = df_renamed[~df_renamed['AffiliationCountry'].str.contains('Unknown')]
df_renamed = df_renamed.reset_index(drop=True)
df_renamed['CitedByCount'] = df_renamed['CitedByCount'].astype(int)

df_renamed

Unnamed: 0,Year,Title,PublicationName,CitedByCount,AffiliationCountry,AuthorKeywords
0,2021,Does proactive logistics management enhance bu...,Polish Journal of Management Studies,0,"Thailand,Thailand,Thailand,Myanmar,Thailand","Business management,Firm size,Logistics awaren..."
1,2021,"Will There Ever Be Cure for Chronic, Life-Chan...",Frontiers in Medicine,3,"Thailand,Thailand,Thailand,Thailand,Thailand,J...","biofilm infections,chronic infection,chronic u..."
2,2021,Bacterial diversity and potential risk factors...,PeerJ,6,"United States,Thailand","Escherichia coli,Risk factors,Salmonella enter..."
3,2021,Global Perspectives on Immunization Against SA...,Frontiers in Immunology,9,"Canada,South Africa,South Africa,United States...","COVID-19,maternal immunization,maternal vaccin..."
4,2021,Deep Learning Enables Prostate MRI Segmentatio...,Frontiers in Oncology,4,"United States,United States,China,Thailand,Uni...","deep attentive neural network,large cohort eva..."
...,...,...,...,...,...,...
7662,2023,RSIAM risk profile for managing risk factors o...,International Journal of Construction Management,5,"Viet Nam,Viet Nam,Viet Nam,Thailand","construction projects,international constructi..."
7663,2023,Long-chain bio-olefins production via oxidativ...,Catalysis Today,3,"Thailand,Thailand,Thailand,Thailand,Thailand","Long-chain olefins,Mesoporous KIT-6,Oleic acid..."
7664,2023,"Social justice, education and peacebuilding: c...",Compare,5,"United Kingdom,Thailand","conflict,Education,peacebuilding,social justic..."
7665,2023,Effects of black soldier fly (Hermetia illucen...,Journal of Applied Aquaculture,6,"Thailand,Thailand","Anabas testudineus,Black soldier fly,fish meal..."


In [14]:
df_renamed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7667 entries, 0 to 7666
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Year                7667 non-null   object
 1   Title               7667 non-null   object
 2   PublicationName     7667 non-null   object
 3   CitedByCount        7667 non-null   int32 
 4   AffiliationCountry  7667 non-null   object
 5   AuthorKeywords      7667 non-null   object
dtypes: int32(1), object(5)
memory usage: 329.6+ KB


In [None]:
df_renamed.to_csv('1_data_ajarn.csv', index=False)