In [23]:
import json
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import os 

# Define the path to the JSON file
json_file_path = 'Project/2023/202300000.json'

# Open and load the JSON file
with open(json_file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

# Extract countries from author-group
countries = [author['affiliation']['country'] for author in data['abstracts-retrieval-response']['item']['bibrecord']['head']['author-group']]

# Convert the list of countries to a single string
countries_string = ','.join(countries)

# Extract $ values from authkeywords.author-keyword
keywords = [keyword['$'] for keyword in data['abstracts-retrieval-response']['authkeywords']['author-keyword']]

# Convert the list of keywords to a single string
keywords_string = ','.join(keywords)

# Normalize JSON data to a DataFrame
df = pd.json_normalize(data['abstracts-retrieval-response'])

# Define columns to keep
columns_to_keep = [
    'coredata.srctype',
    'coredata.eid',
    'coredata.dc:description',
    'coredata.pubmed-id',
    'coredata.prism:coverDate',
    'coredata.prism:aggregationType',
    'coredata.prism:url',
    'coredata.source-id',
    'coredata.pii',
    'coredata.citedby-count',
    'coredata.prism:volume',
    'coredata.subtype',
    'coredata.dc:title',
    'coredata.openaccess',
    'coredata.prism:issn',
    'coredata.publishercopyright',
    'coredata.article-number',
    'coredata.subtypeDescription',
    'coredata.prism:publicationName',
    'coredata.prism:doi',
    'coredata.dc:identifier',
    'coredata.dc:publisher',
    'item.bibrecord.head.citation-info.citation-language.@language',
]

# Add the countries_string and keywords_string to the DataFrame
df['item.bibrecord.head.author-group.affiliation.country'] = countries_string
df['authkeywords.author-keyword'] = keywords_string

# Keep only the specified columns
df = df[columns_to_keep + ['item.bibrecord.head.author-group.affiliation.country', 'authkeywords.author-keyword']]


df.to_csv('output.csv', index=False)

# Display the DataFrame to verify the contents
df.shape
df

Unnamed: 0,coredata.srctype,coredata.eid,coredata.dc:description,coredata.pubmed-id,coredata.prism:coverDate,coredata.prism:aggregationType,coredata.prism:url,coredata.source-id,coredata.pii,coredata.citedby-count,...,coredata.publishercopyright,coredata.article-number,coredata.subtypeDescription,coredata.prism:publicationName,coredata.prism:doi,coredata.dc:identifier,coredata.dc:publisher,item.bibrecord.head.citation-info.citation-language.@language,item.bibrecord.head.author-group.affiliation.country,authkeywords.author-keyword
0,j,2-s2.0-85170238281,Pyrocatechol violet/copper ion-graphene oxide/...,37633552,2023-12-31,Journal,https://api.elsevier.com/content/abstract/scop...,17544,S0141813023032129,0,...,© 2023 Elsevier B.V.,126316,Article,International Journal of Biological Macromolec...,10.1016/j.ijbiomac.2023.126316,SCOPUS_ID:85170238281,Elsevier B.V.,English,"Thailand,Thailand,Thailand","Alzheimer's disease,Hydrogel colorimetric sens..."


In [30]:
directories = ['Project/2018','Project/2019','Project/2020' , 'Project/2021','Project/2022', 'Project/2023']

# Initialize a list to store the rows of the DataFrame
data_rows = []

# Columns to extract
columns_to_keep = [
    'coredata.dc:title',
    'coredata.prism:publicationName',
    'item.bibrecord.head.citation-info.citation-language.@language',
    'coredata.citedby-count'
]

# Helper function to safely extract a value from a dictionary
def safe_get(d, keys, default=None):
    try:
        for key in keys:
            d = d.get(key, {})
        return d if d else default
    except AttributeError:
        return default

# Iterate through each directory
for folder_path in directories:
    # Iterate through each file in the folder
    for file_name in os.listdir(folder_path):
        if file_name.endswith(".json"):
            file_path = os.path.join(folder_path, file_name)
            with open(file_path, 'r', encoding='utf-8') as file:
                try:
                    data = json.load(file)

                    # Extract countries from author-group
                    author_groups = safe_get(data, ['abstracts-retrieval-response', 'item', 'bibrecord', 'head', 'author-group'], [])
                    countries = [
                        safe_get(author, ['affiliation', 'country'], 'Unknown')
                        for author in author_groups
                    ]
                    countries_string = ','.join(countries)

                    # Extract keywords
                    auth_keywords = safe_get(data, ['abstracts-retrieval-response', 'authkeywords', 'author-keyword'], [])
                    keywords = [keyword.get('$', '') for keyword in auth_keywords if isinstance(keyword, dict)]
                    keywords_string = ','.join(keywords)

                    # Normalize JSON data and filter columns
                    row_data = pd.json_normalize(data.get('abstracts-retrieval-response', {}))
                    row = {col: row_data[col].iloc[0] if col in row_data else pd.NA for col in columns_to_keep}

                    # Add processed fields
                    row['item.bibrecord.head.author-group.affiliation.country'] = countries_string
                    row['authkeywords.author-keyword'] = keywords_string

                    # Append the row to data_rows
                    data_rows.append(row)

                except Exception as e:
                    print(f"Error processing file {file_name} in {folder_path}: {e}")

# Convert the list of rows into a DataFrame
df = pd.DataFrame(data_rows)

In [31]:
# Save the DataFrame to a CSV file
output_path = 'outputAllYear.csv'
df.to_csv(output_path, index=False, encoding='utf-8')

# Display the DataFrame for verification
print(df)

                                       coredata.dc:title  \
0      Public health and international epidemiology f...   
1      Flexible Printed Active Antenna for Digital Te...   
2      Parametric study of hydrogen production via so...   
3      Superhydrophobic coating from fluoroalkylsilan...   
4      Electrochemical impedance-based DNA sensor usi...   
...                                                  ...   
20211  Long-chain bio-olefins production via oxidativ...   
20212  Recent Developments and Applications of Microf...   
20213  Social justice, education and peacebuilding: c...   
20214  Effects of black soldier fly (Hermetia illucen...   
20215  Effects of remittances on household poverty an...   

                          coredata.prism:publicationName  \
0      Radiology in Global Health: Strategies, Implem...   
1        Progress in Electromagnetics Research Symposium   
2                           Chemical Engineering Science   
3                                Applie

In [35]:
# Extract unique countries
unique_countries = set()
for countries in df['item.bibrecord.head.author-group.affiliation.country'].dropna():
    unique_countries.update(countries.split(','))

# Extract unique author keywords
unique_keywords = set()
for keywords in df['authkeywords.author-keyword'].dropna():
    unique_keywords.update(keyword for keyword in keywords.split(',') if keyword)

# Convert sets to lists for easier handling
unique_countries_list = list(unique_countries)
unique_keywords_list = list(unique_keywords)

# Display the unique countries and keywords
print("Unique Countries:", unique_countries_list)
print("Count of Unique Countries:", len(unique_countries_list))
print("\nUnique Author Keywords:", unique_keywords_list)
print("Count of Unique Author Keywords:", len(unique_keywords_list))

Unique Countries: ['Sweden', 'South Africa', 'Armenia', 'Netherlands', 'Maldives', 'Macao', 'Slovenia', 'Barbados', 'Bangladesh', 'Mexico', 'Taiwan', 'Georgia', 'Bhutan', 'South Sudan', 'Congo', 'Algeria', 'Belize', 'Zimbabwe', 'Iran', 'Hungary', 'Latvia', 'Belgium', 'Denmark', 'Italy', 'Albania', 'Iceland', 'Democratic Republic Congo', 'Botswana', 'Monaco', 'Mauritius', 'Guatemala', 'Madagascar', 'Somalia', 'Togo', 'Reunion', 'Bulgaria', 'El Salvador', 'Sri Lanka', 'South Korea', 'Sudan', 'Kyrgyzstan', 'Brunei Darussalam', 'North Macedonia', 'Bahamas', 'Qatar', 'Timor-Leste', 'United States', 'Bahrain', 'Mali', 'Angola', 'Namibia', 'India', 'Venezuela', 'Russian Federation', 'Viet Nam', 'Canada', 'Cuba', 'Ireland', 'Panama', 'Oman', 'Senegal', 'Montenegro', 'Guinea-Bissau', 'Seychelles', 'Yemen', 'Peru', 'Pakistan', 'Chad', 'Finland', 'Paraguay', 'Yugoslavia', 'Azerbaijan', 'Tunisia', 'Kazakhstan', 'Moldova', 'Austria', 'Malaysia', 'Rwanda', 'Hong Kong', 'Cameroon', 'Papua New Guinea'

In [39]:
country_to_continent = {
    "Sweden": "Europe", "South Africa": "Africa", "Armenia": "Asia", "Netherlands": "Europe",
    "Maldives": "Asia", "Macao": "Asia", "Slovenia": "Europe", "Barbados": "North America",
    "Bangladesh": "Asia", "Mexico": "North America", "Taiwan": "Asia", "Georgia": "Asia",
    "Bhutan": "Asia", "South Sudan": "Africa", "Congo": "Africa", "Algeria": "Africa",
    "Belize": "North America", "Zimbabwe": "Africa", "Iran": "Asia", "Hungary": "Europe",
    "Latvia": "Europe", "Belgium": "Europe", "Denmark": "Europe", "Italy": "Europe",
    "Albania": "Europe", "Iceland": "Europe", "Democratic Republic Congo": "Africa",
    "Botswana": "Africa", "Monaco": "Europe", "Mauritius": "Africa", "Guatemala": "North America",
    "Madagascar": "Africa", "Somalia": "Africa", "Togo": "Africa", "Reunion": "Africa",
    "Bulgaria": "Europe", "El Salvador": "North America", "Sri Lanka": "Asia",
    "South Korea": "Asia", "Sudan": "Africa", "Kyrgyzstan": "Asia", "Brunei Darussalam": "Asia",
    "North Macedonia": "Europe", "Bahamas": "North America", "Qatar": "Asia", "Timor-Leste": "Asia",
    "United States": "North America", "Bahrain": "Asia", "Mali": "Africa", "Angola": "Africa",
    "Namibia": "Africa", "India": "Asia", "Venezuela": "South America", "Russian Federation": "Europe",
    "Viet Nam": "Asia", "Canada": "North America", "Cuba": "North America", "Ireland": "Europe",
    "Panama": "North America", "Oman": "Asia", "Senegal": "Africa", "Montenegro": "Europe",
    "Guinea-Bissau": "Africa", "Seychelles": "Africa", "Yemen": "Asia", "Peru": "South America",
    "Pakistan": "Asia", "Chad": "Africa", "Finland": "Europe", "Paraguay": "South America",
    "Yugoslavia": "Europe", "Azerbaijan": "Asia", "Tunisia": "Africa", "Kazakhstan": "Asia",
    "Moldova": "Europe", "Austria": "Europe", "Malaysia": "Asia", "Rwanda": "Africa",
    "Hong Kong": "Asia", "Cameroon": "Africa", "Papua New Guinea": "Oceania",
    "Czech Republic": "Europe", "Cambodia": "Asia", "Gabon": "Africa", "Niger": "Africa",
    "Portugal": "Europe", "Switzerland": "Europe", "New Zealand": "Oceania", "Singapore": "Asia",
    "Trinidad and Tobago": "North America", "Syrian Arab Republic": "Asia", "Jordan": "Asia",
    "France": "Europe", "Cote d'Ivoire": "Africa", "Eritrea": "Africa", "Argentina": "South America",
    "United Arab Emirates": "Asia", "Sierra Leone": "Africa", "Uruguay": "South America",
    "China": "Asia", "Burundi": "Africa", "Cape Verde": "Africa", "Myanmar": "Asia",
    "Egypt": "Africa", "Unknown": "Unknown", "Nicaragua": "North America", "Thailand": "Asia",
    "Liberia": "Africa", "Central African Republic": "Africa", "Puerto Rico": "North America",
    "Ecuador": "South America", "Spain": "Europe", "Brazil": "South America", "Gambia": "Africa",
    "Nepal": "Asia", "Kuwait": "Asia", "Afghanistan": "Asia", "Swaziland": "Africa",
    "Romania": "Europe", "French Guiana": "South America", "Germany": "Europe",
    "Bolivia": "South America", "Australia": "Oceania", "Greece": "Europe", "Uzbekistan": "Asia",
    "Israel": "Asia", "Chile": "South America", "Uganda": "Africa", "Haiti": "North America",
    "Morocco": "Africa", "Poland": "Europe", "Philippines": "Asia", "Mozambique": "Africa",
    "Guyana": "South America", "Zambia": "Africa", "Serbia": "Europe", "Jamaica": "North America",
    "Belarus": "Europe", "Cyprus": "Europe", "United Kingdom": "Europe", "Guinea": "Africa",
    "Honduras": "North America", "Laos": "Asia", "Indonesia": "Asia", "Fiji": "Oceania",
    "Costa Rica": "North America", "Mauritania": "Africa", "Aruba": "North America",
    "Colombia": "South America", "Burkina Faso": "Africa", "Slovakia": "Europe",
    "Dominican Republic": "North America", "Tanzania": "Africa", "Norway": "Europe",
    "Mongolia": "Asia", "Ukraine": "Europe", "Ghana": "Africa", "Iraq": "Asia",
    "Saudi Arabia": "Asia", "Benin": "Africa", "Luxembourg": "Europe",
    "Bosnia and Herzegovina": "Europe", "Lithuania": "Europe", "Ethiopia": "Africa",
    "Libya": "Africa", "Palestine": "Asia", "Lebanon": "Asia", "Croatia": "Europe",
    "Kenya": "Africa", "Malta": "Europe", "Malawi": "Africa", "Estonia": "Europe",
    "Turkey": "Asia", "Japan": "Asia", "Nigeria": "Africa", "Federated States of Micronesia": "Oceania"
}


In [48]:
# ฟังก์ชันเพื่อแปลงประเทศเป็นทวีป
def map_country_to_continent(countries):
    continents = [country_to_continent.get(country, 'Unknown') for country in countries.split(',')]
    return continents

# เพิ่มคอลัมน์ทวีป
df['continent'] = df['item.bibrecord.head.author-group.affiliation.country'].apply(map_country_to_continent)

# คำนวณความถี่ของแต่ละทวีปและแปลงเป็นคอลัมน์ใหม่
def calculate_continent_counts(continents):
    return {continent: continents.count(continent) for continent in set(continents)}

# สร้าง DataFrame จากผลลัพธ์ความถี่
continent_counts = df['continent'].apply(calculate_continent_counts)
continent_df = pd.DataFrame(continent_counts.tolist()).fillna(0).astype(int)

# รวม DataFrame เดิมกับ DataFrame ใหม่
result_df = pd.concat([df, continent_df], axis=1)
result_df.drop(columns='continent', inplace=True)
result_df.drop(columns='item.bibrecord.head.author-group.affiliation.country', inplace=True)

result_df.rename(columns={'coredata.dc:title': 'Title', 'coredata.prism:publicationName': 'PublicationName' , 'item.bibrecord.head.citation-info.citation-language.@language':'Language', 'coredata.citedby-count':'CitedByCount'}, inplace=True)
result_df 

Unnamed: 0,Title,PublicationName,Language,CitedByCount,authkeywords.author-keyword,Asia,North America,Unknown,Europe,South America,Oceania,Africa
0,Public health and international epidemiology f...,"Radiology in Global Health: Strategies, Implem...",English,1,,2,4,0,0,0,0,0
1,Flexible Printed Active Antenna for Digital Te...,Progress in Electromagnetics Research Symposium,English,1,,0,0,2,0,0,0,0
2,Parametric study of hydrogen production via so...,Chemical Engineering Science,English,21,"Circulating fluidized bed,Computational fluid ...",5,0,0,0,0,0,0
3,Superhydrophobic coating from fluoroalkylsilan...,Applied Surface Science,English,37,"Encapsulation,Fluoroalkylsilane,Natural rubber...",6,0,0,0,0,0,0
4,Electrochemical impedance-based DNA sensor usi...,Analytica Chimica Acta,English,68,"acpcPNA,Electrochemical impedance spectroscopy...",5,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
20211,Long-chain bio-olefins production via oxidativ...,Catalysis Today,English,3,"Long-chain olefins,Mesoporous KIT-6,Oleic acid...",5,0,0,0,0,0,0
20212,Recent Developments and Applications of Microf...,Critical Reviews in Analytical Chemistry,English,11,"Biological hazards,chemical hazards,food conta...",0,0,2,0,0,0,0
20213,"Social justice, education and peacebuilding: c...",Compare,English,5,"conflict,Education,peacebuilding,social justic...",1,0,0,1,0,0,0
20214,Effects of black soldier fly (Hermetia illucen...,Journal of Applied Aquaculture,English,6,"Anabas testudineus,Black soldier fly,fish meal...",2,0,0,0,0,0,0


In [50]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import gensim.downloader as api

word2vec = api.load('glove-wiki-gigaword-50')  # ใช้ GloVe vector 50 มิติ

# กรองข้อมูลที่ไม่ว่างเปล่า
result_df = result_df[result_df['authkeywords.author-keyword'].str.strip() != ""]

# แยกคำสำคัญในแต่ละแถว
result_df['keywords_list'] = result_df['authkeywords.author-keyword'].str.split(',')

# สร้าง Word Embeddings สำหรับแต่ละแถว
def get_average_embedding(keywords):
    vectors = [word2vec[word] for word in keywords if word in word2vec]
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(word2vec.vector_size)

result_df['embedding'] = result_df['keywords_list'].apply(get_average_embedding)

# แปลง embeddings เป็น array
embeddings = np.vstack(result_df['embedding'].values)

# ใช้ K-Means Clustering
n_clusters = 3  # ปรับจำนวนคลัสเตอร์ตามต้องการ
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
result_df['cluster'] = kmeans.fit_predict(embeddings)

# คำนวณ Silhouette Score
silhouette_avg = silhouette_score(embeddings, result_df['cluster'])
print(f"Silhouette Score: {silhouette_avg}")

# # ดูผลลัพธ์
# print(result_df[['authkeywords.author-keyword', 'cluster']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result_df['keywords_list'] = result_df['authkeywords.author-keyword'].str.split(',')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result_df['embedding'] = result_df['keywords_list'].apply(get_average_embedding)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result_df['cluster'] = kmeans.fit_predi

Silhouette Score: 0.7629708434454862


In [52]:
result_df['embedding'] 

2        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
3        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
4        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
5        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
6        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
                               ...                        
20211    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
20212    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
20213    [0.615335, -0.1855575, -0.402125, -0.154675, 0...
20214    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
20215    [-0.2325725, -0.105677515, 0.34250802, -0.7427...
Name: embedding, Length: 16412, dtype: object

In [37]:
from fpdf import FPDF
pdf = FPDF()

pdf.add_page()
pdf.set_font("Arial", size=12)
pdf.cell(200, 10, txt="Unique Countries and Author Keywords", ln=True, align='C')

# Add unique countries
pdf.set_font("Arial", size=10)
pdf.cell(200, 10, txt="Unique Countries:", ln=True, align='L')
for country in unique_countries_list:
    pdf.cell(200, 10, txt=country.encode('latin-1', 'replace').decode('latin-1'), ln=True, align='L')

# Add a separator
pdf.cell(200, 10, txt="", ln=True, align='L')

# Add unique author keywords
pdf.set_font("Arial", size=10)
pdf.cell(200, 10, txt="Unique Author Keywords:", ln=True, align='L')
for keyword in unique_keywords_list:
    pdf.cell(200, 10, txt=keyword.encode('latin-1', 'replace').decode('latin-1'), ln=True, align='L')

# Save the PDF to a file
output_path = 'unique_countries_and_keywords.pdf'
pdf.output(output_path)

print(f"PDF generated and saved to {output_path}")

PDF generated and saved to unique_countries_and_keywords.pdf


In [32]:
# Initialize sets to store unique countries and keywords
unique_countries = set()
unique_keywords = set()

def safe_get(dct, keys, default=None):
    for key in keys:
        dct = dct.get(key, default)
        if dct is default:
            break
    return dct

# Iterate through each directory
for directory_path in directories:
    # Iterate through each file in the directory
    for file_name in os.listdir(directory_path):
        if file_name.endswith('.json'):
            file_path = os.path.join(directory_path, file_name)
            try:
                with open(file_path, 'r', encoding='utf-8') as file:
                    data = json.load(file)

                    # Extract countries from author-group
                    author_groups = safe_get(data, ['abstracts-retrieval-response', 'item', 'bibrecord', 'head', 'author-group'], [])
                    countries = {
                        safe_get(author, ['affiliation', 'country'], 'Unknown')
                        for author in author_groups
                    }
                    unique_countries.update(countries)

                    # Extract keywords
                    auth_keywords = safe_get(data, ['abstracts-retrieval-response', 'authkeywords', 'author-keyword'], [])
                    keywords = {keyword.get('$', '') for keyword in auth_keywords if isinstance(keyword, dict)}
                    unique_keywords.update(keywords)

            except Exception as e:
                print(f"Error processing file {file_name} in {directory_path}: {e}")

# Convert the sets to lists for easier handling
unique_countries_list = list(unique_countries)
unique_keywords_list = list(unique_keywords)

# Display the unique countries and keywords
print("Countries:", unique_countries_list)
print("Count Countries:", len(unique_countries_list))
print("\nUnique Keywords:", unique_keywords_list)
print("Count of Unique Keywords:", len(unique_keywords_list))

Error processing file 201800000.json in Project/2018: 'NoneType' object has no attribute 'get'
Error processing file 201800001.json in Project/2018: 'str' object has no attribute 'get'
Error processing file 201800005.json in Project/2018: 'str' object has no attribute 'get'
Error processing file 201800007.json in Project/2018: 'NoneType' object has no attribute 'get'
Error processing file 201800009.json in Project/2018: 'NoneType' object has no attribute 'get'
Error processing file 201800011.json in Project/2018: 'str' object has no attribute 'get'
Error processing file 201800014.json in Project/2018: 'NoneType' object has no attribute 'get'
Error processing file 201800015.json in Project/2018: 'str' object has no attribute 'get'
Error processing file 201800016.json in Project/2018: 'NoneType' object has no attribute 'get'
Error processing file 201800020.json in Project/2018: 'NoneType' object has no attribute 'get'
Error processing file 201800021.json in Project/2018: 'NoneType' objec

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x000002067C57DA90>>
Traceback (most recent call last):
  File "c:\Users\Chulin\miniconda3\Lib\site-packages\ipykernel\ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(

KeyboardInterrupt: 


Error processing file 202103409.json in Project/2021: 'NoneType' object has no attribute 'get'
Error processing file 202103411.json in Project/2021: 'str' object has no attribute 'get'
Error processing file 202103412.json in Project/2021: 'str' object has no attribute 'get'
Error processing file 202103416.json in Project/2021: 'str' object has no attribute 'get'
Error processing file 202103417.json in Project/2021: 'str' object has no attribute 'get'
Error processing file 202103418.json in Project/2021: 'str' object has no attribute 'get'
Error processing file 202103421.json in Project/2021: 'str' object has no attribute 'get'
Error processing file 202103430.json in Project/2021: 'NoneType' object has no attribute 'get'
Error processing file 202103431.json in Project/2021: 'str' object has no attribute 'get'
Error processing file 202103436.json in Project/2021: 'str' object has no attribute 'get'
Error processing file 202103448.json in Project/2021: 'str' object has no attribute 'get'


In [17]:
year = 2023
# Path to the folder containing JSON files
folder_path = f'Project/{year}'

# Initialize a list to store the rows of the DataFrame
data_rows = []

# Columns to extract
columns_to_keep = [
    'coredata.srctype',
    'coredata.eid',
    'coredata.dc:description',
    'coredata.pubmed-id',
    'coredata.prism:coverDate',
    'coredata.prism:aggregationType',
    'coredata.prism:url',
    'coredata.source-id',
    'coredata.pii',
    'coredata.citedby-count',
    'coredata.prism:volume',
    'coredata.subtype',
    'coredata.dc:title',
    'coredata.openaccess',
    'coredata.prism:issn',
    'coredata.publishercopyright',
    'coredata.article-number',
    'coredata.subtypeDescription',
    'coredata.prism:publicationName',
    'coredata.prism:doi',
    'coredata.dc:identifier',
    'coredata.dc:publisher',
    'item.bibrecord.head.citation-info.citation-language.@language',
]

# Helper function to safely extract a value from a dictionary
def safe_get(d, keys, default=None):
    try:
        for key in keys:
            d = d.get(key, {})
        return d if d else default
    except AttributeError:
        return default

# Iterate through each file in the folder
for file_name in os.listdir(folder_path):
    if file_name.startswith(str(year)) and file_name.endswith(".json"):
        file_path = os.path.join(folder_path, file_name)
        with open(file_path, 'r', encoding='utf-8') as file:
            try:
                data = json.load(file)

                # Extract countries from author-group
                author_groups = safe_get(data, ['abstracts-retrieval-response', 'item', 'bibrecord', 'head', 'author-group'], [])
                countries = [
                    safe_get(author, ['affiliation', 'country'], 'Unknown')
                    for author in author_groups
                ]
                countries_string = ','.join(countries)

                # Extract keywords
                auth_keywords = safe_get(data, ['abstracts-retrieval-response', 'authkeywords', 'author-keyword'], [])
                keywords = [keyword.get('$', '') for keyword in auth_keywords if isinstance(keyword, dict)]
                keywords_string = ','.join(keywords) if keywords else 'null'

                # Normalize JSON data and filter columns
                row_data = pd.json_normalize(data.get('abstracts-retrieval-response', {}))
                row = {col: row_data[col].iloc[0] if col in row_data else pd.NA for col in columns_to_keep}

                # Add processed fields
                row['item.bibrecord.head.author-group.affiliation.country'] = countries_string
                row['authkeywords.author-keyword'] = keywords_string

                # Append the row to data_rows
                data_rows.append(row)

            except Exception as e:
                print(f"Error processing file {file_name}: {e}")

# Convert the list of rows into a DataFrame
df = pd.DataFrame(data_rows)

# Save the DataFrame to a CSV file
output_path = f'output_{year}.csv'
df.to_csv(output_path, index=False, encoding='utf-8')

df

Unnamed: 0,coredata.srctype,coredata.eid,coredata.dc:description,coredata.pubmed-id,coredata.prism:coverDate,coredata.prism:aggregationType,coredata.prism:url,coredata.source-id,coredata.pii,coredata.citedby-count,...,coredata.publishercopyright,coredata.article-number,coredata.subtypeDescription,coredata.prism:publicationName,coredata.prism:doi,coredata.dc:identifier,coredata.dc:publisher,item.bibrecord.head.citation-info.citation-language.@language,item.bibrecord.head.author-group.affiliation.country,authkeywords.author-keyword
0,j,2-s2.0-85170238281,Pyrocatechol violet/copper ion-graphene oxide/...,37633552,2023-12-31,Journal,https://api.elsevier.com/content/abstract/scop...,17544,S0141813023032129,0,...,© 2023 Elsevier B.V.,126316,Article,International Journal of Biological Macromolec...,10.1016/j.ijbiomac.2023.126316,SCOPUS_ID:85170238281,Elsevier B.V.,English,"Thailand,Thailand,Thailand","Alzheimer's disease,Hydrogel colorimetric sens..."
1,j,2-s2.0-85169978316,"Herein, unusual and rare coordination behavior...",,2023-12-15,Journal,https://api.elsevier.com/content/abstract/scop...,24642,S0022286023015065,0,...,© 2023 Elsevier B.V.,136416,Article,Journal of Molecular Structure,10.1016/j.molstruc.2023.136416,SCOPUS_ID:85169978316,Elsevier B.V.,English,"India,India,India,India,Thailand","Copper(II),Hirshfeld calculations,Molecular do..."
2,j,2-s2.0-85165929707,This study assessed the characteristics of a z...,,2023-12-15,Journal,https://api.elsevier.com/content/abstract/scop...,29419,S0044848623006725,0,...,© 2023 Elsevier B.V.,739898,Article,Aquaculture,10.1016/j.aquaculture.2023.739898,SCOPUS_ID:85165929707,Elsevier B.V.,English,"Thailand,Thailand,Thailand,Thailand","Ammonia removal,Carrying capacity,Microbiome,O..."
3,j,2-s2.0-85167831666,A novel anaerobic baffled biofilm-membrane bio...,37582447,2023-12-10,Journal,https://api.elsevier.com/content/abstract/scop...,25349,S0048969723048738,0,...,© 2023 Elsevier B.V.,166248,Article,Science of the Total Environment,10.1016/j.scitotenv.2023.166248,SCOPUS_ID:85167831666,Elsevier B.V.,English,"Thailand,Thailand","Anaerobic baffled biofilm–MBR (AnBB-MBR),Membr..."
4,j,2-s2.0-85165076456,Safety of aquatic products is one of the impor...,,2023-12-10,Journal,https://api.elsevier.com/content/abstract/scop...,21100390177,S2352485523002700,0,...,© 2023 Elsevier B.V.,103080,Article,Regional Studies in Marine Science,10.1016/j.rsma.2023.103080,SCOPUS_ID:85165076456,Elsevier B.V.,English,"India,India,India,Israel,Thailand,India,Saudi ...","Contamination,Health risk assessment,Heavy met..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2885,j,2-s2.0-85111945558,Long-chain α-olefins (≥ C10) are normally appl...,,2023-01-01,Journal,https://api.elsevier.com/content/abstract/scop...,16377,S0920586121003497,3,...,© 2021 Elsevier B.V.,,Article,Catalysis Today,10.1016/j.cattod.2021.07.034,SCOPUS_ID:85111945558,Elsevier B.V.,English,"Thailand,Thailand,Thailand,Thailand,Thailand","Long-chain olefins,Mesoporous KIT-6,Oleic acid..."
2886,j,2-s2.0-85111408415,"Nowadays, food safety has become a major conce...",34304654,2023-01-01,Journal,https://api.elsevier.com/content/abstract/scop...,23973,,11,...,"© 2021 Taylor & Francis Group, LLC.",,Review,Critical Reviews in Analytical Chemistry,10.1080/10408347.2021.1949695,SCOPUS_ID:85111408415,Taylor and Francis Ltd.,English,"Unknown,Unknown","Biological hazards,chemical hazards,food conta..."
2887,j,2-s2.0-85110903700,Education is increasingly becoming central to ...,,2023-01-01,Journal,https://api.elsevier.com/content/abstract/scop...,12860,,5,...,© 2021 The Author(s). Published by Informa UK ...,,Article,Compare,10.1080/03057925.2021.1951666,SCOPUS_ID:85110903700,Routledge,English,"United Kingdom,Thailand","conflict,Education,peacebuilding,social justic..."
2888,j,2-s2.0-85106740832,The effects of replacing fish meal protein wit...,,2023-01-01,Journal,https://api.elsevier.com/content/abstract/scop...,13512,,6,...,© 2021 Taylor & Francis.,,Article,Journal of Applied Aquaculture,10.1080/10454438.2021.1923609,SCOPUS_ID:85106740832,Taylor and Francis Ltd.,English,"Thailand,Thailand","Anabas testudineus,Black soldier fly,fish meal..."
