In [14]:
import json
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import os 

# Define the path to the JSON file
json_file_path = 'Project/2023/202300000.json'

# Open and load the JSON file
with open(json_file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

# Extract countries from author-group
countries = [author['affiliation']['country'] for author in data['abstracts-retrieval-response']['item']['bibrecord']['head']['author-group']]

# Convert the list of countries to a single string
countries_string = ','.join(countries)

# Extract $ values from authkeywords.author-keyword
keywords = [keyword['$'] for keyword in data['abstracts-retrieval-response']['authkeywords']['author-keyword']]

# Convert the list of keywords to a single string
keywords_string = ','.join(keywords)

# Normalize JSON data to a DataFrame
df = pd.json_normalize(data['abstracts-retrieval-response'])

# Define columns to keep
columns_to_keep = [
    'coredata.srctype',
    'coredata.eid',
    'coredata.dc:description',
    'coredata.pubmed-id',
    'coredata.prism:coverDate',
    'coredata.prism:aggregationType',
    'coredata.prism:url',
    'coredata.source-id',
    'coredata.pii',
    'coredata.citedby-count',
    'coredata.prism:volume',
    'coredata.subtype',
    'coredata.dc:title',
    'coredata.openaccess',
    'coredata.prism:issn',
    'coredata.publishercopyright',
    'coredata.article-number',
    'coredata.subtypeDescription',
    'coredata.prism:publicationName',
    'coredata.prism:doi',
    'coredata.dc:identifier',
    'coredata.dc:publisher',
    'item.bibrecord.head.citation-info.citation-language.@language',
]

# Add the countries_string and keywords_string to the DataFrame
df['item.bibrecord.head.author-group.affiliation.country'] = countries_string
df['authkeywords.author-keyword'] = keywords_string

# Keep only the specified columns
df = df[columns_to_keep + ['item.bibrecord.head.author-group.affiliation.country', 'authkeywords.author-keyword']]


df.to_csv('output.csv', index=False)

# Display the DataFrame to verify the contents
df.shape
df

Unnamed: 0,coredata.srctype,coredata.eid,coredata.dc:description,coredata.pubmed-id,coredata.prism:coverDate,coredata.prism:aggregationType,coredata.prism:url,coredata.source-id,coredata.pii,coredata.citedby-count,...,coredata.publishercopyright,coredata.article-number,coredata.subtypeDescription,coredata.prism:publicationName,coredata.prism:doi,coredata.dc:identifier,coredata.dc:publisher,item.bibrecord.head.citation-info.citation-language.@language,item.bibrecord.head.author-group.affiliation.country,authkeywords.author-keyword
0,j,2-s2.0-85170238281,Pyrocatechol violet/copper ion-graphene oxide/...,37633552,2023-12-31,Journal,https://api.elsevier.com/content/abstract/scop...,17544,S0141813023032129,0,...,© 2023 Elsevier B.V.,126316,Article,International Journal of Biological Macromolec...,10.1016/j.ijbiomac.2023.126316,SCOPUS_ID:85170238281,Elsevier B.V.,English,"Thailand,Thailand,Thailand","Alzheimer's disease,Hydrogel colorimetric sens..."


In [None]:
# Path to the folder containing JSON files
folder_path = 'Project/2023'

# Initialize a list to store the rows of the DataFrame
data_rows = []

# Columns to extract
columns_to_keep = [
    'coredata.dc:title',
    'coredata.prism:publicationName',
    'item.bibrecord.head.citation-info.citation-language.@language',
    'coredata.citedby-count'
]

# Helper function to safely extract a value from a dictionary
def safe_get(d, keys, default=None):
    try:
        for key in keys:
            d = d.get(key, {})
        return d if d else default
    except AttributeError:
        return default

# Iterate through each file in the folder
for file_name in os.listdir(folder_path):
    if file_name.startswith("2023") and file_name.endswith(".json"):
        file_path = os.path.join(folder_path, file_name)
        with open(file_path, 'r', encoding='utf-8') as file:
            try:
                data = json.load(file)

                # Extract countries from author-group
                author_groups = safe_get(data, ['abstracts-retrieval-response', 'item', 'bibrecord', 'head', 'author-group'], [])
                countries = [
                    safe_get(author, ['affiliation', 'country'], 'Unknown')
                    for author in author_groups
                ]
                countries_string = ','.join(countries)

                # Extract keywords
                auth_keywords = safe_get(data, ['abstracts-retrieval-response', 'authkeywords', 'author-keyword'], [])
                keywords = [keyword.get('$', '') for keyword in auth_keywords if isinstance(keyword, dict)]
                keywords_string = ','.join(keywords)

                # Normalize JSON data and filter columns
                row_data = pd.json_normalize(data.get('abstracts-retrieval-response', {}))
                row = {col: row_data[col].iloc[0] if col in row_data else pd.NA for col in columns_to_keep}

                # Add processed fields
                row['item.bibrecord.head.author-group.affiliation.country'] = countries_string
                row['authkeywords.author-keyword'] = keywords_string

                # Append the row to data_rows
                data_rows.append(row)

            except Exception as e:
                print(f"Error processing file {file_name}: {e}")

# Convert the list of rows into a DataFrame
df = pd.DataFrame(data_rows)

# Save the DataFrame to a CSV file
output_path = 'outputAllplayground.csv'
df.to_csv(output_path, index=False, encoding='utf-8')

# Display the DataFrame for verification
print(df)

                                      coredata.dc:title  \
0     Graphene oxide-alginate hydrogel-based indicat...   
1     Rare coordination behavior of triethanolamine ...   
2     Total ammonia nitrogen removal and microbial c...   
3     Effects of microaeration and sludge recirculat...   
4     Bioaccumulation of heavy metals in commerciall...   
...                                                 ...   
2885  Long-chain bio-olefins production via oxidativ...   
2886  Recent Developments and Applications of Microf...   
2887  Social justice, education and peacebuilding: c...   
2888  Effects of black soldier fly (Hermetia illucen...   
2889  Effects of remittances on household poverty an...   

                         coredata.prism:publicationName  \
0     International Journal of Biological Macromolec...   
1                        Journal of Molecular Structure   
2                                           Aquaculture   
3                      Science of the Total Environment

In [18]:
# Define the path to the directory containing the JSON files
directory_path = 'Project/2023/'

# Initialize sets to store unique countries and keywords
unique_countries = set()
unique_keywords = set()

# Iterate through each file in the directory
for file_name in os.listdir(directory_path):
    if file_name.endswith('.json'):
        file_path = os.path.join(directory_path, file_name)
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                data = json.load(file)

                # Extract countries from author-group
                author_groups = safe_get(data, ['abstracts-retrieval-response', 'item', 'bibrecord', 'head', 'author-group'], [])
                countries = {
                    safe_get(author, ['affiliation', 'country'], 'Unknown')
                    for author in author_groups
                }
                unique_countries.update(countries)

                # Extract keywords
                auth_keywords = safe_get(data, ['abstracts-retrieval-response', 'authkeywords', 'author-keyword'], [])
                keywords = {keyword.get('$', '') for keyword in auth_keywords if isinstance(keyword, dict)}
                unique_keywords.update(keywords)

        except Exception as e:
            print(f"Error processing file {file_name}: {e}")

# Convert the sets to lists for easier handling
unique_countries_list = list(unique_countries)
unique_keywords_list = list(unique_keywords)

# Display the unique countries and keywords
print("Countries:", unique_countries_list)
print("Count Countries:", len(unique_countries_list))
print("\nUnique Keywords:", unique_keywords_list)
print("Count of Unique Keywords:", len(unique_keywords_list))

Countries: ['Sweden', 'South Africa', 'Armenia', 'Netherlands', 'Maldives', 'Macao', 'Slovenia', 'Bangladesh', 'Mexico', 'Taiwan', 'Georgia', 'Bhutan', 'South Sudan', 'Algeria', 'Zimbabwe', 'Iran', 'Hungary', 'Latvia', 'Belgium', 'Denmark', 'Italy', 'Albania', 'Iceland', 'Democratic Republic Congo', 'Madagascar', 'Guatemala', 'Botswana', 'Togo', 'Bulgaria', 'Sri Lanka', 'South Korea', 'Sudan', 'Kyrgyzstan', 'Brunei Darussalam', 'North Macedonia', 'Qatar', 'United States', 'Bahrain', 'Mali', 'India', 'Venezuela', 'Russian Federation', 'Viet Nam', 'Canada', 'Oman', 'Ireland', 'Panama', 'Senegal', 'Cuba', 'Montenegro', 'Yemen', 'Peru', 'Pakistan', 'Finland', 'Paraguay', 'Tunisia', 'Kazakhstan', 'Moldova', 'Austria', 'Malaysia', 'Rwanda', 'Hong Kong', 'Cameroon', 'Czech Republic', 'Cambodia', 'Gabon', 'Niger', 'Portugal', 'Switzerland', 'New Zealand', 'Singapore', 'Trinidad and Tobago', 'Syrian Arab Republic', 'Jordan', 'France', "Cote d'Ivoire", 'Argentina', 'United Arab Emirates', 'Sierr

In [17]:
year = 2023
# Path to the folder containing JSON files
folder_path = f'Project/{year}'

# Initialize a list to store the rows of the DataFrame
data_rows = []

# Columns to extract
columns_to_keep = [
    'coredata.srctype',
    'coredata.eid',
    'coredata.dc:description',
    'coredata.pubmed-id',
    'coredata.prism:coverDate',
    'coredata.prism:aggregationType',
    'coredata.prism:url',
    'coredata.source-id',
    'coredata.pii',
    'coredata.citedby-count',
    'coredata.prism:volume',
    'coredata.subtype',
    'coredata.dc:title',
    'coredata.openaccess',
    'coredata.prism:issn',
    'coredata.publishercopyright',
    'coredata.article-number',
    'coredata.subtypeDescription',
    'coredata.prism:publicationName',
    'coredata.prism:doi',
    'coredata.dc:identifier',
    'coredata.dc:publisher',
    'item.bibrecord.head.citation-info.citation-language.@language',
]

# Helper function to safely extract a value from a dictionary
def safe_get(d, keys, default=None):
    try:
        for key in keys:
            d = d.get(key, {})
        return d if d else default
    except AttributeError:
        return default

# Iterate through each file in the folder
for file_name in os.listdir(folder_path):
    if file_name.startswith(str(year)) and file_name.endswith(".json"):
        file_path = os.path.join(folder_path, file_name)
        with open(file_path, 'r', encoding='utf-8') as file:
            try:
                data = json.load(file)

                # Extract countries from author-group
                author_groups = safe_get(data, ['abstracts-retrieval-response', 'item', 'bibrecord', 'head', 'author-group'], [])
                countries = [
                    safe_get(author, ['affiliation', 'country'], 'Unknown')
                    for author in author_groups
                ]
                countries_string = ','.join(countries)

                # Extract keywords
                auth_keywords = safe_get(data, ['abstracts-retrieval-response', 'authkeywords', 'author-keyword'], [])
                keywords = [keyword.get('$', '') for keyword in auth_keywords if isinstance(keyword, dict)]
                keywords_string = ','.join(keywords) if keywords else 'null'

                # Normalize JSON data and filter columns
                row_data = pd.json_normalize(data.get('abstracts-retrieval-response', {}))
                row = {col: row_data[col].iloc[0] if col in row_data else pd.NA for col in columns_to_keep}

                # Add processed fields
                row['item.bibrecord.head.author-group.affiliation.country'] = countries_string
                row['authkeywords.author-keyword'] = keywords_string

                # Append the row to data_rows
                data_rows.append(row)

            except Exception as e:
                print(f"Error processing file {file_name}: {e}")

# Convert the list of rows into a DataFrame
df = pd.DataFrame(data_rows)

# Save the DataFrame to a CSV file
output_path = f'output_{year}.csv'
df.to_csv(output_path, index=False, encoding='utf-8')

df

Unnamed: 0,coredata.srctype,coredata.eid,coredata.dc:description,coredata.pubmed-id,coredata.prism:coverDate,coredata.prism:aggregationType,coredata.prism:url,coredata.source-id,coredata.pii,coredata.citedby-count,...,coredata.publishercopyright,coredata.article-number,coredata.subtypeDescription,coredata.prism:publicationName,coredata.prism:doi,coredata.dc:identifier,coredata.dc:publisher,item.bibrecord.head.citation-info.citation-language.@language,item.bibrecord.head.author-group.affiliation.country,authkeywords.author-keyword
0,j,2-s2.0-85170238281,Pyrocatechol violet/copper ion-graphene oxide/...,37633552,2023-12-31,Journal,https://api.elsevier.com/content/abstract/scop...,17544,S0141813023032129,0,...,© 2023 Elsevier B.V.,126316,Article,International Journal of Biological Macromolec...,10.1016/j.ijbiomac.2023.126316,SCOPUS_ID:85170238281,Elsevier B.V.,English,"Thailand,Thailand,Thailand","Alzheimer's disease,Hydrogel colorimetric sens..."
1,j,2-s2.0-85169978316,"Herein, unusual and rare coordination behavior...",,2023-12-15,Journal,https://api.elsevier.com/content/abstract/scop...,24642,S0022286023015065,0,...,© 2023 Elsevier B.V.,136416,Article,Journal of Molecular Structure,10.1016/j.molstruc.2023.136416,SCOPUS_ID:85169978316,Elsevier B.V.,English,"India,India,India,India,Thailand","Copper(II),Hirshfeld calculations,Molecular do..."
2,j,2-s2.0-85165929707,This study assessed the characteristics of a z...,,2023-12-15,Journal,https://api.elsevier.com/content/abstract/scop...,29419,S0044848623006725,0,...,© 2023 Elsevier B.V.,739898,Article,Aquaculture,10.1016/j.aquaculture.2023.739898,SCOPUS_ID:85165929707,Elsevier B.V.,English,"Thailand,Thailand,Thailand,Thailand","Ammonia removal,Carrying capacity,Microbiome,O..."
3,j,2-s2.0-85167831666,A novel anaerobic baffled biofilm-membrane bio...,37582447,2023-12-10,Journal,https://api.elsevier.com/content/abstract/scop...,25349,S0048969723048738,0,...,© 2023 Elsevier B.V.,166248,Article,Science of the Total Environment,10.1016/j.scitotenv.2023.166248,SCOPUS_ID:85167831666,Elsevier B.V.,English,"Thailand,Thailand","Anaerobic baffled biofilm–MBR (AnBB-MBR),Membr..."
4,j,2-s2.0-85165076456,Safety of aquatic products is one of the impor...,,2023-12-10,Journal,https://api.elsevier.com/content/abstract/scop...,21100390177,S2352485523002700,0,...,© 2023 Elsevier B.V.,103080,Article,Regional Studies in Marine Science,10.1016/j.rsma.2023.103080,SCOPUS_ID:85165076456,Elsevier B.V.,English,"India,India,India,Israel,Thailand,India,Saudi ...","Contamination,Health risk assessment,Heavy met..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2885,j,2-s2.0-85111945558,Long-chain α-olefins (≥ C10) are normally appl...,,2023-01-01,Journal,https://api.elsevier.com/content/abstract/scop...,16377,S0920586121003497,3,...,© 2021 Elsevier B.V.,,Article,Catalysis Today,10.1016/j.cattod.2021.07.034,SCOPUS_ID:85111945558,Elsevier B.V.,English,"Thailand,Thailand,Thailand,Thailand,Thailand","Long-chain olefins,Mesoporous KIT-6,Oleic acid..."
2886,j,2-s2.0-85111408415,"Nowadays, food safety has become a major conce...",34304654,2023-01-01,Journal,https://api.elsevier.com/content/abstract/scop...,23973,,11,...,"© 2021 Taylor & Francis Group, LLC.",,Review,Critical Reviews in Analytical Chemistry,10.1080/10408347.2021.1949695,SCOPUS_ID:85111408415,Taylor and Francis Ltd.,English,"Unknown,Unknown","Biological hazards,chemical hazards,food conta..."
2887,j,2-s2.0-85110903700,Education is increasingly becoming central to ...,,2023-01-01,Journal,https://api.elsevier.com/content/abstract/scop...,12860,,5,...,© 2021 The Author(s). Published by Informa UK ...,,Article,Compare,10.1080/03057925.2021.1951666,SCOPUS_ID:85110903700,Routledge,English,"United Kingdom,Thailand","conflict,Education,peacebuilding,social justic..."
2888,j,2-s2.0-85106740832,The effects of replacing fish meal protein wit...,,2023-01-01,Journal,https://api.elsevier.com/content/abstract/scop...,13512,,6,...,© 2021 Taylor & Francis.,,Article,Journal of Applied Aquaculture,10.1080/10454438.2021.1923609,SCOPUS_ID:85106740832,Taylor and Francis Ltd.,English,"Thailand,Thailand","Anabas testudineus,Black soldier fly,fish meal..."


In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

# สมมุติว่า 'authkeywords' คือคอลัมน์ของคำสำคัญ
tfidf_vectorizer = TfidfVectorizer(max_features=500)  # จำกัดจำนวนฟีเจอร์
tfidf_features = tfidf_vectorizer.fit_transform(df['authkeywords.author-keyword'])
tfidf_df = pd.DataFrame(
    tfidf_features.toarray(), 
    columns=tfidf_vectorizer.get_feature_names_out()
)

df = pd.concat([df.reset_index(drop=True), tfidf_df.reset_index(drop=True)], axis=1)

In [16]:
df

Unnamed: 0,coredata.srctype,coredata.eid,coredata.dc:description,coredata.pubmed-id,coredata.prism:coverDate,coredata.prism:aggregationType,coredata.prism:url,coredata.source-id,coredata.pii,coredata.citedby-count,...,cu2,cysteine,disease,displacement,go,hydrogel,indicator,pv,sensor,urinary
0,j,2-s2.0-85170238281,Pyrocatechol violet/copper ion-graphene oxide/...,37633552,2023-12-31,Journal,https://api.elsevier.com/content/abstract/scop...,17544,S0141813023032129,0,...,0.242536,0.242536,0.242536,0.242536,0.242536,0.485071,0.242536,0.242536,0.242536,0.242536
