In [None]:
import json
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import os 
from sklearn.cluster import KMeans

# Location of the single JSON record processed by this cell
json_file_path = 'Project/2023/202300000.json'

# Read the file as UTF-8 text and parse it into nested dicts/lists.
# The handle is closed automatically when the `with` block exits.
with open(json_file_path, mode='r', encoding='utf-8') as file:
    data = json.loads(file.read())

def _as_list(value):
    """Return *value* as a list: [] for None, [value] for a lone item."""
    if value is None:
        return []
    if isinstance(value, list):
        return value
    return [value]


def _extract_countries(response):
    """Collect the affiliation country of every author group in *response*.

    Groups without an 'affiliation' dict or without a non-empty 'country'
    are skipped instead of raising.
    """
    head = response['item']['bibrecord']['head']
    # NOTE(review): records of this shape appear to serialise a single author
    # group as a bare dict rather than a one-element list, and may omit the
    # key entirely -- normalise all three cases before iterating.
    found = []
    for group in _as_list(head.get('author-group')):
        affiliation = group.get('affiliation') if isinstance(group, dict) else None
        country = affiliation.get('country') if isinstance(affiliation, dict) else None
        if country:
            found.append(country)
    return found


def _extract_keywords(response):
    """Collect the '$' text of every author keyword in *response*.

    Returns an empty list when 'authkeywords' is missing or null.
    """
    authkeywords = response.get('authkeywords')
    if not isinstance(authkeywords, dict):
        return []
    # A lone keyword may arrive as a dict instead of a list of dicts.
    entries = _as_list(authkeywords.get('author-keyword'))
    return [
        entry['$']
        for entry in entries
        if isinstance(entry, dict) and entry.get('$')
    ]


# Extract countries from author-group
countries = _extract_countries(data['abstracts-retrieval-response'])

# Convert the list of countries to a single comma-separated string
countries_string = ','.join(countries)

# Extract $ values from authkeywords.author-keyword
keywords = _extract_keywords(data['abstracts-retrieval-response'])

# Convert the list of keywords to a single comma-separated string
keywords_string = ','.join(keywords)

# Flatten the nested response into a DataFrame whose column names are the
# dot-joined key paths (e.g. 'coredata.dc:title')
df = pd.json_normalize(data['abstracts-retrieval-response'])

# Flattened columns to carry over into the output table
columns_to_keep = [
    'coredata.srctype',
    'coredata.eid',
    'coredata.dc:description',
    'coredata.pubmed-id',
    'coredata.prism:coverDate',
    'coredata.prism:aggregationType',
    'coredata.prism:url',
    'coredata.source-id',
    'coredata.pii',
    'coredata.citedby-count',
    'coredata.prism:volume',
    'coredata.subtype',
    'coredata.dc:title',
    'coredata.openaccess',
    'coredata.prism:issn',
    'coredata.publishercopyright',
    'coredata.article-number',
    'coredata.subtypeDescription',
    'coredata.prism:publicationName',
    'coredata.prism:doi',
    'coredata.dc:identifier',
    'coredata.dc:publisher',
    'item.bibrecord.head.citation-info.citation-language.@language',
]

# Add the countries_string and keywords_string to the DataFrame; these
# replace any raw nested values json_normalize produced under the same names
df['item.bibrecord.head.author-group.affiliation.country'] = countries_string
df['authkeywords.author-keyword'] = keywords_string

# Keep only the specified columns, in this order. reindex() is used instead
# of df[[...]] so that a record lacking one of the fields yields a NaN
# column rather than a KeyError, keeping the output schema stable.
df = df.reindex(
    columns=columns_to_keep
    + [
        'item.bibrecord.head.author-group.affiliation.country',
        'authkeywords.author-keyword',
    ]
)


# Write the table to output.csv without the index column
df.to_csv('output.csv', index=False)

# Display the DataFrame to verify the contents. A notebook only echoes the
# last expression of a cell, so the shape has to be printed explicitly.
print(df.shape)
df