In [400]:
import json
import pandas as pd

# Load the ChatGPT classification results; later cells iterate this dict
# as PUBID -> categories.
# The encoding is pinned so decoding does not depend on the platform default.
with open('results_subsample/chatgpt_results_hierarchical2_description_title50.json', 'r', encoding='utf-8') as fh:
    data = json.load(fh)

In [300]:
# One prediction record per publication is accumulated here.
results = []

for pubid, categories in data.items():
    kept = []                          # disciplines whose final score clears 0.5
    best_name, best_score = None, 0    # overall maximum, used only as a fallback

    for details in categories.values():
        category_score = float(details['score']) / 100

        # Rank this category's disciplines from strongest to weakest.
        ranked = sorted(
            details['disciplines'].items(),
            key=lambda item: float(item[1]) / 100 * category_score,
            reverse=True,
        )
        # Final score = category score x discipline score, both rescaled from percentages.
        scored = [(name, category_score * float(raw) / 100) for name, raw in ranked]

        # At most three disciplines per category may pass the 0.5 cut-off.
        kept.extend([name for name, final in scored if final > 0.5][:3])

        # Track the best-scoring discipline seen so far across all categories.
        for name, final in scored:
            if final > best_score:
                best_name, best_score = name, final

    # Nothing passed the cut-off: fall back to the single best discipline.
    if not kept and best_name:
        kept.append(best_name)

    results.append({'PUBID': pubid, 'chatgpt': kept})

# Tabulate the predictions.
results_df = pd.DataFrame(results)

results_df

Unnamed: 0,PUBID,chatgpt
0,485845700001,"[sociology, law, psychology, religion, history..."
1,490797000001,"[chemical engineering, mechanical engineering,..."
2,494419600001,"[electrical engineering, electronic engineerin..."
3,513452400001,"[educational sciences, psychology, other socia..."
4,536595900001,"[sociology, law, political science, religion]"
...,...,...
1564,795066100001,[philosophy and ethics]
1565,869323200001,"[philosophy and ethics, psychology]"
1566,743578300007,"[economics and business, political science, so..."
1567,772869700001,"[industrial biotechnology, other engineering a..."


In [281]:
# Sanity check: show the container type of the object currently bound to `data`.
type(data)

dict

In [401]:
import json
import pandas as pd

# Relies on `data` (the parsed JSON dict) already being in the kernel namespace.

# One prediction record per publication is accumulated here.
results = []

for pubid, categories in data.items():
    candidates = []      # (discipline, final score, category) triples above the cut-off
    best = (None, 0)     # best-scoring discipline overall, used as a fallback

    for category, details in categories.items():
        category_score = float(details['score']) / 100
        ranked = sorted(
            details['disciplines'].items(),
            key=lambda item: category_score * float(item[1]) / 100,
            reverse=True,
        )

        taken = 0  # how many disciplines this category has contributed so far
        for discipline, score in ranked:
            final_score = category_score * float(score) / 100
            if final_score > best[1]:
                best = (discipline, final_score)
            if not (final_score > 0.5 and taken < 3):
                continue
            candidates.append((discipline, final_score, category))
            taken += 1

    # Strongest candidates first, then apply the overall cap of five.
    candidates.sort(key=lambda triple: triple[1], reverse=True)

    retained_disciplines = []
    per_category = {}
    for discipline, _, category in candidates:
        if len(retained_disciplines) >= 5:
            break
        already = per_category.get(category, 0)
        if already < 3:
            retained_disciplines.append(discipline)
            per_category[category] = already + 1

    # Nothing passed the cut-off: fall back to the single best discipline.
    if not retained_disciplines and best[0]:
        retained_disciplines.append(best[0])

    results.append({'PUBID': pubid, 'chatgpt': retained_disciplines})

# Tabulate the predictions.
results_df = pd.DataFrame(results)

In [374]:
import json
import pandas as pd

# Selection thresholds, on the 0-1 scale (raw scores in the JSON are percentages).
CATEGORY_THRESHOLD = 0.7   # a category scoring above this always contributes its top discipline
COMBINED_THRESHOLD = 0.6   # category x discipline score a discipline needs to be kept on its own
MAX_DISCIPLINES = 5        # cap on the number of predictions per publication

# Initialize a list to store the results
results = []

# Iterate through each PUBID and its categories
for pubid, categories in data.items():
    all_disciplines = []  # (discipline, combined score) pairs above COMBINED_THRESHOLD
    highest_score_discipline_backup = (None, 0)  # Backup: highest combined score overall
    highest_score_disciplines_per_category = []  # Top discipline of each category above CATEGORY_THRESHOLD

    # Process each category
    for category, details in categories.items():
        category_score = float(details['score']) / 100

        # Identify the discipline with the highest raw score in this category
        # (falls back to (None, 0) when the category lists no disciplines).
        highest_score_discipline_in_category = max(
            details['disciplines'].items(), key=lambda x: float(x[1]), default=(None, 0)
        )
        top_discipline = highest_score_discipline_in_category[0]

        # Combined score of that top discipline
        discipline_score = float(highest_score_discipline_in_category[1]) / 100 if top_discipline else 0
        combined_score_highest_discipline = category_score * discipline_score

        # Update backup discipline if this one has the highest combined score so far
        if combined_score_highest_discipline > highest_score_discipline_backup[1]:
            highest_score_discipline_backup = (top_discipline, combined_score_highest_discipline)

        # Keep the category's top discipline when the category itself is strong.
        # Categories without disciplines are skipped so that None never ends up
        # in the prediction list.
        if category_score > CATEGORY_THRESHOLD and top_discipline is not None:
            highest_score_disciplines_per_category.append((top_discipline, combined_score_highest_discipline))

        # Additionally, keep every discipline whose combined score exceeds COMBINED_THRESHOLD
        for discipline, score in details['disciplines'].items():
            final_score = category_score * float(score) / 100
            if final_score > COMBINED_THRESHOLD:
                all_disciplines.append((discipline, final_score))

    # Build a deterministic ranking: category-top picks first, then the remaining
    # threshold-passing disciplines, each group ordered by descending combined score.
    # dict.fromkeys removes duplicates while preserving that order, so the cap below
    # always keeps the same, highest-priority entries (a set would truncate arbitrarily).
    ranked = (
        sorted(highest_score_disciplines_per_category, key=lambda pair: pair[1], reverse=True)
        + sorted(all_disciplines, key=lambda pair: pair[1], reverse=True)
    )
    selected_disciplines = list(dict.fromkeys(discipline for discipline, _ in ranked))

    # If no discipline is selected based on the criteria, use the backup discipline
    if not selected_disciplines and highest_score_discipline_backup[0]:
        selected_disciplines.append(highest_score_discipline_backup[0])

    # Limit to MAX_DISCIPLINES at most
    selected_disciplines = selected_disciplines[:MAX_DISCIPLINES]

    # Append the result
    results.append({'PUBID': pubid, 'chatgpt': selected_disciplines})

# Convert the results to a DataFrame
results_df = pd.DataFrame(results)
results_df

Unnamed: 0,PUBID,chatgpt
0,485845700001,"[history, arts (arts, history of arts, perform..."
1,490797000001,"[chemical sciences, physical sciences, chemica..."
2,494419600001,"[physical sciences, electrical engineering, el..."
3,513452400001,[educational sciences]
4,536595900001,"[law, political science, sociology]"
...,...,...
1564,795066100001,[philosophy and ethics]
1565,869323200001,[philosophy and ethics]
1566,743578300007,"[political science, economics and business]"
1567,772869700001,"[industrial biotechnology, economics and busin..."


In [402]:
# Number of predicted disciplines per publication.
results_df['prediction_length'] = results_df['chatgpt'].map(len)

In [403]:
# Distribution of how many disciplines were predicted per publication.
results_df['prediction_length'].value_counts()

prediction_length
5    832
4    249
3    226
2    158
1    104
Name: count, dtype: int64

In [404]:
import pandas as pd
# Reference table: publication id, abstract and the labels used as ground truth
# further down; every column is read as a string.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
test_data = pd.read_csv(
    'H:/data/WoS_data/data_gpt50.csv',
    dtype=str,
    usecols=['PUBID', 'ABSTRACT', 'final_disciplines'],
)

In [405]:
# Preview the first 20 rows of the reference table.
test_data.head(n=20)

Unnamed: 0,PUBID,ABSTRACT,final_disciplines
0,485845700001,"Over the past three decades, along with the ex...",Other humanities
1,490797000001,This paper reports on CFD simulations of in-cy...,Environmental engineering;Chemical engineering...
2,494419600001,The demand for lean protein is the main reason...,"Electrical engineering, electronic engineering..."
3,513452400001,"In this article, 119 faculty members from 10 S...",Educational sciences
4,514500800001,Co-teaching is a widely used service delivery ...,Educational sciences;Health sciences
5,534713700004,Mindfulness has shown beneficial relationships...,Psychology;Basic medicine
6,536595900001,We examine public perceptions of the police co...,Law
7,548588800001,This study explores how televised football in ...,Media and communications;Sociology;Other socia...
8,549312100001,Petrographic analysis was conducted on pottery...,Archaeology;Other social sciences
9,550529800001,Local governments nationwide have been adoptin...,Economics and business;Social and economic geo...


In [406]:
# Pull the abstract text for one specific publication.
test_data.loc[test_data['PUBID'] == '579654000001', 'ABSTRACT'].values

array(['Tourism research on the LGBTIQ + communities has grown over the years, entering mainstream discussions as a segment of interest. This growing focus reflects greater societal acceptance and acknowledgement of the systemic inequalities that challenge their rights. The landscape of current scholarship, though important to academic literature, policy and practice, has not been explored. On this premise, and under the umbrella of social sustainability, a systematic qualitative review of scholarship on the LGBTIQ + community and tourism was conducted with Q1- and Q2-ranked travel and tourism journals (Scimago Journal & Country Rank) as a basis. Articles were analysed to identify the sampling parameters and their topic foci. The findings suggest the literature focuses on sexually diverse groups (gays and lesbians) who are open about their identity, with limited consideration given to bisexual or gender diverse travellers (intersex and transgender). The topics and language used have al

In [407]:
# Attach the predictions to the reference rows; the inner join keeps only
# publications present in both tables.
test_data = pd.merge(test_data, results_df, how='inner', on='PUBID')

In [408]:
# Lower-case the reference labels (presumably so they compare equal to the
# lower-case predicted labels - confirm against the raw CSV).
test_data['final_disciplines'] = test_data.final_disciplines.str.lower()

# Predicted spelling -> reference spelling for labels that differ between the two sources.
replace_dict = {
    'arts (arts, history of arts, performing arts, music)':'art (arts, history of arts, performing arts, music)'
}

# Apply the spelling map and drop duplicate labels. dict.fromkeys keeps the
# original prediction order, so the column is identical on every run
# (a set would reorder the labels arbitrarily).
test_data['chatgpt'] = test_data.chatgpt.apply(
    lambda labels: list(dict.fromkeys(replace_dict.get(label, label) for label in labels))
)

# NOTE: from here on `data` is this DataFrame, no longer the JSON dict loaded at
# the top of the notebook; reload the JSON before re-running the scoring cells.
data = test_data

In [409]:
# Turn the ';'-separated label string into a list of labels.
# Values that are not strings (e.g. lists produced by an earlier run of this
# cell) are passed through unchanged, so re-running the cell is harmless.
data['final_disciplines'] = data.final_disciplines.apply(
    lambda value: value.split(';') if isinstance(value, str) else value
)

In [410]:
# Flag rows where at least one predicted label begins with 'art'.
data['arts'] = data['chatgpt'].map(
    lambda labels: any(label.startswith('art') for label in labels)
)

In [411]:
# Show only the rows flagged as containing an 'art...' prediction.
data.loc[data['arts']]

Unnamed: 0,PUBID,ABSTRACT,final_disciplines,chatgpt,prediction_length,arts
27,618159400001,Despite playing a central role in establishing...,"[archaeology, art (arts, history of arts, perf...","[art (arts, history of arts, performing arts, ...",4,True
30,625269500001,Discussions around social mobility have increa...,[sociology],"[other social sciences, sociology, art (arts, ...",5,True
67,663197200001,This article considers the world-famous rivalr...,"[history, other engineering and technologies, ...","[other natural sciences, art (arts, history of...",5,True
74,668000600001,"Some limited scholarship, focused on the US as...",[political science],"[art (arts, history of arts, performing arts, ...",5,True
75,669109700001,This paper calls for renewed consideration of ...,[sociology],"[media and communications, sociology, art (art...",5,True
...,...,...,...,...,...,...
1547,884909200001,We argue that cross-national variability in ho...,"[law, sociology, economics and business, polit...","[law, sociology, art (arts, history of arts, p...",5,True
1553,885879700006,Junaluska is a historically Black community in...,[archaeology],"[archaeology, sociology, art (arts, history of...",5,True
1555,886113800002,Five glass objects of the Asian family of pota...,[archaeology],"[art (arts, history of arts, performing arts, ...",3,True
1556,886113800006,This article discusses a newly discovered manu...,"[art (arts, history of arts, performing arts, ...","[mathematics, materials engineering, art (arts...",5,True


In [412]:
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report

true_labels = list(data.final_disciplines)
predicted_labels = list(data.chatgpt)

# Fit the binarizer on the union of reference and predicted labels.
# Fitting on the reference labels alone makes transform() drop every predicted
# label it has not seen, so those false positives would vanish from the metrics.
mlb = MultiLabelBinarizer()
mlb.fit(true_labels + predicted_labels)
true_label_binarized = mlb.transform(true_labels)
predicted_labels_binarized = mlb.transform(predicted_labels)

# zero_division=0 reports 0 (without warnings) for classes or samples that have
# no reference or no predicted labels.
print(classification_report(true_label_binarized, predicted_labels_binarized, target_names=mlb.classes_, zero_division=0))

                                                                         precision    recall  f1-score   support

                                   agriculture, forestry, and fisheries       0.26      0.77      0.38        56
                                               animal and dairy science       0.51      0.73      0.60        45
                                                            archaeology       0.92      0.56      0.70        41
                    art (arts, history of arts, performing arts, music)       0.22      0.56      0.31        57
                                                         basic medicine       0.31      0.42      0.35       100
                                                    biological sciences       0.50      0.71      0.59       194
                                                   chemical engineering       0.12      0.87      0.20        45
                                                      chemical sciences       0.32      0.74   



In [413]:
from sklearn.metrics import accuracy_score
# With multilabel indicator matrices this is subset accuracy: a row only counts
# as correct when its whole predicted label set equals the reference set.
accuracy_score(y_true=true_label_binarized, y_pred=predicted_labels_binarized)

0.017208413001912046

In [414]:
# Labels known to the fitted binarizer; these are the column names of the
# binarized matrices and the row names of the classification report.
mlb.classes_

array(['agriculture, forestry, and fisheries', 'animal and dairy science',
       'archaeology',
       'art (arts, history of arts, performing arts, music)',
       'basic medicine', 'biological sciences', 'chemical engineering',
       'chemical sciences', 'civil engineering', 'clinical medicine',
       'computer and information sciences',
       'earth and related environmental sciences',
       'economics and business', 'educational sciences',
       'electrical engineering, electronic engineering, information engineering',
       'environmental biotechnology', 'environmental engineering',
       'health biotechnology', 'health sciences', 'history',
       'industrial biotechnology', 'languages and linguistics', 'law',
       'literature', 'materials engineering', 'mathematics',
       'mechanical engineering', 'media and communications',
       'medical engineering', 'nano-technology',
       'other agricultural sciences',
       'other engineering and technologies', 'other human

In [415]:
def filter_labels(labels_list):
    """Return a copy of ``labels_list`` without labels that start with 'other'.

    ``labels_list`` is an iterable of per-sample label collections. The result
    is a list with one list per sample, in the same order; a sample whose
    labels all start with 'other' becomes an empty list.
    """
    return [
        [name for name in sample if not name.startswith('other')]
        for sample in labels_list
    ]

# Drop the catch-all 'other ...' labels from both the reference and the predictions.
true_labels_filtered = filter_labels(list(data.final_disciplines))
predicted_labels_filtered = filter_labels(list(data.chatgpt))

# Fit the binarizer on the union of reference and predicted labels.
# Fitting on the reference labels alone makes transform() drop every predicted
# label it has not seen, so those false positives would vanish from the metrics.
mlb = MultiLabelBinarizer()
mlb.fit(true_labels_filtered + predicted_labels_filtered)

# Transform labels
true_label_binarized = mlb.transform(true_labels_filtered)
predicted_labels_binarized = mlb.transform(predicted_labels_filtered)

# Generate and print classification report. Filtering can leave samples with no
# labels at all; zero_division=0 scores those as 0 instead of emitting warnings.
print(classification_report(true_label_binarized, predicted_labels_binarized, target_names=mlb.classes_, zero_division=0))

                                                                         precision    recall  f1-score   support

                                   agriculture, forestry, and fisheries       0.26      0.77      0.38        56
                                               animal and dairy science       0.51      0.73      0.60        45
                                                            archaeology       0.92      0.56      0.70        41
                    art (arts, history of arts, performing arts, music)       0.22      0.56      0.31        57
                                                         basic medicine       0.31      0.42      0.35       100
                                                    biological sciences       0.50      0.71      0.59       194
                                                   chemical engineering       0.12      0.87      0.20        45
                                                      chemical sciences       0.32      0.74   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [16]:
# Side-by-side view of abstract, reference labels and predicted labels.
data.loc[:, ['ABSTRACT', 'final_disciplines', 'chatgpt']]

Unnamed: 0,ABSTRACT,final_disciplines,chatgpt
0,"Over the past three decades, along with the ex...",[other humanities],"[law, sociology, art (arts, history of arts, p..."
1,This paper reports on CFD simulations of in-cy...,"[environmental engineering, chemical engineeri...","[chemical sciences, materials engineering, che..."
2,The demand for lean protein is the main reason...,"[electrical engineering, electronic engineerin...","[physical sciences, electrical engineering, el..."
3,"In this article, 119 faculty members from 10 S...",[educational sciences],"[other social sciences, psychology, educationa..."
4,Co-teaching is a widely used service delivery ...,"[educational sciences, health sciences]","[other medical sciences, psychology, education..."
...,...,...,...
1564,Simple Summary Use of the antidiabetic drug me...,"[biological sciences, clinical medicine]","[health biotechnology, basic medicine, clinica..."
1565,"In 1970, Los Angeles-based artist Frederick Ev...","[art (arts, history of arts, performing arts, ...","[other social sciences, materials engineering,..."
1566,Multiple sclerosis (MS) is an autoimmune disea...,[basic medicine],"[basic medicine, physical sciences, medical en..."
1567,This work presents a nonlinear model predictiv...,"[environmental engineering, chemical engineeri...","[chemical sciences, agricultural biotechnology..."
