In [1]:
import json
import pandas as pd

# Load the JSON data
with open('results/ChatGPT/intermediate_results/chatgpt_hierarchical_pairwise.json', 'r') as file:
    data = json.load(file)
    
lvl1 = len(['natural sciences', 'engineering and technology', 'agricultural sciences', 'medical and health sciences', 'social sciences', 'humanities'])-1
lvl12 = json.load(open('discipline_structure.json', 'r'))
lvl12 = {key: len(list(values.keys()))-1 for key, values in lvl12.items()}

# Initialize a list to store the results
results = []

# Iterate through each PUBID and its categories
for pubid, categories in data.items():
    # Initialize a list to store the disciplines with a final score above 0.5
    retained_disciplines = []
    highest_score = 0
    highest_discipline = None
    
    for category, details in categories.items():
        category_score = float(details['score']) / lvl1
        # Sort disciplines by their final score in descending order
        sorted_disciplines = sorted(details['subdisciplines'].items(), key=lambda x: float(x[1]) / 100 * category_score, reverse=True)
        
        added_disciplines = 0  # Counter for added disciplines per category
        for discipline, score in sorted_disciplines:
            final_score = category_score * float(score) / lvl12[category]
            # Check if the final score is above 0.5 and limit to top 3 per category
            if final_score >= 0.6: #and added_disciplines < 3:
                retained_disciplines.append(discipline)
                
                if pubid == '516167800001':
                    print(discipline, final_score)
                added_disciplines += 1
            # Update the highest score discipline if no discipline is above 0.5
            if final_score > highest_score:
                highest_score = final_score
                highest_discipline = discipline
                
                
    # If no discipline has a final score above 0.5, add the discipline with the highest score
    if not retained_disciplines and highest_discipline:
        retained_disciplines.append(highest_discipline)
    
    # Append the result
    results.append({'PUBID': pubid, 'chatgpt': retained_disciplines})

# Convert the results to a DataFrame
results_df = pd.DataFrame(results)

results_df

industrial biotechnology 1.0
chemical engineering 0.8
materials engineering 0.8
environmental biotechnology 0.6
biological sciences 0.8000000000000002
earth and related environmental sciences 0.6666666666666666


Unnamed: 0,PUBID,chatgpt
0,486688800001,"[civil engineering, electrical engineering, el..."
1,490689400001,"[agriculture, forestry, and fisheries, animal ..."
2,494013500001,"[media and communications, law, political scie..."
3,497937300001,"[agriculture, forestry, and fisheries, other a..."
4,500234900001,"[materials engineering, nano-technology, liter..."
...,...,...
186,843237200001,"[clinical medicine, basic medicine]"
187,866351500058,"[basic medicine, clinical medicine, biological..."
188,871947700028,"[clinical medicine, health sciences, sociology..."
189,755547200001,"[electrical engineering, electronic engineerin..."


In [2]:
lvl1

5

In [3]:
data['500234900001']

{'agricultural sciences': {'score': 0, 'subdisciplines': {}},
 'engineering and technology': {'score': 4,
  'subdisciplines': {'civil engineering': 7,
   'electrical engineering, electronic engineering, information engineering': 6,
   'mechanical engineering': 4,
   'chemical engineering': 1,
   'materials engineering': 8,
   'medical engineering': 5,
   'environmental engineering': 4,
   'environmental biotechnology': 1,
   'industrial biotechnology': 4,
   'nano-technology': 8,
   'other engineering and technologies': 7}},
 'humanities': {'score': 4,
  'subdisciplines': {'history': 4,
   'archaeology': 0,
   'languages and linguistics': 2,
   'literature': 7,
   'philosophy and ethics': 4,
   'religion': 4,
   'arts (arts, history of arts, performing arts, music)': 6,
   'other humanities': 1}},
 'medical and health sciences': {'score': 2, 'subdisciplines': {}},
 'natural sciences': {'score': 1, 'subdisciplines': {}},
 'social sciences': {'score': 4,
  'subdisciplines': {'psychology'

In [4]:
results_df['prediction_length'] = results_df.chatgpt.apply(len)

In [5]:
results_df.prediction_length.value_counts()

prediction_length
4    53
5    47
6    39
7    25
3    12
2    12
8     2
1     1
Name: count, dtype: int64

In [6]:
import pandas as pd
test_data = pd.read_csv('H:/data/WoS_data/data_gpt10.csv', usecols=['PUBID', 'ABSTRACT', 'final_disciplines'], dtype=str)

In [7]:
test_data = test_data.merge(results_df, on='PUBID')

In [8]:
test_data['final_disciplines'] = test_data.final_disciplines.str.lower()
replace_dict = {
    'arts (arts, history of arts, performing arts, music)':'art (arts, history of arts, performing arts, music)'
}
test_data['chatgpt'] = test_data.chatgpt.apply(lambda x: list(set(replace_dict.get(y,y) for y in x)))
data = test_data

In [9]:
data['final_disciplines'] = data.final_disciplines.apply(lambda x: x.split(';'))

In [10]:
data['arts'] = data.chatgpt.apply(lambda x: any(el.startswith('art') for el in x))

In [11]:
data[data.arts]

Unnamed: 0,PUBID,ABSTRACT,final_disciplines,chatgpt,prediction_length,arts
4,500234900001,Glamour is usually theorized within fashion st...,"[literature, physical sciences, other humaniti...","[literature, media and communications, nano-te...",5,True
5,515343800001,The article intends to give a synoptic view of...,"[archaeology, art (arts, history of arts, perf...","[other social sciences, media and communicatio...",6,True
7,521535000001,This paper contributes to a theoretical discus...,"[other humanities, media and communications, a...","[literature, other social sciences, sociology,...",6,True
8,524076900001,This paper provides a qualitative account of V...,"[languages and linguistics, educational scienc...","[other social sciences, sociology, media and c...",6,True
21,592416200001,This paper develops a framework based on categ...,"[art (arts, history of arts, performing arts, ...","[literature, art (arts, history of arts, perfo...",5,True
27,611026300001,When its entire staff resigned in protest of m...,"[art (arts, history of arts, performing arts, ...","[literature, other humanities, other social sc...",6,True
30,619768700001,Cinematic studies support that the fiction fil...,"[sociology, other humanities, psychology, othe...","[literature, sociology, media and communicatio...",5,True
33,624081100001,This article examines life writing as a resear...,[other humanities],"[literature, sociology, media and communicatio...",5,True
41,639517000001,Supported by an environment that is increasing...,"[computer and information sciences, media and ...","[literature, art (arts, history of arts, perfo...",6,True
66,670822100001,The task of archaeologists is to study the mat...,"[sociology, other humanities, computer and inf...","[media and communications, archaeology, art (a...",5,True


In [12]:
data

Unnamed: 0,PUBID,ABSTRACT,final_disciplines,chatgpt,prediction_length,arts
0,486688800001,"In this paper, we first study the theory of th...","[mathematics, biological sciences, earth and r...","[earth and related environmental sciences, civ...",5,False
1,490689400001,Babassu is a palm tree that occupies a vast ar...,"[agriculture, forestry, and fisheries, animal ...","[agriculture, forestry, and fisheries, basic m...",4,False
2,494013500001,"This study, from a sociosemiotic perspective, ...","[sociology, other humanities, languages and li...","[political science, media and communications, ...",3,False
3,497937300001,The effects of four deferred grazing strategie...,"[animal and dairy science, veterinary science,...","[agriculture, forestry, and fisheries, basic m...",4,False
4,500234900001,Glamour is usually theorized within fashion st...,"[literature, physical sciences, other humaniti...","[literature, media and communications, nano-te...",5,True
...,...,...,...,...,...,...
186,831664900001,The question of women's ordination to offices ...,"[history, religion]","[other social sciences, sociology, media and c...",5,False
187,833337700001,Recent studies on the Buddhist threefold wisdo...,"[other social sciences, philosophy and ethics,...","[religion, basic medicine, clinical medicine, ...",4,False
188,843237200001,The article presents Ignatian (Jesuit) spiritu...,"[educational sciences, religion, health sciences]","[basic medicine, clinical medicine]",2,False
189,866351500058,Administration of human umbilical cord-derived...,"[basic medicine, health biotechnology, biologi...","[basic medicine, clinical medicine, biological...",4,False


In [13]:
from sklearn.preprocessing import MultiLabelBinarizer
true_labels = list(data.final_disciplines)
predicted_labels = list(data.chatgpt)
mlb = MultiLabelBinarizer()
true_label_binarized = mlb.fit_transform(true_labels)
predicted_labels_binarized = mlb.transform(predicted_labels)
from sklearn.metrics import classification_report
print(classification_report(true_label_binarized, predicted_labels_binarized, target_names=mlb.classes_))

                                                                         precision    recall  f1-score   support

                                   agriculture, forestry, and fisheries       0.42      0.50      0.45        10
                                               animal and dairy science       0.91      0.71      0.80        14
                                                            archaeology       0.78      0.58      0.67        12
                    art (arts, history of arts, performing arts, music)       0.42      0.57      0.48        14
                                                         basic medicine       0.25      0.94      0.40        17
                                                    biological sciences       0.57      0.62      0.60        32
                                                   chemical engineering       0.43      0.82      0.56        11
                                                      chemical sciences       0.69      0.30   



In [14]:
from sklearn.metrics import accuracy_score
accuracy_score(true_label_binarized, predicted_labels_binarized)

0.0

In [15]:
mlb.classes_

array(['agriculture, forestry, and fisheries', 'animal and dairy science',
       'archaeology',
       'art (arts, history of arts, performing arts, music)',
       'basic medicine', 'biological sciences', 'chemical engineering',
       'chemical sciences', 'civil engineering', 'clinical medicine',
       'computer and information sciences',
       'earth and related environmental sciences',
       'economics and business', 'educational sciences',
       'electrical engineering, electronic engineering, information engineering',
       'environmental biotechnology', 'environmental engineering',
       'health biotechnology', 'health sciences', 'history',
       'industrial biotechnology', 'languages and linguistics', 'law',
       'literature', 'materials engineering', 'mathematics',
       'mechanical engineering', 'media and communications',
       'medical engineering', 'nano-technology',
       'other agricultural sciences',
       'other engineering and technologies', 'other human

In [16]:
def filter_labels(labels_list):
    filtered_labels = []
    for labels in labels_list:
        filtered_labels.append([label for label in labels if not label.startswith('other')])
    return filtered_labels

# Filter both true and predicted labels
true_labels_filtered = filter_labels(list(data.final_disciplines))
predicted_labels_filtered = filter_labels(list(data.chatgpt))

# Initialize MultiLabelBinarizer
mlb = MultiLabelBinarizer()

# Transform labels
true_label_binarized = mlb.fit_transform(true_labels_filtered)
predicted_labels_binarized = mlb.transform(predicted_labels_filtered)

# Generate and print classification report
print(classification_report(true_label_binarized, predicted_labels_binarized, target_names=mlb.classes_))



                                                                         precision    recall  f1-score   support

                                   agriculture, forestry, and fisheries       0.42      0.50      0.45        10
                                               animal and dairy science       0.91      0.71      0.80        14
                                                            archaeology       0.78      0.58      0.67        12
                    art (arts, history of arts, performing arts, music)       0.42      0.57      0.48        14
                                                         basic medicine       0.25      0.94      0.40        17
                                                    biological sciences       0.57      0.62      0.60        32
                                                   chemical engineering       0.43      0.82      0.56        11
                                                      chemical sciences       0.69      0.30   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [17]:
data[data.PUBID == '516167800001'].final_disciplines.values

array([list(['other agricultural sciences', 'environmental biotechnology', 'environmental engineering', 'chemical sciences'])],
      dtype=object)

In [18]:
data[data.PUBID == '516167800001'].chatgpt.values

array([list(['environmental biotechnology', 'chemical engineering', 'industrial biotechnology', 'earth and related environmental sciences', 'biological sciences', 'materials engineering'])],
      dtype=object)