In [45]:
import json
import pandas as pd

# Load the JSON data
with open('results_subsample/chatgpt_results_non_hierarchical.json', 'r') as file:
    data = json.load(file)

# Initialize a list to store the results
results = []

# Iterate through each PUBID and its categories
for pubid, categories in data.items():
    # Initialize a list to store the disciplines with a final score above 0.5
    retained_disciplines = []
    highest_score = 0
    highest_discipline = None
    
    for discipline, score in categories.items():
        final_score = float(score)/ 100
        if final_score >= 0.75:
            retained_disciplines.append(discipline)
        # Update the highest score discipline if no discipline is above 0.5
        if final_score > highest_score:
            highest_score = final_score
            highest_discipline = discipline
                
    # If no discipline has a final score above 0.5, add the discipline with the highest score
    if not retained_disciplines and highest_discipline:
        retained_disciplines.append(highest_discipline)
    
    # Append the result
    results.append({'PUBID': pubid, 'chatgpt': retained_disciplines})

# Convert the results to a DataFrame
results_df = pd.DataFrame(results)

results_df

Unnamed: 0,PUBID,chatgpt
0,485845700001,"[psychology, law]"
1,490797000001,"[mechanical engineering, chemical engineering,..."
2,494419600001,"[electrical engineering, electronic engineerin..."
3,513452400001,"[educational sciences, psychology]"
4,514500800001,[educational sciences]
...,...,...
1577,716452700014,"[earth and related environmental sciences, phy..."
1578,838931800001,"[chemical engineering, materials engineering]"
1579,716814500001,[physical sciences]
1580,720241400001,[economics and business]


In [46]:
results_df['prediction_length'] = results_df.chatgpt.apply(len)

In [47]:
results_df.prediction_length.value_counts()

prediction_length
1    841
2    424
3    217
4     65
5     35
Name: count, dtype: int64

In [48]:
import pandas as pd
test_data = pd.read_csv('H:/data/WoS_data/data_gpt50.csv', usecols=['PUBID', 'ABSTRACT', 'final_disciplines'], dtype=str)

In [49]:
test_data = test_data.merge(results_df, on='PUBID')

In [50]:
test_data['final_disciplines'] = test_data.final_disciplines.str.lower()
replace_dict = {
    'arts (arts, history of arts, performing arts, music)':'art (arts, history of arts, performing arts, music)'
}
test_data['chatgpt'] = test_data.chatgpt.apply(lambda x: list(set(replace_dict.get(y,y) for y in x)))
data = test_data

In [51]:
data['final_disciplines'] = data.final_disciplines.apply(lambda x: x.split(';'))

In [52]:
data['arts'] = data.chatgpt.apply(lambda x: any(el.startswith('art') for el in x))

In [53]:
data[data.arts]

Unnamed: 0,PUBID,ABSTRACT,final_disciplines,chatgpt,prediction_length,arts
124,703409800001,"For the Bauhaus, the much written about short-...","[art (arts, history of arts, performing arts, ...","[history, art (arts, history of arts, performi...",2,True
133,709310900001,"In the late colonial period, a few Black Mozam...","[art (arts, history of arts, performing arts, ...","[history, art (arts, history of arts, performi...",2,True
184,729321800001,"Dr. Padma Venkataraman, alias A. Mangai, is an...","[social and economic geography, political scie...","[art (arts, history of arts, performing arts, ...",1,True
195,732589300001,"The English word 'mountebank', borrowed from t...","[literature, history]","[history, art (arts, history of arts, performi...",2,True
255,744116600008,This article discusses the textile pedagogies ...,[sociology],"[social and economic geography, art (arts, his...",2,True
276,746718900001,This article explores the theatrical adaptatio...,"[art (arts, history of arts, performing arts, ...","[art (arts, history of arts, performing arts, ...",2,True
286,747985900001,"Arts-based data from a theatre play, Apple Tim...","[educational sciences, languages and linguistics]","[social and economic geography, art (arts, his...",2,True
423,763597700005,"Over the course of its Anglo-Indian career, th...","[social and economic geography, literature, hi...","[history, art (arts, history of arts, performi...",2,True
460,768810100014,"Pelos (2016), by Microlocas collective, is a b...",[literature],"[art (arts, history of arts, performing arts, ...",1,True
496,773284200001,"Zadie Smith's The Autograph Man, a novel that ...","[literature, religion]","[art (arts, history of arts, performing arts, ...",1,True


In [54]:
from sklearn.preprocessing import MultiLabelBinarizer
true_labels = list(data.final_disciplines)
predicted_labels = list(data.chatgpt)
mlb = MultiLabelBinarizer()
true_label_binarized = mlb.fit_transform(true_labels)
predicted_labels_binarized = mlb.transform(predicted_labels)
from sklearn.metrics import classification_report
print(classification_report(true_label_binarized, predicted_labels_binarized, target_names=mlb.classes_))

                                                                         precision    recall  f1-score   support

                                   agriculture, forestry, and fisheries       0.28      0.55      0.37        56
                                               animal and dairy science       0.77      0.73      0.75        45
                                                            archaeology       0.79      0.56      0.66        41
                    art (arts, history of arts, performing arts, music)       0.53      0.28      0.37        57
                                                         basic medicine       0.57      0.04      0.07       100
                                                    biological sciences       0.66      0.53      0.59       194
                                                   chemical engineering       0.28      0.62      0.39        45
                                                      chemical sciences       0.57      0.19   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [55]:
from sklearn.metrics import accuracy_score
accuracy_score(true_label_binarized, predicted_labels_binarized)

0.10197578075207138

In [56]:
def filter_labels(labels_list):
    filtered_labels = []
    for labels in labels_list:
        filtered_labels.append([label for label in labels if not label.startswith('other')])
    return filtered_labels

# Filter both true and predicted labels
true_labels_filtered = filter_labels(list(data.final_disciplines))
predicted_labels_filtered = filter_labels(list(data.chatgpt))

# Initialize MultiLabelBinarizer
mlb = MultiLabelBinarizer()

# Transform labels
true_label_binarized = mlb.fit_transform(true_labels_filtered)
predicted_labels_binarized = mlb.transform(predicted_labels_filtered)

# Generate and print classification report
print(classification_report(true_label_binarized, predicted_labels_binarized, target_names=mlb.classes_))

                                                                         precision    recall  f1-score   support

                                   agriculture, forestry, and fisheries       0.28      0.55      0.37        56
                                               animal and dairy science       0.77      0.73      0.75        45
                                                            archaeology       0.79      0.56      0.66        41
                    art (arts, history of arts, performing arts, music)       0.53      0.28      0.37        57
                                                         basic medicine       0.57      0.04      0.07       100
                                                    biological sciences       0.66      0.53      0.59       194
                                                   chemical engineering       0.28      0.62      0.39        45
                                                      chemical sciences       0.57      0.19   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
