In [30]:
import json
import pandas as pd

# Load the JSON data
with open('results/ChatGPT/intermediate_results/chatgpt_results_hierarchical2_description_title.json', 'r') as file:
    data = json.load(file)

### Option 1: total score > 0.5 and max 3 disciplines per area

In [33]:
import json
import pandas as pd

# Assuming the JSON data is loaded as shown previously

# Initialize a list to store the results
results = []

# Iterate through each PUBID and its categories
for pubid, categories in data.items():
    all_disciplines = []
    highest_score_discipline = (None, 0)  # Tuple to track the discipline with the highest score and its score

    # Process each category
    for category, details in categories.items():
        category_score = float(details['score']) / 100
        sorted_disciplines = sorted(details['disciplines'].items(), key=lambda x: category_score * float(x[1]) / 100, reverse=True)

        added_per_category = 0
        for discipline, score in sorted_disciplines:
            final_score = category_score * float(score) / 100
            if final_score > highest_score_discipline[1]:  # Update the highest scoring discipline if needed
                highest_score_discipline = (discipline, final_score)
            if final_score > 0.5 and added_per_category < 3:
                all_disciplines.append((discipline, final_score, category))
                added_per_category += 1

    # Enforce the total limit of 5 disciplines per PUBID
    all_disciplines_sorted = sorted(all_disciplines, key=lambda x: x[1], reverse=True)

    retained_disciplines = []
    category_counts = {}
    for discipline, _, category in all_disciplines_sorted:
        if len(retained_disciplines) < 5:
            if category_counts.get(category, 0) < 3:
                retained_disciplines.append(discipline)
                category_counts[category] = category_counts.get(category, 0) + 1

    # If no discipline passes the threshold, add the one with the highest score
    if not retained_disciplines and highest_score_discipline[0]:
        retained_disciplines.append(highest_score_discipline[0])

    # Append the result
    results.append({'PUBID': pubid, 'chatgpt': retained_disciplines})

# Convert the results to a DataFrame
results_df = pd.DataFrame(results)

### Option 2. If area score > 0.7, retain the top discipline of the area independent of its score.

In [5]:
import json
import pandas as pd

# Initialize a list to store the results
results = []

# Iterate through each PUBID and its categories
for pubid, categories in data.items():
    all_disciplines = []
    highest_score_discipline_backup = (None, 0)  # Backup: highest score overall
    highest_score_disciplines_per_category = []  # For categories with score > 0.7

    # Process each category
    for category, details in categories.items():
        category_score = float(details['score']) / 100

        # Identify the discipline with the highest score in this category
        highest_score_discipline_in_category = max(details['disciplines'].items(), key=lambda x: float(x[1]), default=(None, 0))

        # Calculate the combined score for the highest score discipline in this category
        discipline_score = float(highest_score_discipline_in_category[1]) / 100 if highest_score_discipline_in_category[0] else 0
        combined_score_highest_discipline = category_score * discipline_score
        
        # Update backup discipline if this one has the highest combined score so far
        if combined_score_highest_discipline > highest_score_discipline_backup[1]:
            highest_score_discipline_backup = (highest_score_discipline_in_category[0], combined_score_highest_discipline)

        # Keep the discipline if category score > 0.7
        if category_score > 0.7:
            highest_score_disciplines_per_category.append((highest_score_discipline_in_category[0], combined_score_highest_discipline))

        # Additionally, add all disciplines with a combined score > 0.7
        for discipline, score in details['disciplines'].items():
            final_score = category_score * float(score) / 100
            if final_score > 0.6:
                all_disciplines.append((discipline, final_score))

    # Combine the selected disciplines, prioritizing unique entries
    selected_disciplines = set([discipline for discipline, _ in highest_score_disciplines_per_category])
    selected_disciplines.update([discipline for discipline, _ in all_disciplines if discipline not in selected_disciplines])

    # If no discipline is selected based on the criteria, use the backup discipline
    if not selected_disciplines and highest_score_discipline_backup[0]:
        selected_disciplines.add(highest_score_discipline_backup[0])

    # Limit to 5 disciplines at most
    selected_disciplines = list(selected_disciplines)[:5]

    # Append the result
    results.append({'PUBID': pubid, 'chatgpt': list(selected_disciplines)})

# Convert the results to a DataFrame
results_df = pd.DataFrame(results)
results_df

Unnamed: 0,PUBID,chatgpt
0,486688800001,[mathematics]
1,490689400001,"[agriculture, forestry, and fisheries, earth a..."
2,494013500001,[political science]
3,497937300001,"[biological sciences, animal and dairy science..."
4,500234900001,"[literature, other engineering and technologie..."
...,...,...
185,843237200001,"[educational sciences, psychology, health scie..."
186,866351500058,[clinical medicine]
187,871947700028,"[other social sciences, archaeology, sociology]"
188,693728100001,"[economics and business, history, social and e..."


In [6]:
results_df['prediction_length'] = results_df.chatgpt.apply(len)

In [7]:
results_df.prediction_length.value_counts()

prediction_length
1    62
2    44
3    31
5    29
4    24
Name: count, dtype: int64

In [8]:
import pandas as pd
test_data = pd.read_csv('H:/data/WoS_data/data_gpt50.csv', usecols=['PUBID', 'ABSTRACT', 'final_disciplines'], dtype=str)

In [9]:
test_data.head(20)

Unnamed: 0,PUBID,ABSTRACT,final_disciplines
0,485845700001,"Over the past three decades, along with the ex...",Other humanities
1,490797000001,This paper reports on CFD simulations of in-cy...,Environmental engineering;Chemical engineering...
2,494419600001,The demand for lean protein is the main reason...,"Electrical engineering, electronic engineering..."
3,513452400001,"In this article, 119 faculty members from 10 S...",Educational sciences
4,514500800001,Co-teaching is a widely used service delivery ...,Educational sciences;Health sciences
5,534713700004,Mindfulness has shown beneficial relationships...,Psychology;Basic medicine
6,536595900001,We examine public perceptions of the police co...,Law
7,548588800001,This study explores how televised football in ...,Media and communications;Sociology;Other socia...
8,549312100001,Petrographic analysis was conducted on pottery...,Archaeology;Other social sciences
9,550529800001,Local governments nationwide have been adoptin...,Economics and business;Social and economic geo...


In [10]:
test_data[test_data.PUBID == '579654000001'].ABSTRACT.values

array(['Tourism research on the LGBTIQ + communities has grown over the years, entering mainstream discussions as a segment of interest. This growing focus reflects greater societal acceptance and acknowledgement of the systemic inequalities that challenge their rights. The landscape of current scholarship, though important to academic literature, policy and practice, has not been explored. On this premise, and under the umbrella of social sustainability, a systematic qualitative review of scholarship on the LGBTIQ + community and tourism was conducted with Q1- and Q2-ranked travel and tourism journals (Scimago Journal & Country Rank) as a basis. Articles were analysed to identify the sampling parameters and their topic foci. The findings suggest the literature focuses on sexually diverse groups (gays and lesbians) who are open about their identity, with limited consideration given to bisexual or gender diverse travellers (intersex and transgender). The topics and language used have al

In [11]:
test_data = test_data.merge(results_df, on='PUBID')

In [22]:
test_data['final_disciplines'] = test_data.final_disciplines.str.lower()
replace_dict = {
    'arts (arts, history of arts, performing arts, music)':'art (arts, history of arts, performing arts, music)'
}
test_data['chatgpt'] = test_data.chatgpt.apply(lambda x: list(set(replace_dict.get(y,y) for y in x)))
data = test_data

In [23]:
data['final_disciplines'] = data.final_disciplines.apply(lambda x: x.split(';'))

AttributeError: 'float' object has no attribute 'split'

In [24]:
data['arts'] = data.chatgpt.apply(lambda x: any(el.startswith('art') for el in x))

In [25]:
data[data.arts]

Unnamed: 0,PUBID,ABSTRACT,final_disciplines,chatgpt,prediction_length,arts


In [26]:
from sklearn.preprocessing import MultiLabelBinarizer
true_labels = list(data.final_disciplines)
predicted_labels = list(data.chatgpt)
mlb = MultiLabelBinarizer()
true_label_binarized = mlb.fit_transform(true_labels)
predicted_labels_binarized = mlb.transform(predicted_labels)
from sklearn.metrics import classification_report
print(classification_report(true_label_binarized, predicted_labels_binarized, target_names=mlb.classes_))

TypeError: 'float' object is not iterable

In [27]:
from sklearn.metrics import accuracy_score
accuracy_score(true_label_binarized, predicted_labels_binarized)

1.0

In [28]:
mlb.classes_

AttributeError: 'MultiLabelBinarizer' object has no attribute 'classes_'

In [29]:
def filter_labels(labels_list):
    filtered_labels = []
    for labels in labels_list:
        filtered_labels.append([label for label in labels if not label.startswith('other')])
    return filtered_labels

# Filter both true and predicted labels
true_labels_filtered = filter_labels(list(data.final_disciplines))
predicted_labels_filtered = filter_labels(list(data.chatgpt))

# Initialize MultiLabelBinarizer
mlb = MultiLabelBinarizer()

# Transform labels
true_label_binarized = mlb.fit_transform(true_labels_filtered)
predicted_labels_binarized = mlb.transform(predicted_labels_filtered)

# Generate and print classification report
print(classification_report(true_label_binarized, predicted_labels_binarized, target_names=mlb.classes_))

TypeError: 'float' object is not iterable

In [20]:
data[['ABSTRACT', 'final_disciplines', 'chatgpt']]

Unnamed: 0,ABSTRACT,final_disciplines,chatgpt
0,"This essay examines how, through narrating the...","[literature, other humanities]","[literature, economics and business]"
