In [1]:
import pandas as pd
from ast import literal_eval

In [2]:
import json

# Load the nested taxonomy. The encoding is given explicitly so the result does
# not depend on the platform's default text encoding.
with open('discipline_structure.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Top-level key -> list of the keys nested directly under it.
data_simplified = {area: list(children) for area, children in data.items()}

# Inverse lookup: nested key -> the top-level key it sits under.
# NOTE(review): get_area lower-cases names before looking them up here, so the
# JSON keys are presumably already lower-case — confirm against the file.
data_simplified_reversed = {
    discipline: area
    for area, disciplines in data_simplified.items()
    for discipline in disciplines
}

def get_area(discipline, mapping=None):
    """Return the broad area a discipline belongs to.

    The lower-cased discipline name is looked up as-is first. Only when that
    fails is the legacy spelling fix-up applied ('Art' -> 'Arts',
    'art ' -> 'arts ', then undoing the accidental 'Eartsh'), so names that
    legitimately contain "Art" are no longer mangled before the lookup.

    Parameters
    ----------
    discipline : str
        Discipline name. Non-string values (for example the NaN produced by
        exploding an empty list) are returned untouched instead of raising.
    mapping : dict, optional
        Lower-cased discipline name -> area. Defaults to the module-level
        ``data_simplified_reversed``.

    Returns
    -------
    The mapped area when the discipline is known; otherwise the input
    ``discipline`` unchanged.
    """
    if mapping is None:
        mapping = data_simplified_reversed

    # Nothing sensible can be looked up for non-text values.
    if not isinstance(discipline, str):
        return discipline

    exact_key = discipline.lower()
    if exact_key in mapping:
        return mapping[exact_key]

    # Fallback: the dataset spells some names with "Art" where the taxonomy
    # uses "Arts"; the last replace repairs "Earth" if the first ones hit it.
    patched_key = (
        discipline.replace('Art', 'Arts')
        .replace('art ', 'arts ')
        .replace('Eartsh', 'Earth')
        .lower()
    )
    return mapping.get(patched_key, discipline)

In [3]:
def describe_area(df: pd.DataFrame, train=False, limit=500000):
    """Print a per-area summary of how abstracts are spread over disciplines.

    ``df`` is exploded on ``final_disciplines`` (one row per abstract/discipline
    pair) and every discipline is mapped to its area with ``get_area``. For each
    area, in order of first appearance, the function prints the disciplines
    found, ``describe()`` of the per-discipline ``ABSTRACT`` counts and
    ``describe()`` of the per-discipline mean ``len_final``.

    Parameters
    ----------
    df : pd.DataFrame
        Needs the columns ``final_disciplines``, ``ABSTRACT`` and ``len_final``.
    train : bool
        When truthy, also print how many disciplines of each area have at
        least ``limit`` abstracts.
    limit : int
        Count threshold used when ``train`` is truthy.

    Returns
    -------
    None. The report is written to stdout. The running total of disciplines
    at or above ``limit`` is printed at the end regardless of ``train``.
    """
    exploded = df.explode('final_disciplines')
    exploded['area'] = exploded['final_disciplines'].apply(get_area)

    over_limit_total = 0
    for area in exploded['area'].unique():
        print(f'---------------------- \n {area}')

        # Rows of this area, with fully identical rows collapsed.
        in_area = exploded.loc[exploded['area'] == area].drop_duplicates()
        discipline_names = list(in_area['final_disciplines'].unique())
        print(f'Disciplines: {discipline_names}')

        per_discipline = in_area.groupby('final_disciplines').agg(
            {'ABSTRACT': 'count', 'len_final': 'mean'}
        )
        count_summary = per_discipline['ABSTRACT'].describe()
        label_summary = per_discipline['len_final'].describe()
        print(f'Count information \n {count_summary}')
        print(f'Nb labels information \n {label_summary}')

        if not train:
            continue

        reached_limit = per_discipline['ABSTRACT'] >= limit
        over_limit_here = len(per_discipline[reached_limit])
        over_limit_total += over_limit_here
        print(f'Nb disciplines that get to the limit: {over_limit_here}')

    print(f'Total disciplines over threshold: {over_limit_total}')

def prep_train(df):
    """Return a prepared copy of a raw split with parsed labels.

    The columns ``text`` and ``label`` are renamed to ``ABSTRACT`` and
    ``final_disciplines``, string labels are parsed with ``literal_eval`` and a
    ``len_final`` column holding the number of labels per row is added.

    The input frame is never modified, and labels that are already parsed are
    left as they are, so ``x = prep_train(x)`` can be re-run safely.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with a ``label`` column (or an already renamed
        ``final_disciplines`` column) and, optionally, a ``text`` column.

    Returns
    -------
    pd.DataFrame
        New frame with ``ABSTRACT``, ``final_disciplines`` and ``len_final``.
    """
    # rename() returns a new frame, so the caller's object stays untouched;
    # on an already prepared frame it is simply a no-op.
    renamed = df.rename(columns={'text': 'ABSTRACT', 'label': 'final_disciplines'})

    # Only text needs parsing; values that are already lists pass through.
    labels = renamed['final_disciplines'].apply(
        lambda value: literal_eval(value) if isinstance(value, str) else value
    )
    return renamed.assign(final_disciplines=labels, len_final=labels.apply(len))

## Total data

In [17]:
# Load the complete dataset (the file name suggests it has not been split yet).
# NOTE(review): unpickling can execute arbitrary code — only load pickle files
# that come from a trusted source.
total = pd.read_pickle('data_to_be_split.pickle')

In [18]:
# Preview the frame; pandas truncates the rendering of a frame this large.
total

Unnamed: 0,ABSTRACT,final_disciplines,len_final
0,"In recent decades, racial disparities in K-12 ...","[Psychology, Sociology, Educational sciences, ...",4
1,A medium ash coal is used for the study. Four ...,"[Chemical engineering, Earth and related envir...",4
2,The separation of coal and gangue is an import...,"[Computer and information sciences, Electrical...",5
3,"Two different stockpiles, one loose and anothe...","[Environmental engineering, Chemical engineeri...",5
4,Family leisure is a context in which individua...,"[Other social sciences, Sociology, Media and c...",4
...,...,...,...
2119147,"The Pacific Ocean is the largest in the world,...",[Biological sciences],1
2119148,We have extended the Helfrich's spontaneous cu...,"[Physical sciences, Mathematics, Chemical scie...",3
2119149,Modern cosmology is broadly based on the Cosmo...,[Physical sciences],1
2119150,Using the data from the Chinese Household Inco...,[Economics and business],1


In [19]:
# Summary statistics of len_final (presumably the number of disciplines per
# abstract — confirm against how the pickle was produced).
total.len_final.describe()

count    2.077486e+06
mean     2.412943e+00
std      1.158512e+00
min      1.000000e+00
25%      1.000000e+00
50%      2.000000e+00
75%      3.000000e+00
max      5.000000e+00
Name: len_final, dtype: float64

In [50]:
# Per-area report for the full dataset; train defaults to False, so no
# per-area threshold counts are printed.
describe_area(total)

---------------------- 
 social sciences
Disciplines: ['Psychology', 'Sociology', 'Educational sciences', 'Law', 'Other social sciences', 'Media and communications', 'Economics and business', 'Social and economic geography', 'Political Science']
Count information 
 count         9.000000
mean      54498.444444
std       47016.739009
min       10588.000000
25%       18050.000000
50%       35842.000000
75%       70671.000000
max      130365.000000
Name: ABSTRACT, dtype: float64
Nb labels information 
 count    9.000000
mean     3.047019
std      0.247800
min      2.730286
25%      2.796560
50%      3.029940
75%      3.205651
max      3.397027
Name: len_final, dtype: float64
---------------------- 
 engineering and technology
Disciplines: ['Chemical engineering', 'Environmental engineering', 'Electrical engineering, electronic engineering, information engineering', 'Materials engineering', 'Other engineering and technologies', 'Nano-technology', 'Environmental biotechnology', 'Mechanical 

## Train 100k

In [35]:
# Load the raw 'train100000' training file; prep_train expects it to carry
# 'text' and 'label' columns.
train100k = pd.read_csv('different_size/train100000.csv')

In [37]:
# Parse the labels, add len_final and rename the columns to
# ABSTRACT / final_disciplines; train100k now names the prepared frame.
train100k = prep_train(train100k)

In [39]:
# Summary statistics of the number of labels per row (len_final is set by
# prep_train as the length of the parsed label list).
train100k.len_final.describe()

count    736432.000000
mean          2.761121
std           1.180125
min           1.000000
25%           2.000000
50%           3.000000
75%           4.000000
max          22.000000
Name: len_final, dtype: float64

In [51]:
# Training report: additionally count the disciplines with at least 100000 abstracts.
describe_area(train100k, train=True, limit=100000)

---------------------- 
 social sciences
Disciplines: ['Psychology', 'Educational sciences', 'Sociology', 'Social and economic geography', 'Political Science', 'Other social sciences', 'Economics and business', 'Law', 'Media and communications']
Count information 
 count        9.000000
mean     32703.222222
std      28215.845290
min       6352.000000
25%      10828.000000
50%      21507.000000
75%      42412.000000
max      78235.000000
Name: ABSTRACT, dtype: float64
Nb labels information 
 count    9.000000
mean     3.030681
std      0.244368
min      2.732302
25%      2.784857
50%      2.998898
75%      3.189215
max      3.386410
Name: len_final, dtype: float64
Nb disciplines that get to the limit: 0
---------------------- 
 medical and health sciences
Disciplines: ['Health sciences', 'Basic medicine', 'Clinical medicine', 'Health biotechnology']
Count information 
 count         4.000000
mean      84315.500000
std       57317.425102
min        1023.000000
25%       75234.750000
50%

## Train 10k

In [52]:
# Load the raw 'train10000' training file; prep_train expects it to carry
# 'text' and 'label' columns.
train10k = pd.read_csv('different_size/train10000.csv')

In [53]:
# Parse the labels, add len_final and rename the columns to
# ABSTRACT / final_disciplines; train10k now names the prepared frame.
train10k = prep_train(train10k)

In [54]:
# Summary statistics of the number of labels per row (len_final is set by
# prep_train as the length of the parsed label list).
train10k.len_final.describe()

count    158433.000000
mean          2.910606
std           1.254986
min           1.000000
25%           2.000000
50%           3.000000
75%           4.000000
max          17.000000
Name: len_final, dtype: float64

In [56]:
# Training report: additionally count the disciplines with at least 10000 abstracts.
describe_area(train10k, train=True, limit=10000)

---------------------- 
 humanities
Disciplines: ['History', 'Languages and linguistics', 'Art (arts, history of arts, performing arts, music)', 'Philosophy and ethics', 'Religion', 'Other humanities', 'Literature', 'Archaeology']
Count information 
 count       8.000000
mean     4212.500000
std      2945.159418
min      1500.000000
25%      1885.250000
50%      3023.000000
75%      6404.000000
max      9563.000000
Name: ABSTRACT, dtype: float64
Nb labels information 
 count    8.000000
mean     2.854105
std      0.242007
min      2.556833
25%      2.660722
50%      2.825817
75%      3.088064
max      3.151151
Name: len_final, dtype: float64
Nb disciplines that get to the limit: 0
---------------------- 
 social sciences
Disciplines: ['Educational sciences', 'Sociology', 'Political Science', 'Other social sciences', 'Psychology', 'Social and economic geography', 'Economics and business', 'Law', 'Media and communications']
Count information 
 count        9.000000
mean     11871.777778


## Train 1k

In [58]:
# Load the raw 'train1000' training file; prep_train expects it to carry
# 'text' and 'label' columns.
train1k = pd.read_csv('different_size/train1000.csv')

In [59]:
# Parse the labels, add len_final and rename the columns to
# ABSTRACT / final_disciplines; train1k now names the prepared frame.
train1k = prep_train(train1k)

In [60]:
# Summary statistics of the number of labels per row (len_final is set by
# prep_train as the length of the parsed label list).
train1k.len_final.describe()

count    20027.000000
mean         3.002746
std          1.256141
min          1.000000
25%          2.000000
50%          3.000000
75%          4.000000
max         10.000000
Name: len_final, dtype: float64

In [61]:
# Training report: additionally count the disciplines with at least 1000 abstracts.
describe_area(train1k, train=True, limit=1000)

---------------------- 
 medical and health sciences
Disciplines: ['Clinical medicine', 'Health sciences', 'Basic medicine', 'Health biotechnology']
Count information 
 count       4.000000
mean     1721.750000
std       624.741213
min      1000.000000
25%      1414.000000
50%      1692.500000
75%      2000.250000
max      2502.000000
Name: ABSTRACT, dtype: float64
Nb labels information 
 count    4.000000
mean     3.772768
std      0.165600
min      3.562500
25%      3.679294
50%      3.810113
75%      3.903587
max      3.908347
Name: len_final, dtype: float64
Nb disciplines that get to the limit: 4
---------------------- 
 humanities
Disciplines: ['Philosophy and ethics', 'Art (arts, history of arts, performing arts, music)', 'Other humanities', 'Literature', 'History', 'Religion', 'Languages and linguistics', 'Archaeology']
Count information 
 count       8.000000
mean     1076.000000
std        28.953164
min      1011.000000
25%      1073.250000
50%      1088.500000
75%      1091.7

## Test

In [62]:
# Load the raw test file; prep_train expects it to carry 'text' and 'label'
# columns.
test = pd.read_csv('test_new.csv')

In [63]:
# Same preparation as for the training files: parse the labels, add len_final
# and rename the columns to ABSTRACT / final_disciplines.
test = prep_train(test)

In [64]:
# Summary statistics of the number of labels per row (len_final is set by
# prep_train as the length of the parsed label list).
test.len_final.describe()

count    415262.000000
mean          2.379009
std           1.147415
min           1.000000
25%           1.000000
50%           2.000000
75%           3.000000
max          13.000000
Name: len_final, dtype: float64

In [66]:
# Per-area report for the test set; train defaults to False, so no per-area
# threshold counts are printed.
describe_area(test)

---------------------- 
 natural sciences
Disciplines: ['Computer and information sciences', 'Earth and related environmental sciences', 'Biological sciences', 'Chemical sciences', 'Physical sciences', 'Mathematics']
Count information 
 count         6.000000
mean      59498.000000
std       28896.699597
min       22052.000000
25%       41015.000000
50%       59592.500000
75%       73724.000000
max      102494.000000
Name: ABSTRACT, dtype: float64
Nb labels information 
 count    6.000000
mean     2.859646
std      0.295945
min      2.470116
25%      2.616009
50%      2.961145
75%      3.055365
max      3.177733
Name: len_final, dtype: float64
---------------------- 
 social sciences
Disciplines: ['Other social sciences', 'Economics and business', 'Media and communications', 'Psychology', 'Educational sciences', 'Sociology', 'Social and economic geography', 'Political Science', 'Law']
Count information 
 count        9.000000
mean     10902.777778
std       9406.322817
min       2117.0

## ChatGPT10

In [4]:
# Load the 'data_gpt10' file; its final_disciplines column holds
# ';'-separated label strings that the next cell splits into lists.
chatgpt10 = pd.read_csv('data_gpt10.csv')

In [6]:
# Turn each ';'-separated label string into a list. Values that are already
# lists are kept as they are, so re-running this cell does not raise.
chatgpt10['final_disciplines'] = chatgpt10.final_disciplines.apply(
    lambda labels: labels.split(';') if isinstance(labels, str) else labels
)

In [8]:
# Number of labels attached to each row.
chatgpt10['len_final'] = chatgpt10['final_disciplines'].map(len)

In [11]:
# Summary statistics of the number of labels per row (len_final is the length
# of the split label list).
chatgpt10.len_final.describe()

count    191.000000
mean       3.219895
std        1.144240
min        1.000000
25%        2.000000
50%        3.000000
75%        4.000000
max        5.000000
Name: len_final, dtype: float64

In [12]:
# Per-area report for this sample; train defaults to False, so no per-area
# threshold counts are printed.
describe_area(chatgpt10)

---------------------- 
 natural sciences
Disciplines: ['Mathematics', 'Biological sciences', 'Earth and related environmental sciences', 'Chemical sciences', 'Physical sciences', 'Computer and information sciences']
Count information 
 count     6.000000
mean     20.500000
std       8.312641
min      13.000000
25%      14.750000
50%      17.000000
75%      26.750000
max      32.000000
Name: ABSTRACT, dtype: float64
Nb labels information 
 count    6.000000
mean     3.757345
std      0.483119
min      3.000000
25%      3.560202
50%      3.746606
75%      4.086538
max      4.352941
Name: len_final, dtype: float64
---------------------- 
 engineering and technology
Disciplines: ['Environmental engineering', 'Other engineering and technologies', 'Electrical engineering, electronic engineering, information engineering', 'Environmental biotechnology', 'Civil engineering', 'Chemical engineering', 'Mechanical engineering', 'Materials engineering', 'Medical engineering', 'Industrial Biotechnol

## ChatGPT50

In [4]:
# Load the 'data_gpt50' file; its final_disciplines column holds
# ';'-separated label strings that the next cell splits into lists.
chatgpt50 = pd.read_csv('data_gpt50.csv')

In [5]:
# Turn each ';'-separated label string into a list. Values that are already
# lists are kept as they are, so re-running this cell does not raise.
chatgpt50['final_disciplines'] = chatgpt50.final_disciplines.apply(
    lambda labels: labels.split(';') if isinstance(labels, str) else labels
)

In [6]:
# Number of labels attached to each row.
chatgpt50['len_final'] = chatgpt50['final_disciplines'].map(len)

In [7]:
# Summary statistics of the number of labels per row (len_final is the length
# of the split label list).
chatgpt50.len_final.describe()

count    1569.000000
mean        1.964946
std         0.966622
min         1.000000
25%         1.000000
50%         2.000000
75%         2.000000
max         5.000000
Name: len_final, dtype: float64

In [8]:
# Per-area report for this sample; train defaults to False, so no per-area
# threshold counts are printed.
describe_area(chatgpt50)

---------------------- 
 humanities
Disciplines: ['Other humanities', 'Archaeology', 'History', 'Philosophy and ethics', 'Art (arts, history of arts, performing arts, music)', 'Languages and linguistics', 'Literature', 'Religion']
Count information 
 count     8.000000
mean     52.750000
std       9.207916
min      41.000000
25%      46.500000
50%      52.000000
75%      57.250000
max      70.000000
Name: ABSTRACT, dtype: float64
Nb labels information 
 count    8.000000
mean     2.201953
std      0.138094
min      1.975610
25%      2.137943
50%      2.185266
75%      2.303571
max      2.385965
Name: len_final, dtype: float64
---------------------- 
 engineering and technology
Disciplines: ['Environmental engineering', 'Chemical engineering', 'Mechanical engineering', 'Other engineering and technologies', 'Electrical engineering, electronic engineering, information engineering', 'Materials engineering', 'Civil engineering', 'Nano-technology', 'Environmental biotechnology', 'Medical eng