In [56]:
import pandas as pd
import string, re
from pathlib import Path
from collections import Counter

## Import Web of Science Categories & Groups List

In [57]:
# Read the JCR categories export into a DataFrame, then show its size and first rows.
jcr_categories_csv = Path("../data/JCR_CategoriesResults_04_2024.csv")
categ_df = pd.read_csv(jcr_categories_csv, encoding="utf-8")
print(categ_df.shape)
categ_df.head()

(513, 7)


Unnamed: 0,Category,Group,Edition,# of journals,Citable Items,Total Citations,Median impact factor
0,ACOUSTICS,Physics,SCIE,31,5318,260308,2.3
1,ACOUSTICS,Physics,ESCI,9,415,2675,0.7
2,AGRICULTURAL ECONOMICS & POLICY,Agricultural Sciences;Economics & Business,SCIE,22,1170,57764,3.3
3,AGRICULTURAL ECONOMICS & POLICY,Agricultural Sciences;Economics & Business,ESCI,16,796,7349,1.2
4,AGRICULTURAL ENGINEERING,Agricultural Sciences;Engineering,SCIE,15,4338,306990,2.2


In [58]:
# Preview the last five rows to check the end of the table was read correctly.
categ_df.tail(n=5)

Unnamed: 0,Category,Group,Edition,# of journals,Citable Items,Total Citations,Median impact factor
508,WATER RESOURCES,Engineering;Environment/Ecology;Geosciences,ESCI,28,1820,18516,1.7
509,WOMENS STUDIES,"Social Sciences, General",SSCI,46,2181,93808,1.6
510,WOMENS STUDIES,"Social Sciences, General",ESCI,18,338,2994,0.4
511,ZOOLOGY,Biology & Biochemistry;Environment/Ecology;Pla...,SCIE,177,11489,464470,1.3
512,ZOOLOGY,Biology & Biochemistry;Environment/Ecology;Pla...,ESCI,5,229,2007,0.7


In [59]:
# Convert the "Citable Items" and "Total Citations" columns to int.
# The values are read as strings (presumably because the raw CSV writes them with
# "," thousands separators), so the commas are stripped before casting.
for count_col in ("Citable Items", "Total Citations"):
    categ_df[count_col] = (
        categ_df[count_col]
        # Cast to str first so the cell can be re-run after the column is already int;
        # without this, the .str accessor raises AttributeError on the second run.
        .astype(str)
        .str.replace(",", "", regex=False)
        .astype(int)
    )

In [60]:
# Combine the per-edition rows of each category (SCIE, SSCI, ESCI, ...) into a single row
# by summing the journal and citation counts. Columns not listed in the agg mapping
# (Edition, Median impact factor) are not carried over.
# dropna=False: by default groupby silently discards rows whose key is NaN, so a category
# with a missing Group would vanish here, before the fillna("") applied to Group further
# down could ever see it.
categ_df = categ_df.groupby(["Category", "Group"], dropna=False).agg(
    {"# of journals": "sum", "Citable Items": "sum", "Total Citations": "sum"}
)
print(categ_df.shape)
# convert Category and Group indices back to columns
categ_df = categ_df.reset_index()

(254, 3)


In [61]:
# Write the aggregated per-category table to the data directory.
outputdir = Path("../data")
groups_csv = outputdir / "JCR_CategoriesResults_groups.csv"
categ_df.to_csv(groups_csv, encoding="utf-8")

In [54]:
# Turn each ";"-delimited Group string into a list of whitespace-trimmed group names.
# Missing values become "" first, so they end up as [""] rather than NaN.
categ_df["Group"] = (
    categ_df["Group"]
    .fillna("")
    .str.split(";")
    .apply(lambda names: [name.strip() for name in names])
)

In [55]:
# Write the category table, with Group now holding lists, to the data directory.
outputdir = Path("../data")
grouplists_csv = outputdir / "JCR_CategoriesResults_grouplists.csv"
categ_df.to_csv(grouplists_csv)

In [46]:
# Expand the Group lists so each (category, group) pair gets its own row;
# the other columns are repeated for every group of a category.
categ_df_explode = categ_df.explode(column="Group")
print(categ_df_explode.shape)
categ_df_explode.head()

(448, 5)


Unnamed: 0,Category,Group,# of journals,Citable Items,Total Citations
0,ACOUSTICS,Physics,40,5733,262983
1,AGRICULTURAL ECONOMICS & POLICY,Agricultural Sciences,38,1966,65113
1,AGRICULTURAL ECONOMICS & POLICY,Economics & Business,38,1966,65113
2,AGRICULTURAL ENGINEERING,Agricultural Sciences,18,4611,308328
2,AGRICULTURAL ENGINEERING,Engineering,18,4611,308328


In [47]:
# Count the exploded rows per category, i.e. how many groups each category is listed under.
categ_df_explode.loc[:, "Category"].value_counts()

Category
HISTORY & PHILOSOPHY OF SCIENCE              4
AREA STUDIES                                 4
COMMUNICATION                                4
AGRICULTURE, MULTIDISCIPLINARY               4
AGRONOMY                                     4
                                            ..
LITERATURE                                   1
LITERATURE, AFRICAN, AUSTRALIAN, CANADIAN    1
LITERATURE, AMERICAN                         1
LITERATURE, BRITISH ISLES                    1
LITERARY THEORY & CRITICISM                  1
Name: count, Length: 254, dtype: int64

In [48]:
# Count the exploded rows per group, i.e. how many categories are listed under each group.
categ_df_explode.loc[:, "Group"].value_counts()

Group
Clinical Medicine                       59
Social Sciences, General                41
Engineering                             41
Multidisciplinary                       36
Physics                                 34
Biology & Biochemistry                  34
Economics & Business                    21
Chemistry                               21
Plant & Animal Science                  17
Materials Science                       17
Literature & Language                   17
Psychiatry/Psychology                   16
Computer Science                        14
Geosciences                             14
Environment/Ecology                     13
Mathematics                             12
Visual & Performing Arts                10
History & Archaeology                    9
Arts & Humanities, Interdisciplinary     8
Agricultural Sciences                    7
Philosophy & Religion                    7
Name: count, dtype: int64

### Things to do

1. if I apply this to a WoS dataset it could get a little complicated:
    + each WoS paper can have multiple WC categories
    + each WC category could have multiple groups
    + thus a paper with 3 WC categories could be exploded out into 8 different groups (although several could be duplicates that would need to be re-aggregated)
2. possible procedure:
    + explode WoS documents dataset by WC category
    + map list of WoS groups to each WC category in exploded WoS dataset
    + explode the WoS groups
    + remove duplicate WoS groups for one paper

