# Importing text files exported from Web of Science Database and Compiling them into a single dataset (and saving as a .csv)

## 1. Import required packages

In [21]:
from pathlib import Path   #for working with filepaths
import pandas as pd        #for creating and working with dataframes

## 2. Set path to directory containing WoS Text Files

In [22]:
#check your current working directory
Path.cwd()

WindowsPath('c:/Users/F0040RP/Documents/DartLib_RDS/projects/bibliometric-analysis/WoS')

In [23]:
#pathdir = Path.cwd()                        #use this if tsv .txt files are in current working directory
pathdir = Path("resilience/.txt files")                 #use this to set relative path to .txt files if they are not in cwd

#pathdir = Path("WoS/resilience")         
pathlist = sorted(pathdir.glob('*.txt'))    
[path.name for path in pathlist]

['resilience10001-10500.txt',
 'resilience1001-1500.txt',
 'resilience10501-11000.txt',
 'resilience11001-11500.txt',
 'resilience11501-12000.txt',
 'resilience12001-12500.txt',
 'resilience12501-13000.txt',
 'resilience13001-13500.txt',
 'resilience13501-14000.txt',
 'resilience14001-14500.txt',
 'resilience14501-15000.txt',
 'resilience15001-15500.txt',
 'resilience1501-2000.txt',
 'resilience15501-16000.txt',
 'resilience16001-16500.txt',
 'resilience16501-17000.txt',
 'resilience17001-17500.txt',
 'resilience17501-18000.txt',
 'resilience18001-18500.txt',
 'resilience18501-19000.txt',
 'resilience19001-19500.txt',
 'resilience19501-20000.txt',
 'resilience1_500.txt',
 'resilience20001-20500.txt',
 'resilience2001-2500.txt',
 'resilience20501-21000.txt',
 'resilience21001-21500.txt',
 'resilience21501-22000.txt',
 'resilience22001-22500.txt',
 'resilience22501-23000.txt',
 'resilience23001-23500.txt',
 'resilience23501-24000.txt',
 'resilience24001-24500.txt',
 'resilience24501-2500

## 3. Read in tab-separated-value text files and combine into one dataframe

In [24]:
datalist = []                           #creates an empty list
for i, path in enumerate(pathlist):
    df = pd.read_csv(path, sep = "\t")
    print("Reading & appending file number:", i, "(with %s rows) of" %df.shape[0], len(pathlist), "total files. Pathname: ", path.stem)
    datalist.append(df)    ## appends each imported dataframe into a list of dataframes
    
data = pd.concat(datalist)        #concatenates or joins each dataframe in datalist into one dataframe

Reading & appending file number: 0 (with 500 rows) of 200 total files. Pathname:  resilience10001-10500
Reading & appending file number: 1 (with 500 rows) of 200 total files. Pathname:  resilience1001-1500
Reading & appending file number: 2 (with 500 rows) of 200 total files. Pathname:  resilience10501-11000
Reading & appending file number: 3 (with 500 rows) of 200 total files. Pathname:  resilience11001-11500
Reading & appending file number: 4 (with 500 rows) of 200 total files. Pathname:  resilience11501-12000
Reading & appending file number: 5 (with 500 rows) of 200 total files. Pathname:  resilience12001-12500
Reading & appending file number: 6 (with 500 rows) of 200 total files. Pathname:  resilience12501-13000
Reading & appending file number: 7 (with 500 rows) of 200 total files. Pathname:  resilience13001-13500
Reading & appending file number: 8 (with 500 rows) of 200 total files. Pathname:  resilience13501-14000
Reading & appending file number: 9 (with 500 rows) of 200 total fi

## 4. Get summary information for the new dataset

In [25]:
print(data.shape)
data.head(3)

(99758, 71)


Unnamed: 0,PT,AU,BA,BE,GP,AF,BF,CA,TI,SO,...,WC,WE,SC,GA,PM,OA,HC,HP,DA,UT
0,J,"Sun, XY; Dai, XY; Yang, TS; Song, HT; Yang, JL...",,,,"Sun, Xinyang; Dai, Xuyan; Yang, Tingshu; Song,...",,,Effects of mental resilience on neuroendocrine...,ENDOCRINE,...,Endocrinology & Metabolism,Science Citation Index Expanded (SCI-EXPANDED),Endocrinology & Metabolism,AU1UG,24633577.0,,,,2023-08-05,WOS:000345404900029
1,J,"Ladanyi, A; Cinkler, T",,,,"Ladanyi, Akos; Cinkler, Tibor",,,Resilience-throughput-power trade-off in futur...,PHOTONIC NETWORK COMMUNICATIONS,...,"Computer Science, Information Systems; Optics;...",Science Citation Index Expanded (SCI-EXPANDED),Computer Science; Optics; Telecommunications,HX8BE,,hybrid,,,2023-08-05,WOS:000467630000004
2,J,"Schwalm, FD; Zandavalli, RB; de Castro, ED; Lu...",,,,"Schwalm, Fabio Duarte; Zandavalli, Rafaela Bru...",,,Is there a relationship between spirituality/r...,JOURNAL OF HEALTH PSYCHOLOGY,...,"Psychology, Clinical",Social Science Citation Index (SSCI),Psychology,0F9KM,33499688.0,,,,2023-08-05,WOS:000628934600001


In [26]:
data.info()
# can also try:
## data.describe()
## data.tail()

<class 'pandas.core.frame.DataFrame'>
Index: 99758 entries, 0 to 499
Data columns (total 71 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   PT      99758 non-null  object 
 1   AU      99604 non-null  object 
 2   BA      943 non-null    object 
 3   BE      8570 non-null   object 
 4   GP      5412 non-null   object 
 5   AF      99604 non-null  object 
 6   BF      943 non-null    object 
 7   CA      375 non-null    object 
 8   TI      99758 non-null  object 
 9   SO      99758 non-null  object 
 10  SE      9335 non-null   object 
 11  BS      37 non-null     object 
 12  LA      99256 non-null  object 
 13  DT      99744 non-null  object 
 14  CT      13655 non-null  object 
 15  CY      13653 non-null  object 
 16  CL      13653 non-null  object 
 17  SP      10597 non-null  object 
 18  HO      1965 non-null   object 
 19  DE      75498 non-null  object 
 20  ID      72131 non-null  object 
 21  AB      88773 non-null  object 
 22  C1   

## 5. Export full dataframe into a .csv file

Skip to step #6 if you only want to export a subsetted version of the dataset.

In [27]:
outputdir = Path("data")
data.to_csv(Path(outputdir,"merged_wos_files.csv"), encoding = "utf-8")

## 6. Subset dataframe and then export

Often, the Web of Science database provides more data fields than we need. We can work with a smaller version of the dataset by only keeping those columns we really want. 

In [28]:
data.columns

Index(['PT', 'AU', 'BA', 'BE', 'GP', 'AF', 'BF', 'CA', 'TI', 'SO', 'SE', 'BS',
       'LA', 'DT', 'CT', 'CY', 'CL', 'SP', 'HO', 'DE', 'ID', 'AB', 'C1', 'C3',
       'RP', 'EM', 'RI', 'OI', 'FU', 'FP', 'FX', 'CR', 'NR', 'TC', 'Z9', 'U1',
       'U2', 'PU', 'PI', 'PA', 'SN', 'EI', 'BN', 'J9', 'JI', 'PD', 'PY', 'VL',
       'IS', 'PN', 'SU', 'SI', 'MA', 'BP', 'EP', 'AR', 'DI', 'DL', 'D2', 'EA',
       'PG', 'WC', 'WE', 'SC', 'GA', 'PM', 'OA', 'HC', 'HP', 'DA', 'UT'],
      dtype='object')

Review the data fields (columns in this case) available for Web of Science data. You can see a [full List of WoS data fields here.](https://docs.google.com/spreadsheets/d/1KPNVIrhwZJrqYOsu3jzpRF7pzHCjRVy6K8eu3qTu-cA/edit#gid=1397269035) 

Then, choose which columns you would like to keep by placing the two-letter column name in the list below.

*Place this information somewhere public so I don't have to link to an institutional drive!!*

In [29]:
cols_to_keep = ["PT", #pub type
                "AU", "AF", #author / author full names
                "TI",  #author title
                "SO",  #source title
                "LA",  #language
                "DT",   #document type
                "DE", "ID",  #author keywords / keywords plus
                "AB",      #abstract
                "RI", "OI",  #research ids / ORCIDs
                "CR", #cited references
                "TC", "Z9", "U1", "U2", #times cited (WoS core) / times cited, all / 180 days usage ct / since 2013 usage count
                "HC", "HP", #highly cited status / hot paper status
                "PU",  #publisher
                "SN", "EI", "BN", "DI", "UT",   #ISSN / eISSN / ISBN / DOI / WoS id
                "JI", #journal ISO abbreviation
                "PD", "PY",   #pub data / pub year
                "WC", "SC"  #WoS Categories / Research Areas
                ]

Next, we can create a new dataframe ("subdata") with only the columns we want.

In [30]:
subdata = data.loc[:, cols_to_keep]
print(subdata.shape)   #print the new dimensions of the dataframe
subdata.head(2)

(99758, 30)


Unnamed: 0,PT,AU,AF,TI,SO,LA,DT,DE,ID,AB,...,SN,EI,BN,DI,UT,JI,PD,PY,WC,SC
0,J,"Sun, XY; Dai, XY; Yang, TS; Song, HT; Yang, JL...","Sun, Xinyang; Dai, Xuyan; Yang, Tingshu; Song,...",Effects of mental resilience on neuroendocrine...,ENDOCRINE,English,Article,Mental resilience; Sleep deprivation; Rennin; ...,CORTISOL,The aim of this study was to investigate the e...,...,1355-008X,1559-0100,,10.1007/s12020-014-0228-8,WOS:000345404900029,Endocrine,DEC,2014,Endocrinology & Metabolism,Endocrinology & Metabolism
1,J,"Ladanyi, A; Cinkler, T","Ladanyi, Akos; Cinkler, Tibor",Resilience-throughput-power trade-off in futur...,PHOTONIC NETWORK COMMUNICATIONS,English,Article,5G; Resilience; Availability; Fixed-mobile con...,WIRELESS,5G New Radio allows operators to use new and w...,...,1387-974X,1572-8188,,10.1007/s11107-019-00842-2,WOS:000467630000004,Photonic Netw. Commun.,JUN,2019,"Computer Science, Information Systems; Optics;...",Computer Science; Optics; Telecommunications


Export the dataframe to a csv.

In [31]:
subdata.to_csv(Path(outputdir, "merged-wos_subcols.csv"), encoding = 'utf-8')

## 7. Create a random sample and export

In [20]:
rand1000 = subdata.sample(n = 1000)
rand1000.to_csv(Path(outputdir, "merged_wos_rand1000.csv"), encoding = 'utf-8')