In [1]:
import os

import numpy as np
import pandas as pd
# Let wide WoS exports (several dozen columns) be displayed without truncation
pd.set_option("display.max_columns", 500)

### Functions to concatenate the batches of at most 500 metadata records downloaded from WoS <br> (from the *All databases* and *Core collection* searches, respectively)
Argument *n* is the total number of batches downloaded into each folder

In [2]:
def fusion_tot(n, mainpath="./input/WOS/All_databases/savedrecs"):
    """Concatenate the WoS *All databases* export batches into one DataFrame.

    The batches are tab-separated text files named ``<mainpath>0.txt``,
    ``<mainpath>1.txt``, ... ``<mainpath>{n-1}.txt``.

    Parameters
    ----------
    n : int
        Number of batch files to read (numbered from 0 to ``n - 1``).
    mainpath : str, optional
        Path prefix shared by every batch file, i.e. everything that comes
        before the batch number. Defaults to the *All databases* folder.

    Returns
    -------
    pandas.DataFrame
        The rows of all batches stacked in batch order. Columns are the union
        of the batch columns, in order of first appearance. Each batch keeps
        its own row labels, so the index is not unique across batches.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1 (there is nothing to concatenate).
    """
    if n < 1:
        raise ValueError("n must be at least 1 (got %r): no batch file to read" % (n,))

    # index_col=False keeps the first field as a regular column instead of
    # letting pandas promote it to the row index.
    frames = [
        pd.read_table(mainpath + str(k) + ".txt", sep='\t', header=0, index_col=False)
        for k in range(n)
    ]
    return pd.concat(frames, sort=False)

In [3]:
def fusion_core(n, mainpath="./input/WOS/Core_collection/savedrecs"):
    """Concatenate the WoS *Core collection* export batches into one DataFrame.

    The batches are tab-separated text files named ``<mainpath>0.txt``,
    ``<mainpath>1.txt``, ... ``<mainpath>{n-1}.txt``.

    Parameters
    ----------
    n : int
        Number of batch files to read (numbered from 0 to ``n - 1``).
    mainpath : str, optional
        Path prefix shared by every batch file, i.e. everything that comes
        before the batch number. Defaults to the *Core collection* folder.

    Returns
    -------
    pandas.DataFrame
        The rows of all batches stacked in batch order. Columns are the union
        of the batch columns, in order of first appearance. Each batch keeps
        its own row labels, so the index is not unique across batches.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1 (there is nothing to concatenate).
    """
    if n < 1:
        raise ValueError("n must be at least 1 (got %r): no batch file to read" % (n,))

    # index_col=False keeps the first field as a regular column instead of
    # letting pandas promote it to the row index.
    frames = [
        pd.read_table(mainpath + str(k) + ".txt", sep='\t', header=0, index_col=False)
        for k in range(n)
    ]
    return pd.concat(frames, sort=False)

### Read the database listing all WoS publications

In [4]:
# Stack the 8 batches exported from the "All databases" search
df = fusion_tot(n=8)
# df.shape is (rows, columns), which is exactly what the message expects
print("%d X %d dataframe" % df.shape)

3800 X 58 dataframe


### Add a column for author keywords in the database listing all WoS publications

In [5]:
# Create an empty DE column (renamed to 'author_keywords' further down).
# Presumably needed because DataFrame.update only fills columns that already
# exist in the frame being updated — confirm against the pandas docs.
df = df.assign(DE=np.nan)

### Eliminate duplicates based on WoS ID (*UT*)

In [6]:
# One row per WoS identifier: for every UT keep the first non-null value of each column.
# NOTE(review): the result is stored under its own name and `df` itself is left
# untouched by this cell, so it only acts as a duplicate check unless
# df_key_null is used further on — confirm the intent.
df_key_null = (
    df.groupby("UT")
    .first()
    .reset_index()
)
print("%d X %d dataframe" % df_key_null.shape)

3800 X 59 dataframe


### Read the specific databases 

In [7]:
# Stack the 8 batches exported from the "Core collection" search
df1 = fusion_core(n=8)
# df1.shape is (rows, columns), which is exactly what the message expects
print("%d X %d dataframe" % df1.shape)

3666 X 67 dataframe


In [8]:
# For each of the other WoS databases only the first batch file (savedrecs0.txt)
# is read; the files are parsed with the same options as the multi-batch exports.
df2, df3, df4, df5, df6 = [
    pd.read_table(batch_file, sep='\t', header=0, index_col=False)
    for batch_file in (
        "./input/WOS/Current_contents_connect/savedrecs0.txt",  # -> df2
        "./input/WOS/KCI-Korean/savedrecs0.txt",                # -> df3
        "./input/WOS/MEDLINE/savedrecs0.txt",                   # -> df4
        "./input/WOS/Russian_Science/savedrecs0.txt",           # -> df5
        "./input/WOS/SciELO/savedrecs0.txt",                    # -> df6
    )
]

### Index all databases with the WoS ID and update the database listing all publications with information in specific databases

In [9]:
# Use the WoS identifier (UT) as row index of every frame so that the
# update step can align records across databases.
# Not re-runnable as is: once UT is the index it is no longer a column.
for frame in (df, df1, df2, df3, df4, df5, df6):
    frame.set_index("UT", inplace=True)

In [10]:
# Overwrite values of the global table, in place, with the non-missing values
# found in each specific database. The order matters: a later frame wins over
# an earlier one when both provide a value for the same record and column.
for specific in (df1, df2, df3, df4, df5, df6):
    df.update(specific)

In [11]:
# Turn the UT index back into an ordinary (first) column
df = df.reset_index(level=0)

In [12]:
# Show the first record only, to inspect the available WoS field tags
df.iloc[:1]

Unnamed: 0,UT,PT,AU,BA,CA,GP,RI,OI,BE,Z2,TI,X1,Y1,Z1,FT,PN,AE,Z3,SO,S1,SE,BS,VL,IS,SI,MA,BP,EP,AR,DI,D2,EA,SU,PD,PY,AB,X4,Y4,Z4,AK,CT,CY,SP,CL,TC,Z8,ZR,ZA,ZB,ZS,Z9,U1,U2,SN,EI,BN,PM,Unnamed: 57,DE
0,MEDLINE:32843822,J,"Brodny, Jaroslaw; Tutak, Magdalena",,,,,,,,The analysis of similarities between the Europ...,,,,,,,,Journal of cleaner production,,,,279,,,,123641,123641,,10.1016/j.jclepro.2020.123641,,,,2021 Jan 10 (Epub 2020 Aug 13),2021.0,© 2020 Elsevier Ltd. All rights reserved.Based...,,,,,,,,,0.0,0,0,0.0,0,0,0.0,36.0,36.0,0959-6526,,,32843822.0,,


### Rename and select columns of interest

In [13]:
# WoS two-letter field tags -> readable column names
wos_field_names = {
    'UT': 'WOS_number',
    'TI': 'title',
    'PT': 'doc_type',
    'AU': 'authors',
    'SO': 'source',
    'DI': 'doi',
    'PY': 'publication_year',
    'AB': 'abstract',
    'DE': 'author_keywords',
}
df = df.rename(columns=wos_field_names)

In [14]:
# Columns kept in the exported table, in output order.
# NOTE(review): no visible cell creates an 'email' column, so reindex adds it
# filled with NaN (see the empty column in the preview) — confirm which WoS
# field was meant to feed it.
col = [
    'WOS_number',
    'doc_type',
    'authors',
    'title',
    'source',
    'doi',
    'publication_year',
    'abstract',
    'author_keywords',
    'email',
]
df = df.reindex(col, axis="columns")
df.head(5)

Unnamed: 0,WOS_number,doc_type,authors,title,source,doi,publication_year,abstract,author_keywords,email
0,MEDLINE:32843822,J,"Brodny, Jaroslaw; Tutak, Magdalena",The analysis of similarities between the Europ...,Journal of cleaner production,10.1016/j.jclepro.2020.123641,2021.0,© 2020 Elsevier Ltd. All rights reserved.Based...,,
1,MEDLINE:32798870,J,"Mi, Chenxi; Shatwell, Tom; Ma, Jun; Xu, Yaqian...",Ensemble warming projections in Germany's larg...,The Science of the total environment,10.1016/j.scitotenv.2020.141366,2020.0,Copyright © 2020 The Authors. Published by Els...,,
2,MEDLINE:32900543,J,"Gomez-Sanabria, Adriana; Zusman, Eric; Hoglund...",Sustainable wastewater management in Indonesia...,Journal of environmental management,10.1016/j.jenvman.2020.111241,2020.0,Copyright © 2020 The Authors. Published by Els...,,
3,MEDLINE:33027737,J,"Kuylenstierna, Johan C I; Heaps, Charles G; Ah...",Development of the Low Emissions Analysis Plat...,Environment international,10.1016/j.envint.2020.106155,2020.0,Copyright © 2020 The Authors. Published by Els...,,
4,WOS:000571967500005,J,"van der Salm, C; Voogt, W; Beerling, E; van Ru...",Minimising emissions to water bodies from NW E...,AGRICULTURAL WATER MANAGEMENT,10.1016/j.agwat.2020.106398,2020.0,In large parts of the Netherlands surface wate...,drainage; DSS model; fertigation; hydroponics;...,


In [15]:
# Size of the final table (rows X columns)
print("%d X %d dataframe" % df.shape)

3800 X 10 dataframe


In [16]:
# Make sure the output folder exists so that a run from a fresh checkout
# does not fail on a missing directory.
os.makedirs("./intermed", exist_ok=True)
# Save the harmonised WoS table without the row index
df.to_csv("./intermed/df_WOS.csv", index=False)