In [1]:
import joblib
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load pickles that come from a trusted source.
df_WOS = joblib.load("df_WOS.pkl")
print("Loaded %d X %d  WOS dataframe" % df_WOS.shape)  # shape is (rows, columns)
df_Scopus = joblib.load("df_Scopus.pkl")
print("Loaded %d X %d  Scopus dataframe" % df_Scopus.shape)

Loaded 3811 X 10  WOS dataframe
Loaded 4129 X 9  Scopus dataframe


In [2]:
import pandas as pd
import string

## Processing

In [3]:
# Stack the Scopus rows on top of the WOS rows; sort=False leaves the
# columns in order of appearance instead of sorting them.
df = pd.concat(objs=[df_Scopus, df_WOS], sort=False)

### Df without dupli by doi
We start by matching WoS and Scopus publications on their DOI, as it is a unique identifier.

In [4]:
# Lower-cased copy of the DOI, used below as a case-insensitive grouping key.
df = df.assign(low_doi=df['doi'].str.lower())

In [5]:
# Records with no DOI cannot be grouped on it, so they are set aside here.
df_wo_doi = df.loc[df['doi'].isna()]
print('There are %d papers without doi' % df_wo_doi.shape[0])

There are 753 papers without doi


In [6]:
# Records that do carry a DOI.
df_with_doi = df.loc[df['doi'].notna()]
# One row per lower-cased DOI. GroupBy.first() takes, for each column, the
# first non-null value found in the group.
df_without_dupli_doi = (
    df_with_doi
    .groupby(['low_doi'])
    .first()
    .reset_index()
)
print('There are %d different papers with doi' % df_without_dupli_doi.shape[0])

There are 4175 different papers with doi


In [7]:
# Re-assemble the working frame: DOI-less records plus one record per distinct DOI.
df = pd.concat(objs=[df_wo_doi, df_without_dupli_doi], sort=False)
print('There are %d papers after removing duplicates based on doi' % df.shape[0])

There are 4928 papers after removing duplicates based on doi


### Df without dupli by title
We normalise the title by
* converting it to lowercase
* replacing punctuation with white space
* collapsing runs of white space into a single space

In [8]:
# Characters treated as punctuation: ASCII punctuation plus a few typographic
# symbols. Kept in a local string so that the shared `string.punctuation`
# constant is not modified for the rest of the kernel (and does not grow each
# time this cell is re-run).
punctuation_chars = string.punctuation + '’—☆–'
# Translation table mapping every punctuation character to a space; it is
# reused later for the abstracts.
table = str.maketrans(punctuation_chars, ' ' * len(punctuation_chars))
# Normalise each title in one pass: lower-case, punctuation -> space, then
# whitespace runs -> single space. The list is assigned positionally, so the
# result does not depend on index alignment (the index may hold duplicate
# labels after the concatenation above).
df['prepro_title'] = [
    ' '.join(title.translate(table).split())
    for title in df['title'].str.lower()
]

We sort the dataframe by 'prepro_title' and then by DOI so that, when there are duplicates (e.g. a published paper and a conference paper), the record with a DOI comes first.

In [9]:
# Order by normalised title, then DOI. Missing values sort last by default,
# so within a title the records that have a DOI come first.
df = df.sort_values(by=['prepro_title', 'doi'], ascending=[True, True])

In [10]:
# One row per normalised title; first() keeps, for every column, the first
# non-null value in the group.
df = (
    df
    .groupby(['prepro_title'])
    .first()
    .reset_index()
)
print('There are %d papers after removing duplicates based on low title' % df.shape[0])

There are 4752 papers after removing duplicates based on low title


### Keep papers with abstract

In [11]:
# Drop the records whose abstract is missing.
df = df.loc[df['abstract'].notna()]
print('There are %d papers with abstract' % df.shape[0])

There are 4740 papers with abstract


### Group by abstract

In [12]:
# Sort so that, within identical abstracts, rows with a DOI precede rows
# without one (missing values sort last), then keep one row per abstract.
df = df.sort_values(by=['abstract', 'doi'], ascending=[True, True])
df = (
    df
    .groupby(['abstract'])
    .first()
    .reset_index()
)
print('There are %d papers after removing duplicates based on abstract' % df.shape[0])

There are 4733 papers after removing duplicates based on abstract


### Group by beginning of abstract

In [13]:
# Normalise every abstract the same way as the titles: lower-case, punctuation
# replaced by spaces (via `table`), then whitespace runs squeezed to one space.
normalised_abstracts = []
for lowered_abstract in df['abstract'].str.lower():
    spaced_abstract = lowered_abstract.translate(table)
    normalised_abstracts.append(' '.join(spaced_abstract.split()))
df['prepro_abs'] = pd.Series(normalised_abstracts, index=df.index)

In [14]:
# Number of leading characters of the normalised abstract used as the
# duplicate key. Defined once so the slice and the message cannot drift apart.
ABSTRACT_PREFIX_LEN = 250
df['beg_abs'] = df['prepro_abs'].str[:ABSTRACT_PREFIX_LEN]
# Within an identical prefix, rows with a DOI come first (missing values sort
# last); first() then keeps the first non-null value of each column.
df = df.sort_values(by=['beg_abs', 'doi'], ascending=[True, True])
df = df.groupby(['beg_abs']).first().reset_index()

print('There are %d papers after removing duplicates based on first %d characters of abstract'
      % (len(df), ABSTRACT_PREFIX_LEN))

There are 4714 papers after removing duplicates based on first 250 characters of abstract


### Sorting dataframe

In [15]:
# Most recent publication year first, titles in ascending order within a year.
df = df.sort_values(by=['publication_year', 'title'], ascending=[False, True])
# Single document-type column: docu_type where it is set, otherwise doc_type.
df['d_type'] = df['docu_type'].combine_first(df['doc_type'])
# Columns kept in the final frame, in output order; reindex drops all others.
col = [
    'title',
    'authors',
    'source',
    'doi',
    'd_type',
    'abstract',
    'publication_year',
    'horizon_year',
    'author_keywords',
    'scopus_number',
    'WOS_number',
    'prepro_title',
]
df = df.reindex(columns=col)

In [16]:
# df.shape is (number of rows, number of columns).
print("%d X %d dataframe" % df.shape)

4714 X 12 dataframe


### Counting papers from Scopus and WOS

In [17]:
# A record is counted for a database when its identifier column for that
# database is non-null.
in_scopus = df['scopus_number'].notna()
in_wos = df['WOS_number'].notna()

df_only_scop = df.loc[in_scopus & ~in_wos].copy()
print("%d papers only on Scopus" % df_only_scop.shape[0])

df_only_WOS = df.loc[~in_scopus & in_wos].copy()
print("%d papers only on WOS" % df_only_WOS.shape[0])

df_both = df.loc[in_scopus & in_wos].copy()
print("%d papers on both Scopus and WOS" % df_both.shape[0])

932 papers only on Scopus
629 papers only on WOS
3153 papers on both Scopus and WOS


### Storing dataframe

In [18]:
# Write the merged, de-duplicated dataframe to an Excel file (relative path,
# so it lands in the current working directory).
df.to_excel("fusion_scop_wos.xlsx")

In [19]:
# Persist the same dataframe as a joblib pickle; dump() returns the list of
# file names written, which is what the cell displays.
joblib.dump(df, "df_for_analysis.pkl")

['df_for_analysis.pkl']