In [1]:
import joblib
(df_WOS) = joblib.load("df_WOS.pkl" )
print( "Loaded %d X %d  WOS dataframe" % (len(df_WOS), len(df_WOS.columns) ))
(df_Scopus) = joblib.load("df_Scopus.pkl" )
print( "Loaded %d X %d  Scopus dataframe" % (len(df_Scopus), len(df_Scopus.columns) ))

Loaded 3800 X 10  WOS dataframe
Loaded 4181 X 9  Scopus dataframe


In [2]:
import pandas as pd
import string
import re
import numpy as np

## Processing

In [3]:
df = pd.concat([df_Scopus, df_WOS], sort = False)

### Df without dupli by doi
We start by matching WoS and Scopus publications based on the DOI as it is an unique identifier.

In [4]:
df['low_doi'] = df['doi'].str.lower()

In [5]:
df_wo_doi = df[df.doi.isnull()]
print('There are %d papers without doi' % len(df_wo_doi))

There are 755 papers without doi


In [6]:
df_with_doi = df[df.doi.notnull()]
df_without_dupli_doi = df_with_doi.groupby(['low_doi']).first().reset_index()
print('There are %d different papers with doi' % len(df_without_dupli_doi))

There are 4189 different papers with doi


In [7]:
df = pd.concat([df_wo_doi, df_without_dupli_doi], sort = False)
print('There are %d papers after removing duplicates based on doi' % len(df))

There are 4944 papers after removing duplicates based on doi


### Df without dupli by title
We transform the title by 
* using lower characters
* deleting words in [] if it isn't the case for all the title
* (i) removing punctuation and replacing it be nothing
* (ii) replacing punctuation by one white space & then replacing double white spaces by one white space  

In [8]:
string.punctuation =  string.punctuation + '’—☆–−()()'
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~’—☆–−()()'

First, we remove all punctuation and group duplicates by title without punctuation

Example of duplicates identified : 
* The effect of Estonian electricity production scenarios on CO(2) and SO(2) emissions in 2000-2030
* The effect of Estonian electricity production scenarios on CO2 and SO2 emissions in 2000-2030

In [9]:
df['title_red'] = pd.Series([re.sub("[\[\]].*[\[\]]", "", title) for title in df.title.values],index = df.index)
df['title_red'] = np.where(df['title_red'].str.len()<=1,df['title'],df['title_red'])

In [10]:
table_remove_punct = str.maketrans('','', string.punctuation)

In [11]:
titles = df['title_red'].str.lower().tolist()
titles_wo_punct = [title.translate(table_remove_punct) for title in titles]
df['prepro_title_wo_punct'] = pd.Series(titles_wo_punct,index = df.index)

We order the dataframe by 'prepro_title_wo_punct' and doi in order to keep papers with doi when there are duplicates (ex : published paper and conference paper)

In [12]:
df.sort_values(by = ['prepro_title_wo_punct','doi'], ascending = [True,True], inplace = True)

In [13]:
df = df.groupby(['prepro_title_wo_punct']).first().reset_index()
print('There are %d papers after removing duplicates based on low title without punctuation' % len(df))

There are 4769 papers after removing duplicates based on low title without punctuation


Then we remove punctuation and replace it by on white space and then replace double white by one white space

Example of duplicates identified : 
* Future Japan Power Generation Sector by Introducing Hydrogen Plant with 80% CO2 Emission Reduction Target : A Preliminary Analysis
* Future Japan power generation sector by introducing hydrogen plant with 80% CO2 emission reduction target: A preliminary analysis

In [14]:
titles = df['title_red'].str.lower().tolist()
table_remove_punct_white_space = str.maketrans(string.punctuation, ' '*len(string.punctuation))

In [15]:
titles_wo_punct_white_space = [title.translate(table_remove_punct_white_space) for title in titles]
titles_wo_punct_one_white_space = [' '.join(title.split()) for title in titles_wo_punct_white_space]
df['prepro_title'] = pd.Series(titles_wo_punct_one_white_space,index = df.index)

In [16]:
df.sort_values(by = ['prepro_title','doi'], ascending = [True,True], inplace = True)

In [17]:
df = df.groupby(['prepro_title']).first().reset_index()
print('There are %d papers after removing duplicates based on low title with punctuation replaced by white space' % len(df))

There are 4755 papers after removing duplicates based on low title with punctuation replaced by white space


### Keep papers with abstract

In [18]:
df = df.loc[df.abstract.notnull()]
print('There are %d papers with abstract' % len(df))

There are 4743 papers with abstract


### Group by abstract

In [19]:
df.sort_values(by = ['abstract','doi'], ascending = [True,True], inplace = True)
df = df.groupby(['abstract']).first().reset_index()
print('There are %d papers after removing duplicates based on abstract' % (len(df)))

There are 4737 papers after removing duplicates based on abstract


### Group by beginning of abstract

In [20]:
abstracts = df['abstract'].str.lower().tolist()
abstracts_wo_punct = [abstract.translate(table_remove_punct_white_space) for abstract in abstracts]
abstracts_wo_punct_one_white_space = [' '.join(abstract.split()) for abstract in abstracts_wo_punct]
df['prepro_abs'] = pd.Series(abstracts_wo_punct_one_white_space, index = df.index)

In [21]:
df['beg_abs'] = df['prepro_abs'].str[:250]
df.sort_values(by = ['beg_abs','doi'], ascending = [True,True], inplace = True)
df = df.groupby(['beg_abs']).first().reset_index()

print('There are %d papers after removing duplicates based on first 250 characters of abstract' % (len(df)))

There are 4723 papers after removing duplicates based on first 250 characters of abstract


In [22]:
df['beg_abs_200'] = df['prepro_abs'].str[:200]

In [23]:
df_deb_first = df.loc[df.beg_abs_200.duplicated(keep='first')]
#df_deb_first.to_excel('deb.xlsx')
df_deb_first

Unnamed: 0,beg_abs,abstract,prepro_title,prepro_title_wo_punct,scopus_number,title,authors,source,doi,publication_year,author_keywords,docu_type,WOS_number,doc_type,email,low_doi,title_red,prepro_abs,beg_abs_200
2194,objective the aim of the present article is to...,Objective: The aim of the present article is t...,prospects of carbon capture and storage ccs in...,prospects of carbon capture and storage ccs in...,2-s2.0-84890938806,Prospects of carbon capture and storage (CCS) ...,"Viebahn P., Vallentin D., Höller S.",Applied Energy,10.1016/j.apenergy.2013.11.054,2014.0,CCS; India; Integrated assessment; Power sector,Article,WOS:000331675800006,J,,10.1016/j.apenergy.2013.11.054,Prospects of carbon capture and storage (CCS) ...,objective the aim of the present article is to...,objective the aim of the present article is to...
2294,policies to mitigate greenhouse gas ghg emissi...,Policies to mitigate greenhouse gas (GHG) emis...,co benefits of global and regional greenhouse ...,cobenefits of global and regional greenhouse g...,2-s2.0-85042813058,Co-benefits of global and regional greenhouse ...,"Zhang Y., Bowden J.H., Adelman Z., Naik V., Ho...",Atmospheric Chemistry and Physics Discussions,10.5194/acp-2015-1054,2016.0,,Article,,,,10.5194/acp-2015-1054,Co-benefits of global and regional greenhouse ...,policies to mitigate greenhouse gas ghg emissi...,policies to mitigate greenhouse gas ghg emissi...
3442,the potential role of photovoltaic technologie...,The potential role of photovoltaic technologie...,projected photovoltaic energy impacts on us co...,projected photovoltaic energy impacts on us co...,2-s2.0-0031189392,Projected photovoltaic energy impacts on US CO...,"Lee J.C., Fthenakis V.M., Morris S.C., Goldste...",Progress in Photovoltaics: Research and Applic...,10.1002/(sici)1099-159x(199707/08)5:4<277::aid...,1997.0,,Review,,,,10.1002/(sici)1099-159x(199707/08)5:4<277::aid...,Projected photovoltaic energy impacts on US CO...,the potential role of photovoltaic technologie...,the potential role of photovoltaic technologie...
3465,the present research study used the quantitati...,The present research study used the quantitati...,a 100 renewable scenario for venezuelan power ...,a 100 renewable scenario for venezuelan power ...,2-s2.0-85047432166,A 100% renewable scenario for Venezuelan Power...,Herman S.B.,CIGRE Session 45 - 45th International Conferen...,,2014.0,Energy planning; Green house gas emissions; Re...,Conference Paper,,,,,A 100% renewable scenario for Venezuelan Power...,the present research study used the quantitati...,the present research study used the quantitati...


In [24]:
df_deb_last = df.loc[df.beg_abs_200.duplicated(keep='last')]
#df_deb_last.to_excel('last.xlsx')
df_deb_last

Unnamed: 0,beg_abs,abstract,prepro_title,prepro_title_wo_punct,scopus_number,title,authors,source,doi,publication_year,author_keywords,docu_type,WOS_number,doc_type,email,low_doi,title_red,prepro_abs,beg_abs_200
2193,objective the aim of the present article is to...,Objective: The aim of the present article is t...,prospects of carbon capture and storage ccs in...,prospects of carbon capture and storage ccs in...,2-s2.0-84939800728,Prospects of carbon capture and storage (CCS) ...,"Viebahn P., Vallentin D., Höller S.",Applied Energy,10.1016/j.apenergy.2015.07.023,2015.0,CCS; China; CO2 storage potential; Integrated ...,Article,WOS:000364249200020,J,,10.1016/j.apenergy.2015.07.023,Prospects of carbon capture and storage (CCS) ...,objective the aim of the present article is to...,objective the aim of the present article is to...
2293,policies to mitigate greenhouse gas ghg emissi...,Policies to mitigate greenhouse gas (GHG) emis...,co benefits of global and regional greenhouse ...,cobenefits of global and regional greenhouse g...,2-s2.0-84980417625,Co-benefits of global and regional greenhouse ...,"Zhang Y., Bowden J.H., Adelman Z., Naik V., Ho...",Atmospheric Chemistry and Physics,10.5194/acp-16-9533-2016,2016.0,,Article,WOS:000382825200001,J,,10.5194/acp-16-9533-2016,Co-benefits of global and regional greenhouse ...,policies to mitigate greenhouse gas ghg emissi...,policies to mitigate greenhouse gas ghg emissi...
3441,the potential role of photovoltaic technologie...,The potential role of photovoltaic technologie...,projected photovoltaic energy impacts on us co...,projected photovoltaic energy impacts on us co...,,Projected photovoltaic energy impacts on US CO...,"Lee, JC; Fthenakis, VM; Morris, SC; Goldstein,...",PROGRESS IN PHOTOVOLTAICS,,1997.0,,,WOS:A1997XU72900007,J,,,Projected photovoltaic energy impacts on US CO...,the potential role of photovoltaic technologie...,the potential role of photovoltaic technologie...
3464,the present research study used the quantitati...,The present research study used the quantitati...,a sustainable scenario for venezuelan power ge...,a sustainable scenario for venezuelan power ge...,2-s2.0-84858277067,A sustainable scenario for Venezuelan power ge...,Bautista S.,Energy Policy,10.1016/j.enpol.2012.01.060,2012.0,Power generation; Renewable energy; Sustainabl...,Article,WOS:000302848700031,J,,10.1016/j.enpol.2012.01.060,A sustainable scenario for Venezuelan power ge...,the present research study used the quantitati...,the present research study used the quantitati...


In [25]:
df[3442,] = df.loc[3442,].combine_first(df.loc[3441,])

In [26]:
df.index

RangeIndex(start=0, stop=4723, step=1)

In [27]:
df = df.drop([3441,2294,3465])

In [28]:
df_deb_first = df.loc[df.beg_abs_200.duplicated(keep='first')]
df_deb_first

Unnamed: 0,beg_abs,abstract,prepro_title,prepro_title_wo_punct,scopus_number,title,authors,source,doi,publication_year,author_keywords,docu_type,WOS_number,doc_type,email,low_doi,title_red,prepro_abs,beg_abs_200,"(3442,)"
2194,objective the aim of the present article is to...,Objective: The aim of the present article is t...,prospects of carbon capture and storage ccs in...,prospects of carbon capture and storage ccs in...,2-s2.0-84890938806,Prospects of carbon capture and storage (CCS) ...,"Viebahn P., Vallentin D., Höller S.",Applied Energy,10.1016/j.apenergy.2013.11.054,2014.0,CCS; India; Integrated assessment; Power sector,Article,WOS:000331675800006,J,,10.1016/j.apenergy.2013.11.054,Prospects of carbon capture and storage (CCS) ...,objective the aim of the present article is to...,objective the aim of the present article is to...,


In [29]:
df_deb_last = df.loc[df.beg_abs_200.duplicated(keep='last')]
df_deb_last

Unnamed: 0,beg_abs,abstract,prepro_title,prepro_title_wo_punct,scopus_number,title,authors,source,doi,publication_year,author_keywords,docu_type,WOS_number,doc_type,email,low_doi,title_red,prepro_abs,beg_abs_200,"(3442,)"
2193,objective the aim of the present article is to...,Objective: The aim of the present article is t...,prospects of carbon capture and storage ccs in...,prospects of carbon capture and storage ccs in...,2-s2.0-84939800728,Prospects of carbon capture and storage (CCS) ...,"Viebahn P., Vallentin D., Höller S.",Applied Energy,10.1016/j.apenergy.2015.07.023,2015.0,CCS; China; CO2 storage potential; Integrated ...,Article,WOS:000364249200020,J,,10.1016/j.apenergy.2015.07.023,Prospects of carbon capture and storage (CCS) ...,objective the aim of the present article is to...,objective the aim of the present article is to...,


### Sorting dataframe

In [30]:
df.sort_values(by = ['publication_year','title'], ascending = [False,True], inplace = True)
df['d_type'] = df.docu_type.combine_first(df.doc_type)
col = ['title','authors','source','doi','d_type','abstract','publication_year','horizon_year','author_keywords','scopus_number','WOS_number','prepro_title']
df = df.reindex(columns=col)

In [31]:
print( "%d X %d dataframe" % (len(df), len(df.columns) ))

4720 X 12 dataframe


### Counting papers from Scopus and WOS

In [32]:
df_only_scop = df[df.scopus_number.notnull() & df.WOS_number.isnull()].copy()
print("%d papers only on Scopus" % len(df_only_scop) )

df_only_WOS = df[df.scopus_number.isnull() & df.WOS_number.notnull()].copy()
print("%d papers only on WOS" % len(df_only_WOS) )

df_both = df[df.scopus_number.notnull() & df.WOS_number.notnull()].copy()
print("%d papers on both Scopus and WOS" % len(df_both) )

951 papers only on Scopus
585 papers only on WOS
3184 papers on both Scopus and WOS


### Storing dataframe

In [33]:
df.to_excel("fusion_scop_wos.xlsx")

In [34]:
joblib.dump((df), "df_for_analysis.pkl") 

['df_for_analysis.pkl']