In [1]:
import joblib
import pandas as pd
import string
import re
import numpy as np

### Load the WoS and Scopus databases

In [2]:
# Read the WOS export and report its size as rows X columns.
df_WOS = pd.read_csv("./intermed/df_WOS.csv")
print("Loaded %d X %d  WOS dataframe" % df_WOS.shape)

# Same for the Scopus export.
df_Scopus = pd.read_csv("./intermed/df_Scopus.csv")
print("Loaded %d X %d  Scopus dataframe" % df_Scopus.shape)

Loaded 3800 X 10  WOS dataframe
Loaded 4181 X 9  Scopus dataframe


### Concatenate the databases

In [3]:
# Stack the Scopus rows on top of the WOS rows. sort=False leaves the columns
# in the order they are encountered instead of sorting them.
source_frames = [df_Scopus, df_WOS]
df = pd.concat(source_frames, sort=False)

### Remove duplicates based on the DOI
Convert the DOIs to lower case so that the comparison is case-insensitive.

In [4]:
# Lower-cased copy of the DOI; this is the key used to group duplicate DOIs.
df['low_doi'] = df.doi.str.lower()

In [5]:
# Rows without a DOI cannot be de-duplicated on it, so they are set aside.
missing_doi = df['doi'].isna()
df_wo_doi = df.loc[missing_doi]
print('There are %d papers without doi' % df_wo_doi.shape[0])

There are 755 papers without doi


In [6]:
has_doi = df['doi'].notna()
df_with_doi = df.loc[has_doi]

# One row per lower-cased DOI. GroupBy.first() takes, column by column, the
# first non-null value of the group, so fields missing in one record can be
# filled from its duplicate.
doi_groups = df_with_doi.groupby('low_doi')
df_without_dupli_doi = doi_groups.first().reset_index()
print('There are %d different papers with doi' % df_without_dupli_doi.shape[0])

There are 4189 different papers with doi


In [7]:
# Put the DOI-less rows back together with the DOI-deduplicated rows.
doi_dedup_parts = [df_wo_doi, df_without_dupli_doi]
df = pd.concat(doi_dedup_parts, sort=False)
print('There are %d papers after removing duplicates based on doi' % df.shape[0])

There are 4944 papers after removing duplicates based on doi


### Remove duplicates based on the title
We normalise the title by
* converting it to lower case
* deleting the text in square brackets [ ], unless this would remove the whole title (in that case the original title is kept)
* (i) removing punctuation (replacing it with nothing)
* (ii) replacing punctuation with one white space and then collapsing repeated white spaces into a single one

In [8]:
# Extra punctuation marks that the ASCII-only default value of
# string.punctuation does not contain (typographic quotes, dashes, ...).
EXTRA_PUNCTUATION = '’—☆–−()()©“”‘'


def with_extra_punctuation(base, extra):
    """Return `base` followed by `extra`, unless `base` already ends with it.

    The guard makes this cell idempotent: without it, every re-run of the
    cell appended the extra characters to string.punctuation once more.
    """
    return base if base.endswith(extra) else base + extra


# NOTE: the stdlib attribute is overridden on purpose, because the cells that
# build the translation tables read string.punctuation directly.
string.punctuation = with_extra_punctuation(string.punctuation, EXTRA_PUNCTUATION)
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~’—☆–−()()©“”‘'

First, we remove all punctuation and group duplicates by title without punctuation

Example of duplicates identified : 
* The effect of Estonian electricity production scenarios on CO(2) and SO(2) emissions in 2000-2030
* The effect of Estonian electricity production scenarios on CO2 and SO2 emissions in 2000-2030

In [9]:
# Matches from the first '[' or ']' up to the last '[' or ']' of the title
# (greedy). Written as a raw string: in a normal string literal "\[" and "\]"
# are invalid escape sequences, which Python reports with a warning.
bracketed_span = re.compile(r"[\[\]].*[\[\]]")

# Title with the bracketed span removed.
df['title_red'] = pd.Series([bracketed_span.sub("", title) for title in df.title.values], index = df.index)

# If (almost) nothing is left (at most one character), the whole title was
# bracketed: fall back to the original title.
df['title_red'] = np.where(df['title_red'].str.len()<=1,df['title'],df['title_red'])

In [10]:
# Translation table that deletes every character of string.punctuation
# (a character mapped to None is removed by str.translate).
table_remove_punct = str.maketrans(dict.fromkeys(string.punctuation))

In [11]:
# Lower-case the reduced titles, then delete all punctuation from each of them.
lowered_titles = df['title_red'].str.lower()
titles = lowered_titles.tolist()
titles_wo_punct = []
for title in titles:
    titles_wo_punct.append(title.translate(table_remove_punct))

# Re-attach the processed titles to the rows they came from.
df['prepro_title_wo_punct'] = pd.Series(titles_wo_punct, index=df.index)

We sort the dataframe by 'prepro_title_wo_punct' and then by doi, so that when there are duplicates (e.g. a published paper and its conference paper) the papers with a doi come first and are the ones kept.

In [12]:
# Ascending sort on the normalised title, then on the DOI. Missing DOIs are
# placed last by sort_values' default, i.e. after the rows that have one.
title_then_doi = ['prepro_title_wo_punct', 'doi']
df.sort_values(by=title_then_doi, ascending=True, inplace=True)

In [13]:
# Collapse rows sharing the same punctuation-free title into a single row
# (first non-null value of each column within the group).
title_groups = df.groupby('prepro_title_wo_punct')
df = title_groups.first().reset_index()
print('There are %d papers after removing duplicates based on low title without punctuation' % df.shape[0])

There are 4769 papers after removing duplicates based on low title without punctuation


Then we replace punctuation with one white space and collapse repeated white spaces into a single one

Example of duplicates identified : 
* Future Japan Power Generation Sector by Introducing Hydrogen Plant with 80% CO2 Emission Reduction Target : A Preliminary Analysis
* Future Japan power generation sector by introducing hydrogen plant with 80% CO2 emission reduction target: A preliminary analysis

In [14]:
# Lower-cased reduced titles of the rows that survived the previous step.
titles = df['title_red'].str.lower().tolist()

# Translation table mapping each punctuation character to a single blank.
blanks = ' ' * len(string.punctuation)
table_remove_punct_white_space = str.maketrans(string.punctuation, blanks)

In [15]:
# Step 1: every punctuation character becomes a blank.
titles_wo_punct_white_space = list(
    map(lambda title: title.translate(table_remove_punct_white_space), titles)
)

# Step 2: split/join collapses runs of white space into one blank and trims
# the ends.
titles_wo_punct_one_white_space = []
for spaced_title in titles_wo_punct_white_space:
    titles_wo_punct_one_white_space.append(' '.join(spaced_title.split()))

df['prepro_title'] = pd.Series(titles_wo_punct_one_white_space, index=df.index)

In [16]:
# Ascending sort on the normalised title, then on the DOI (missing DOIs last).
prepro_title_then_doi = ['prepro_title', 'doi']
df.sort_values(by=prepro_title_then_doi, ascending=True, inplace=True)

In [17]:
# Collapse rows sharing the same normalised title into a single row
# (first non-null value of each column within the group).
prepro_title_groups = df.groupby('prepro_title')
df = prepro_title_groups.first().reset_index()
print('There are %d papers after removing duplicates based on low title with punctuation replaced by white space' % df.shape[0])

There are 4755 papers after removing duplicates based on low title with punctuation replaced by white space


### Keep papers with abstract

In [18]:
# Discard the rows whose abstract is missing.
has_abstract = df['abstract'].notna()
df = df.loc[has_abstract]
print('There are %d papers with abstract' % df.shape[0])

There are 4743 papers with abstract


### Remove duplicates based on the abstract

In [19]:
# Sort by abstract, then DOI (missing DOIs last), and keep one row per
# distinct abstract text.
abstract_then_doi = ['abstract', 'doi']
df.sort_values(by=abstract_then_doi, ascending=True, inplace=True)
abstract_groups = df.groupby('abstract')
df = abstract_groups.first().reset_index()
print('There are %d papers after removing duplicates based on abstract' % (df.shape[0]))

There are 4737 papers after removing duplicates based on abstract


### Remove duplicates based on the beginning of the abstract

In [20]:
# Normalise the abstracts the same way as the titles: lower case, punctuation
# replaced by blanks, runs of white space collapsed into one blank.
abstracts = df['abstract'].str.lower().tolist()
abstracts_wo_punct = list(
    map(lambda abstract: abstract.translate(table_remove_punct_white_space), abstracts)
)
abstracts_wo_punct_one_white_space = []
for spaced_abstract in abstracts_wo_punct:
    abstracts_wo_punct_one_white_space.append(' '.join(spaced_abstract.split()))

df['prepro_abs'] = pd.Series(abstracts_wo_punct_one_white_space, index=df.index)

In [21]:
# Key on the first 250 characters of the normalised abstract.
df['beg_abs'] = df['prepro_abs'].str.slice(stop=250)

# Sort by that key, then DOI (missing DOIs last), and keep one row per key.
beg_abs_then_doi = ['beg_abs', 'doi']
df.sort_values(by=beg_abs_then_doi, ascending=True, inplace=True)
beg_abs_groups = df.groupby('beg_abs')
df = beg_abs_groups.first().reset_index()

print('There are %d papers after removing duplicates based on first 250 characters of abstract' % (df.shape[0]))

There are 4723 papers after removing duplicates based on first 250 characters of abstract


#### Look more closely at papers having the same first 200 characters

In [22]:
# Shorter key (first 200 characters of the normalised abstract), used only to
# inspect near-duplicates by hand.
df['beg_abs_200'] = df['prepro_abs'].str.slice(stop=200)

In [23]:
# Rows whose 200-character key already appeared earlier in the frame
# (keep='first' leaves the first occurrence unflagged).
repeats_earlier_row = df['beg_abs_200'].duplicated(keep='first')
df_deb_first = df.loc[repeats_earlier_row]

inspect_columns = [
    'title',
    'doi',
    'publication_year',
    'docu_type',
    'doc_type',
    'scopus_number',
    'WOS_number',
]
df_deb_first[inspect_columns]

Unnamed: 0,title,doi,publication_year,docu_type,doc_type,scopus_number,WOS_number
2194,Prospects of carbon capture and storage (CCS) ...,10.1016/j.apenergy.2013.11.054,2014.0,Article,J,2-s2.0-84890938806,WOS:000331675800006
2294,Co-benefits of global and regional greenhouse ...,10.5194/acp-2015-1054,2016.0,Article,,2-s2.0-85042813058,
3445,Projected photovoltaic energy impacts on US CO...,10.1002/(sici)1099-159x(199707/08)5:4<277::aid...,1997.0,Review,,2-s2.0-0031189392,
3468,A 100% renewable scenario for Venezuelan Power...,,2014.0,Conference Paper,,2-s2.0-85047432166,


In [24]:
# Rows whose 200-character key appears again later in the frame
# (keep='last' leaves the last occurrence unflagged).
repeated_by_later_row = df['beg_abs_200'].duplicated(keep='last')
df_deb_last = df.loc[repeated_by_later_row]

inspect_columns = [
    'title',
    'doi',
    'publication_year',
    'docu_type',
    'doc_type',
    'scopus_number',
    'WOS_number',
]
df_deb_last[inspect_columns]

Unnamed: 0,title,doi,publication_year,docu_type,doc_type,scopus_number,WOS_number
2193,Prospects of carbon capture and storage (CCS) ...,10.1016/j.apenergy.2015.07.023,2015.0,Article,J,2-s2.0-84939800728,WOS:000364249200020
2293,Co-benefits of global and regional greenhouse ...,10.5194/acp-16-9533-2016,2016.0,Article,J,2-s2.0-84980417625,WOS:000382825200001
3444,Projected photovoltaic energy impacts on US CO...,,1997.0,,J,,WOS:A1997XU72900007
3467,A sustainable scenario for Venezuelan power ge...,10.1016/j.enpol.2012.01.060,2012.0,Article,J,2-s2.0-84858277067,WOS:000302848700031


* 2194 and 2193 have the same abstracts but relate respectively to China and India so we keep both. 
* We merge 3445 and 3444
* 3468 is the conference paper for 3467 so we keep the published version
* 2294 is the preversion of 2293, so we keep the last version

In [25]:
# Merge the two records of the same paper: keep the values of row 3445 and
# fill its missing fields with the values of row 3444.
#
# The assignment must go through .loc to target the *row* labelled 3445.
# The previous form, `df[3445,] = ...`, assigned to a column keyed (3445,)
# and left row 3445 untouched, so the merge never happened.
#
# NOTE(review): 3445 and 3444 are index labels read from the tables displayed
# above; they are only valid for the current input files - re-check them
# whenever the source data changes.
df.loc[3445] = df.loc[3445].combine_first(df.loc[3444])

In [26]:
# Remove the rows identified by hand above as redundant (index labels).
rows_to_drop = [3444, 2294, 3468]
df = df.drop(index=rows_to_drop)

### Sort the dataframe

In [27]:
# Most recent publications first; ties are ordered alphabetically by title.
df.sort_values(by=['publication_year', 'title'], ascending=[False, True], inplace=True)

# Single document-type column: docu_type where present, otherwise doc_type.
df['d_type'] = df['docu_type'].combine_first(df['doc_type'])

# Columns kept for the analysis, in output order. reindex (rather than plain
# selection) yields an empty column instead of failing if one is absent.
col = [
    'title',
    'authors',
    'source',
    'doi',
    'd_type',
    'abstract',
    'publication_year',
    'horizon_year',
    'author_keywords',
    'scopus_number',
    'WOS_number',
    'prepro_title',
]
df = df.reindex(columns=col)

In [28]:
# Final size of the cleaned frame as rows X columns.
print("%d X %d dataframe" % df.shape)

4720 X 12 dataframe


### Store the dataframe

In [29]:
# Write the cleaned frame to disk without the row index.
output_path = "./output/df_for_analysis.csv"
df.to_csv(output_path, index=False)