In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np
import joblib
import string

In [2]:
(df) = joblib.load("2_Fusion/df_for_analysis.pkl" )
print( "Loaded %d X %d dataframe" % (len(df), len(df.columns) ))

Loaded 4714 X 12 dataframe


In [3]:
df['prepro_title'] = " " + df['prepro_title'] + " "

### Creating new dataframe containing one row for each country mentionned per paper

In [4]:
UNSD_Path = "2_Fusion/1_Extraction/UNSD_database.xlsx"
UNSD_df = pd.read_excel(UNSD_Path, encoding='utf-8')
print( "Loaded %d X %d dataframe" % (len(UNSD_df), len(UNSD_df.columns) ))

Loaded 262 X 19 dataframe


In [5]:
UNSD_df.head(1)

Unnamed: 0,Global Code,Global Name,Region Code,Region_Name,Sub-region Code,Sub-region_Name,Intermediate Region Code,Intermediate_Region_Name,Country0,Demonym1,Demonym2,M49 Code,ISO_3,Least Developed Countries (LDC),Land Locked Developing Countries (LLDC),Small Island Developing States (SIDS),Developed_Developing_Countries,Region,Country
0,1.0,World,2.0,Africa,15.0,Northern Africa,,,Algeria,Algerian,,12,DZA,,,,Developing,Africa,Algeria


We create a new dataframe where we concatenate all the sub-dataframes corresponding to each country

In [6]:
df_country = pd.DataFrame()

In [7]:
for index, row in UNSD_df.iterrows():
    country = str(row['Country0'])
    country_low = ' ' + country.lower() + ' '
    demon1 = str(row['Demonym1'])
    demon1_low = ' ' + demon1.lower() + ' '
    demon2 = str(row['Demonym2'])
    demon2_low = ' ' + demon2.lower() + ' '
    if (demon1 == "nan") & (demon2 == "nan"):
        df_tempo = df[df['prepro_title'].str.contains(country_low)].copy()
        df_tempo['Country0'] = country
        df_country = df_country.append(df_tempo)
    elif (demon1 != "nan") & (demon2 == "nan"):
        df_tempo = df[df['prepro_title'].str.contains(country_low) | df['prepro_title'].str.contains(demon1_low)].copy()
        df_tempo['Country0'] = country
        df_country = df_country.append(df_tempo)
    else:
        df_tempo = df[df['prepro_title'].str.contains(country_low) | df['prepro_title'].str.contains(demon1_low)| df['prepro_title'].str.contains(demon2_low)].copy()
        df_tempo['Country0'] = country
        df_country = df_country.append(df_tempo)        
print("%d X %d dataframe with one line for each country search" % (len(df_country), len(df_country.columns) ))

4923 X 13 dataframe with one line for each country search


### Create region & ISO-3 columns

In [8]:
df_country = pd.merge(df_country,UNSD_df[['Country0','ISO_3','Region']],on='Country0')

### Remove duplicates according ISO-3 code 

In [9]:
df_country = df_country.drop_duplicates(['ISO_3','title'], keep='first')
print("%d X %d dataframe" % (len(df_country), len(df_country.columns) ))

4905 X 15 dataframe


### Select only rows that contains mitigation synonym in title, abstract or AUTHOR keywords

In [10]:
df_country['low_abstract'] = df_country['abstract'].str.lower()
df_country['low_author_keywords'] = df_country['author_keywords'].str.lower()

In [11]:
mitig_list = ["mitigation","carbon","co2", "ghg","greenhouse gas","emission"]

In [12]:
df_verif = pd.DataFrame()

In [13]:
for mitig in mitig_list:
    mitig = str(mitig)
    df_tempo = df_country[df_country['prepro_title'].str.contains(mitig) | df_country['low_abstract'].str.contains(mitig) | df_country['low_author_keywords'].str.contains(mitig, na = False)].copy()
    df_verif = df_verif.append(df_tempo)
print("%d X %d dataframe with mitigation synonym inside each row" % (len(df_verif), len(df_verif.columns) ))

12528 X 17 dataframe with mitigation synonym inside each row


In [14]:
df_verif = df_verif.drop_duplicates(['ISO_3','title'], keep='first')
print("%d X %d verified dataframe" % (len(df_verif), len(df_verif.columns) ))

4721 X 17 verified dataframe


### Creating horizon year

In [15]:
df_verif.reset_index(0, inplace = True)
df_verif['horizon_year'] = np.nan

In [16]:
year_liste = []
for k in range (2025, 2101):
    year_liste.append(k)

In [17]:
for year in year_liste:
    year = str(year)
    presence_title = df_verif.title.str.contains(year, regex = False, na = False)
    presence_abstract = df_verif.abstract.str.contains(year, regex = False, na = False)
    presence_authorkeywords = df_verif.author_keywords.str.contains(year, regex = False, na = False)
    for k in range(len(df_verif)):
        if ((presence_title[k]== True) or (presence_abstract[k]== True) or (presence_authorkeywords[k]== True)) :
            df_verif.loc[k,'horizon_year'] = year

### Select only rows that contains a horizon year in [2025;2100] in title, abstract or AUTHOR keywords

In [18]:
df_verif = df_verif[df_verif.horizon_year.notnull()]
print( "%d X %d dataframe with horizon_year" % (len(df_verif), len(df_verif.columns) ))

4715 X 18 dataframe with horizon_year


#### Associate to ISO3 one only name (USA, United States, US, U.S. -> Country = United States of America)

In [19]:
UNSD_unique = UNSD_df.groupby('ISO_3').first().reset_index()
df_verif = pd.merge(df_verif, UNSD_unique[['ISO_3','Country']],on='ISO_3')

#### Check for US papers : ambiguity between country and personal pronoun 'us'

In [20]:
check = df_verif[(df_verif['prepro_title'].str.contains(' us ')) & 
                 (~df_verif['prepro_title'].str.contains(' the us ')) & 
                 (~df_verif['title'].str.contains('US')) & 
                 (df_verif['Country0']=="US")]

In [21]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'max_colwidth', -1):
    print (check['title'])

671    What do global climate models tell us about future arctic sea ice coverage changes?                                                                                                 
675    Current fossil fuel infrastructure does not yet commit us to 1.5 °C warming                                                                                                         
696    Limiting global warming to 2 °C: What do the latest mitigation studies tell us about costs, technologies and other impacts?                                                         
703    What are incident reports telling us? A comparative study at two Australian hospitals of medication errors identified at audit, detected by staff and reported to an incident system
813    Hybrids are an effective transitional technology for limiting us passenger fleet carbon emissions                                                                                   
832    How negative can biofuels with CCS take us and at wha

In [22]:
df_verif = df_verif.drop([571,592,599,713,827,830])
print("%d X %d dataframe after selection" % (len(df_verif), len(df_verif.columns) ))

4709 X 19 dataframe after selection


#### Remove the attribution of papers containing the term "British Colombia" to the UK

In [23]:
df_verif = df_verif.drop(df_verif[(df_verif['ISO_3'].str.contains("GBR")) & (df_verif.title.str.contains("British Columbia"))].index)

### Order the dataframe

In [24]:
df_multi = df_verif.copy()
print("%d X %d dataframe with one line for each country" % (len(df_multi), len(df_multi.columns) ))

4696 X 19 dataframe with one line for each country


In [25]:
col = ['ISO_3','Country','Region','title','authors','source','doi','doc_type','abstract','author_keywords','publication_year','horizon_year','scopus_number','WOS_number']
df_multi = df_multi.reindex(columns=col)
df_multi.sort_values(by = ['ISO_3','Country','publication_year','title','doi'], ascending = [True,True,False,True,True], inplace = True)
print( "Loaded %d X %d multi-rows dataframe" % (len(df_multi), len(df_multi.columns) ))

Loaded 4696 X 14 multi-rows dataframe


In [26]:
df_multi.to_excel('database_multi_rows_each_paper_low_str.xlsx')

### Database one row each paper

In [27]:
df_one = df_multi.groupby(['title']).first().reset_index()

In [28]:
col = ['ISO_3','Country','Region','title','authors','source','doi','doc_type','abstract','author_keywords','publication_year','horizon_year','scopus_number','WOS_number']
df_one = df_one.reindex(columns=col)
df_one.sort_values(by = ['ISO_3','Country','publication_year','title','doi'], ascending = [True,True,False,True,True], inplace = True)
print( "Loaded %d X %d one row each paper dataframe" % (len(df_one), len(df_one.columns) ))

Loaded 4516 X 14 one row each paper dataframe


In [29]:
df_one.to_excel('database_one_row_each_paper_low_str.xlsx')

#### Count papers with DOI

In [30]:
df_doi = df_one.loc[df_one.doi.notnull()]
print( "Loaded %d X %d dataframe with doi" % (len(df_doi), len(df_doi.columns) ))

Loaded 3950 X 14 dataframe with doi


#### Count papers with word "scenario"

In [31]:
df_scenario = df_one[df_one['title'].str.contains("scenario") | df_one['abstract'].str.contains("scenario")| df_one['author_keywords'].str.contains("scenario")].copy()
print( "Loaded %d publications with 'scenario' " % (len(df_scenario)))

Loaded 2306 publications with 'scenario' 


#### Count papers without author keywords

In [32]:
df_keys_nul = df_one[df_one.author_keywords.isnull()]
print(len(df_keys_nul))

796


### Storing dataframe multi papers and dataframe unique paper

In [33]:
joblib.dump((df_multi, df_one), "2df_countries_title.pkl")

['2df_countries_title.pkl']

### Counting papers from Scopus and WOS

In [34]:
df_only_scop = df_one[df_one.scopus_number.notnull() & df_one.WOS_number.isnull()]
print("%d papers only on Scopus" % len(df_only_scop) )

df_only_WOS = df_one[df_one.scopus_number.isnull() & df_one.WOS_number.notnull()]
print("%d papers only on WOS" % len(df_only_WOS) )

df_both = df_one[df_one.scopus_number.notnull() & df_one.WOS_number.notnull()]
print("%d papers on both Scopus and WOS" % len(df_both) )

925 papers only on Scopus
452 papers only on WOS
3139 papers on both Scopus and WOS


### Storing dataframe with papers mentionning no country

In [35]:
df_all = df.merge(df_one, on=['title'], how='left', indicator=True)
df_test = df_all[~df_all._merge.isin(['both'])]
df_test.to_excel('rejected.xlsx')