In [1]:
import pandas as pd 
import numpy as np 
import joblib
import string
from stemming.porter2 import stem
from nltk.corpus import stopwords
import operator

### Load the database with one row for each publication

In [2]:
# Read the one-row-per-paper CSV and report its dimensions.
df = pd.read_csv("../2_Treatment_database/output/database_one_row_each_paper.csv")
n_rows, n_cols = df.shape
print("Loaded %d X %d dataframe with unique papers" % (n_rows, n_cols))

Loaded 4691 X 16 dataframe with unique papers


### Preprocess the abstracts

Convert the abstracts to lowercase

In [3]:
# Lower-case every abstract and collect the results in a plain Python list.
abstracts = df['abstract'].str.lower().tolist()

Replace every punctuation character with a single white space, then collapse consecutive white spaces into one

In [4]:
# ASCII punctuation plus extra typographic symbols to strip.
# Kept in a local constant instead of overwriting `string.punctuation`: the
# original assignment modified a standard-library global, so every re-run of
# this cell appended the extra symbols again.
punctuation_chars = string.punctuation + '’—☆–−()()©“”‘'

# Translation table mapping each punctuation character to one space.
table_remove_punct_white_space = str.maketrans(punctuation_chars, ' ' * len(punctuation_chars))
abstracts = [abstract.translate(table_remove_punct_white_space) for abstract in abstracts]

# split()/join collapses the runs of whitespace left behind by the translation.
abstracts = [' '.join(abstract.split()) for abstract in abstracts]

Define the stopwords:
* NLTK English stopwords
* country names and demonyms

In [5]:
# De-duplicated English stopwords from NLTK, stored as a list.
nltk_stopwords = [*set(stopwords.words('english'))]

In [6]:
countriesPath = "../0_Reference_files/UNSD_database.csv"
countries_df = pd.read_csv(countriesPath)

# Country names, followed by the two demonym columns with their empty cells dropped.
countries_list = countries_df['Country0'].tolist()
demonym1_list = countries_df['Demonym1'].dropna().tolist()
demonym2_list = countries_df['Demonym2'].dropna().tolist()
names_list = [*countries_list, *demonym1_list, *demonym2_list]

In [7]:
def _normalise_name(raw_name):
    """Lower-case a name, turn punctuation into spaces and squeeze whitespace."""
    without_punct = str(raw_name).lower().translate(table_remove_punct_white_space)
    return ' '.join(without_punct.split())


# Apply the same cleaning to the names as was applied to the abstracts.
names_list = [_normalise_name(name) for name in names_list]

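Remove multi-word names: they can never match a single token in the stopword filter, so they are deleted from the abstracts as whole phrases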

In [8]:
# Names longer than 3 characters that consist of more than one word.
# `and` is required here: `&` binds tighter than `>`, so the former
# `len(name)>3 & len(name.split())>1` was evaluated as the chained comparison
# `len(name) > (3 & n_words) > 1`, which wrongly rejected names of 4 or 5
# words (3 & 4 == 0 and 3 & 5 == 1).
not_single = [name for name in names_list
              if len(name) > 3 and len(name.split()) > 1]

In [9]:
# One empty replacement string per multi-word name.
l = ['' for _ in not_single]

In [10]:
# Map each multi-word name to its (empty) replacement.
trans = dict(zip(not_single, l))

In [11]:
# Delete every multi-word name from every abstract, updating the list in place.
# The replacements are applied to each abstract in the iteration order of `trans`.
for idx, text in enumerate(abstracts):
    for name, replacement in trans.items():
        text = text.replace(name, replacement)
    abstracts[idx] = text

In [12]:
# Full stopword list: NLTK words followed by the cleaned country names and demonyms.
custom_stopwords = [*nltk_stopwords, *names_list]

Remove stopwords and words of two characters or fewer, then reduce each remaining word to its stem:

In [13]:
# Build the lookup once as a set: testing each token against the
# `custom_stopwords` list rescanned the whole list for every word.
stopword_lookup = set(custom_stopwords)

# Keep words longer than 2 characters that are not stopwords, and stem them.
# Splitting on " " can yield empty strings; the length test discards them.
abstracts = [" ".join(stem(word) for word in abstract.split(" ")
                      if word not in stopword_lookup and len(word) > 2)
             for abstract in abstracts]

Remove stem stopwords:
* synonyms of mitigation
* stems that proved uninformative in topic modeling experiments

In [14]:
# Stems that are synonyms of "mitigation".
mitigation_stems = [
    'mitig', 'carbon', 'decarbonis', 'decarbon', 'co2', 'ghg',
    'greenhous', 'gas', 'emiss', 'reduc', 'reduct',
]
# Stems that appeared useless after topic modeling experiments.
uninformative_stems = [
    'million', 'ton', 'billion', 'proceed', 'paper', 'data', 'present',
    'result', 'studi', 'describ', 'also', 'base', 'explor', 'analys',
    'analyz', 'elsevi',
]
stem_stopwords = mitigation_stems + uninformative_stems

In [15]:
# Drop the stems listed in `stem_stopwords`. The loop variable is named `token`
# so that it does not shadow the imported `stem` function.
abstracts = [
    " ".join(token for token in abstract.split(" ") if token not in stem_stopwords)
    for abstract in abstracts
]

Remove tokens made up only of digits

In [16]:
# Drop tokens for which str.isdigit() is true (purely numeric tokens).
abstracts = [
    " ".join(token for token in abstract.split(" ") if not token.isdigit())
    for abstract in abstracts
]

Save processed abstracts

In [17]:
# Single-column frame holding the preprocessed abstracts.
abstract_prepro = pd.DataFrame({'abstracts_prepro': abstracts})

In [18]:
# Write the preprocessed abstracts to disk without the index column.
abstract_prepro.to_csv(
    path_or_buf="./interm/processed_abstracts.csv",
    index=False,
)

### Term weighting with TF-IDF

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Document-frequency bounds of 0.95 (upper) and 0.01 (lower); IDF smoothing is disabled.
tfidf_vectorizer = TfidfVectorizer(
    max_df=0.95,
    min_df=0.01,
    smooth_idf=False,
)

# Fit on the preprocessed abstracts and build the document-term matrix.
tfidf = tfidf_vectorizer.fit_transform(abstracts)
print('Created %d X %d TF-IDF-normalized document-term matrix' % tfidf.shape)

Created 4691 X 1300 TF-IDF-normalized document-term matrix


In [20]:
# Extract the resulting vocabulary.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out()` when it exists and convert its array to a
# list so that `tfidf_feature_names` keeps the type it had before.
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    tfidf_feature_names = tfidf_vectorizer.get_feature_names_out().tolist()
else:
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print("Vocabulary has %d distinct terms" % len(tfidf_feature_names))

Vocabulary has 1300 distinct terms


Save the TF-IDF matrix and the vocabulary feature names in pickle (.pkl) format

In [21]:
# Persist the matrix and its vocabulary together as one tuple.
joblib.dump(
    (tfidf, tfidf_feature_names),
    "./interm/tfidf_matrix-features_names.pkl",
)

['./interm/tfidf_matrix-features_names.pkl']

### Look at the highest-weighted terms

In [22]:
def rank_terms(tfidf, terms):
    """Rank terms by their total weight summed over all documents.

    Parameters
    ----------
    tfidf : matrix-like
        Document-term matrix supporting ``.sum(axis=0)`` (scipy sparse
        matrix/array, ``np.matrix`` or dense ``ndarray``); column ``col``
        holds the weights of ``terms[col]``.
    terms : sequence of str
        Term labels, one per column, in column order.

    Returns
    -------
    list of (term, weight) tuples
        Sorted by decreasing weight; ties keep their column order.

    Raises
    ------
    IndexError
        If ``terms`` has more entries than the matrix has columns.
    """
    # Column sums come back as a (1, n_terms) matrix for scipy sparse matrices
    # but as a flat array for dense ndarrays and sparse arrays; flatten so that
    # indexing works for all of them.
    column_sums = np.asarray(tfidf.sum(axis=0)).ravel()
    # Map each term to the summed weight of its column.
    weights = {term: column_sums[col] for col, term in enumerate(terms)}
    # Rank the terms by their weight over all documents.
    return sorted(weights.items(), key=operator.itemgetter(1), reverse=True)

In [23]:
# Print the 20 highest-weighted terms, numbered from 1.
ranking = rank_terms(tfidf, tfidf_feature_names)
for position, (term, weight) in enumerate(ranking[:20], start=1):
    print("%02d. %s (%.2f)" % (position, term, weight))

01. energi (398.26)
02. scenario (216.68)
03. climat (196.51)
04. chang (177.69)
05. electr (174.56)
06. model (174.07)
07. use (172.87)
08. polici (168.62)
09. power (168.03)
10. sector (161.10)
11. develop (153.57)
12. technolog (146.68)
13. increas (145.72)
14. cost (143.19)
15. system (133.95)
16. target (127.75)
17. industri (124.57)
18. renew (123.35)
19. futur (115.95)
20. generat (115.86)
