## Section 1
### First we use word lists from psychometric tools and obtain synonyms and hyponyms using WordNet

In [None]:
## First-time setup: the `!pip install nltk` and `nltk.download('all')` lines below are
## needed the first time you use this script; they can be commented out on later runs.

!pip install nltk
import nltk
nltk.download('all')
from nltk.corpus import wordnet as wn

# Here we can change the language of analysis.
# Each entry of this list is passed to WordNet's lemmas() when synonyms are collected.
languages = ['eng']


[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping grammars/basque_grammars.zip.
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/biocreative_ppi.zip.
[nltk_data]    | Downloadin

In [None]:
### Function to generate synonyms and hyponyms using WordNet

def generate_wordnet_list(word_base,languages,exclude_list):
  """Expand seed words into a sorted list of WordNet synonyms and hyponyms.

  For every language in ``languages`` and every word in ``word_base``, each
  noun/verb/adjective synset of the word whose name is not in
  ``exclude_list`` is printed (synset, then definition) and contributes the
  names of its own lemmas and of the lemmas of its hyponyms.

  Args:
    word_base: iterable of seed words to look up in WordNet.
    languages: sequence of language codes; each one is passed to ``lemmas()``.
    exclude_list: container of synset names (e.g. 'natural.n.01') to skip.

  Returns:
    The alphabetically sorted, duplicate-free word list built for the FIRST
    language in ``languages``. A list is built for every language, but only
    the first one is returned (this matches the previous behaviour, which
    returned from inside the loop over the per-language lists). An empty
    list is returned when ``languages`` is empty (previously ``None``).
  """
  word_lists = []

  # iterate over languages
  for language in languages:
    # a set gives constant-time duplicate checks; it is sorted once at the end
    words = set()

    # iterate over words in the word_base list
    for word in word_base:

      # iterate over the different meanings (synsets) of the word.
      # The three POS tags are concatenated into a single string;
      # NOTE(review): this relies on wn.synsets accepting such a string -- confirm.
      for meaning in wn.synsets(word, pos=wn.NOUN+wn.VERB+wn.ADJ):
        if meaning.name() in exclude_list:
          continue

        # show the synset and its definition so irrelevant meanings can be
        # spotted and added to exclude_list on the next run
        print(meaning)
        print(meaning.definition())
        print('\n')

        # synonyms of this meaning
        words.update(synonym.name() for synonym in meaning.lemmas(language))

        # synonyms of each hyponym of this meaning
        for hyponym in meaning.hyponyms():
          words.update(synonym.name() for synonym in hyponym.lemmas(language))

    # add the language-specific word list, sorted in alphabetic order
    word_lists.append(sorted(words))

  # only the first language's list is handed back to the caller
  return word_lists[0] if word_lists else []

In [None]:
## Example of seed words related to a target concept -- here, biodiversity.
## In this case, the base seed word list was obtained from:
# Sterett, S. M. (2018). Climate change adaptation: Existential threat, welfare states and legal management. Oñati Socio-Legal Series, Forthcoming.
word_base_biodiversity = [
    'biodiversity',
    'ecosystem',
    'habitat',
    'conservation',
    'environment',
    'species',
    'natural',
]

## generate_wordnet_list expands the seed words into their synonyms and hyponyms.
## Word meanings that are irrelevant for the concept we want to measure can be excluded:
## add the synset name (e.g. 'coinage.n.01') to the exclusion list and re-run this cell
## until all the synsets displayed below have relevant meanings.
exclude_list_biodiversity = [
    'coinage.n.01',
    'natural.n.01',
    'natural.n.02',
    'natural.n.03',
    'natural.a.05',
    'natural.s.08',
]
biodiversity_list = generate_wordnet_list(
    word_base_biodiversity, languages, exclude_list_biodiversity
)


Synset('biodiversity.n.01')
the diversity of plant and animal life in a particular habitat (or in the world as a whole)


Synset('ecosystem.n.01')
a system formed by the interaction of a community of organisms with their physical environment


Synset('habitat.n.01')
the type of environment in which an organism or group normally lives or occurs


Synset('conservation.n.01')
an occurrence of improvement by virtue of preventing loss or injury or other change


Synset('conservation.n.02')
the preservation and careful management of the environment and of natural resources


Synset('conservation.n.03')
(physics) the maintenance of a certain quantities unchanged during chemical reactions or physical transformations


Synset('environment.n.01')
the totality of surrounding conditions


Synset('environment.n.02')
the area in which something exists or lives


Synset('species.n.01')
(biology) taxonomic group whose members can interbreed


Synset('species.n.02')
a specific kind of something


Synse

In [None]:
## This is a very broad list which includes all the synonyms and hyponyms
## collected for the biodiversity seed words we chose

## As one can verify, this list needs pruning:
## some of it by excluding irrelevant synsets a priori (cell above),
## but some of it can also be done manually a posteriori

print(biodiversity_list)

['ambiance', 'ambience', 'area', 'arena', 'background', 'bacteria_species', 'biodiversity', 'born', 'circumstance', 'conservancy', 'conservation', 'conservation_of_charge', 'conservation_of_electricity', 'conservation_of_energy', 'conservation_of_mass', 'conservation_of_matter', 'conservation_of_momentum', 'conservation_of_parity', 'context', 'domain', 'ecology', 'ecosystem', 'element', 'endangered_species', 'environment', 'environs', 'field', 'first_law_of_thermodynamics', 'fish_species', 'habitat', 'habitation', 'home', 'home_ground', 'innate', 'instinctive', 'law_of_conservation_of_energy', 'law_of_conservation_of_mass', 'law_of_conservation_of_matter', 'lifelike', 'medium', 'melting_pot', 'milieu', 'mirror_symmetry', 'natural', 'oil_conservation', 'orbit', 'parity', 'parts', 'preservation', 'raw', 'rude', 'scene', 'scope', 'setting', 'soil_conservation', 'space-reflection_symmetry', 'species', 'sphere', 'street', 'surround', 'surroundings', 'type_species', 'water_conservation']


In [None]:
# LOSS

# Seed words for the "loss" concept, taken from:
#Sterett, S. M. (2018). Climate change adaptation: Existential threat, welfare states and legal management. Oñati Socio-Legal Series, Forthcoming.
word_base_loss = ['loss','threat','damage','risk','decline']

# Synsets whose meaning is irrelevant for "loss" and must be skipped.
# NOTE(review): 'price.n.02' was listed twice here; the duplicate entry was dropped --
# confirm that a different synset was not intended for the second occurrence.
exclude_list_loss = [
    'damage.n.02',
    'price.n.02',
    'loss.n.05',
    'loss.n.06',
    'personnel_casualty.n.01',
    'risk.n.02',
    'refuse.v.02',
    'refuse.v.01',
    'decline.v.07',
]
loss_list = generate_wordnet_list(word_base_loss,languages,exclude_list_loss)


Synset('loss.n.01')
something that is lost


Synset('loss.n.02')
gradual decline in amount or activity


Synset('loss.n.03')
the act of losing someone or something


Synset('loss.n.04')
the disadvantage that results from losing something


Synset('passing.n.02')
euphemistic expressions for death


Synset('menace.n.01')
something that is a source of danger


Synset('threat.n.02')


Synset('threat.n.03')
declaration of an intention or a determination to inflict harm on another


Synset('terror.n.02')
a person who inspires fear or dread


Synset('damage.n.01')
the occurrence of a change for the worse


Synset('damage.n.03')
the act of damaging something or someone


Synset('wrong.n.02')
any harm or injury resulting from a violation of a legal right


Synset('damage.v.01')
inflict damage upon


Synset('damage.v.02')
suffer or be susceptible to damage


Synset('hazard.n.01')
a source of danger; a possibility of incurring loss or misfortune


Synset('risk.n.03')
the probability of becoming i

In [None]:
# Show the word list that generate_wordnet_list returned for the loss seed words
print(loss_list)

['adventure', 'afflict', 'bang_up', 'bell_the_cat', 'bilge', 'blemish', 'break', 'bruise', 'burn', 'capitulation', 'chance', 'come_down', 'commination', 'correct', 'corrode', 'cut_up', 'damage', 'decay', 'declension', 'declination', 'decline', 'decline_in_quality', 'declivity', 'defacement', 'default', 'defloration', 'deflower', 'deformation', 'degenerate', 'departure', 'deprivation', 'deprive', 'descent', 'deteriorate', 'deterioration', 'detriment', 'detumescence', 'devolve', 'diminution', 'dip', 'disfiguration', 'disfigurement', 'distortion', 'disturb', 'disuse', 'downhill', 'downslope', 'drop', 'drop_away', 'drop_off', 'eat', 'eat_away', 'ebb', 'ebbing', 'endangerment', 'epilation', 'erode', 'erosion', 'exit', 'expiration', 'exponential_decay', 'exponential_return', 'fail', 'fall', 'fall_away', 'financial_loss', 'flaw', 'forfeit', 'forfeiture', 'fret', 'frost', 'gamble', 'go_down', 'go_for_broke', 'going', 'harm', 'hazard', 'health_hazard', 'hurt', 'impair', 'impairment', 'impoveris

## Section 2: Generate a semantic vector map with word2vec

In [None]:
from gensim.models.word2vec import Word2Vec
import os
from os import path

## This cell organizes the corpus as a list of sentences, and each sentence as a list of words,
## as input to Word2Vec

def read_corpus_sentences(root_folder):
    """Collect tokenized paragraphs from every corpus text file under ``root_folder``.

    Walks ``root_folder`` and its subfolders, reads each file whose name ends
    with '.txt' and does not contain 'model', prints the file name, and splits
    the text into paragraphs (one per line) and each paragraph into words
    (split on single spaces, with '\\r' characters removed).

    Args:
        root_folder: directory to search recursively for text files.

    Returns:
        A list of sentences, each a non-empty list of non-empty word strings.
        Empty tokens (from repeated spaces) and empty paragraphs are dropped.
    """
    sentences = []

    ## This iterates over the folder and its subfolders looking for txt files
    for folder, _subdirs, file_names in os.walk(root_folder):
        for file_name in file_names:
            # only real .txt files, and never a file whose name contains 'model'
            if not file_name.endswith('.txt') or 'model' in file_name:
                continue
            print(file_name)
            full_name = os.path.join(folder, file_name)

            # the context manager closes the file as soon as it has been read
            with open(full_name, encoding = 'utf-8',errors='ignore') as handle:
                file_text = handle.read()

            ## each line of the file is treated as one paragraph
            for paragraph in file_text.split('\n'):
                ## clean the paragraph further -- get rid of \r (e.g. at the end of the line)
                paragraph = paragraph.replace('\r', '')

                ## drop the empty strings produced by consecutive spaces
                tokens = [token for token in paragraph.split(' ') if token]

                ## blank paragraphs carry no words, so they are not added
                if tokens:
                    sentences.append(tokens)

    return sentences


## Collect preprocessed texts in txt format
root_folder = os.getcwd()
print(root_folder)

## This will be a list of clean sentences
word2vec_input = read_corpus_sentences(root_folder)
            
                #print(word2vec_input)

/content
Vivekh etal-2015-Desalination.txt
Zisisetal-2006-heat.txt
esfahankalteh-2020-Achieving.txt
ismail-2012-energy eff.txt
Sun-2020-water shed water pollution.txt
Lazzarin etal-2013-annual air conditioning.txt
auid etal-2013-Organic Rankine cycles.txt
sampedro etal-2016-Spanish.txt
haghighi and maerefat-2014-Design.txt
fang etal-2017-Experimental.txt
zhang etal-2010-performanc.txt
Maheshwari etal-2009-performance analysis.txt
nygard-2012-Review.txt
hegazy etal-2017-The living building.txt
khoshbazan etal-2018-Thermo economic analy.txt
renato etal-2009-Energetic.txt
Newton etal-2011-nanoparticle.txt
amhadi and assaf-2019-Assessment.txt
Ashouri etal-2015-organic rankine cycle.txt
yu-2018-Theeconomic.txt
zishang-2012-Assessment.txt
salman-2020-Thermal.txt
jiang etal-2010-solar thermosyphon systems.txt
Riffat etal-2013-experimental investigation.txt
shuai-2018-Whatdo.txt
sawant-2011-Performance.txt
teixeira-2010-Temperature.txt
spataru-2010-Domestic energy and occupancy.txt
rao-2013-An

In [None]:
## Here we build the vector space with Word2Vec

# word2vec_input is the list of sentences built above, each sentence being a list of word strings
SentenceCorpus = word2vec_input
# NOTE(review): min_count=1 presumably keeps every token, even those seen only once -- confirm against the gensim docs
word2vec_output = Word2Vec(SentenceCorpus, min_count=1)

In [None]:
## Save vector space

# The file name contains 'model', which is the substring the corpus-loading loop
# checks for to leave a file out of the training corpus.
# NOTE(review): despite the .txt extension this is presumably not a plain-text file -- confirm
word2vec_output.save('Bio_Loss_w2v_model.txt')

## Section 3. Use the vector semantic map to evaluate if the bags of words created in section 1 are ecologically valid

In [None]:
### Function that uses word2vec to look up the 10 most semantically similar words for each seed word in word_list

def get_word2vec_list(word_list,model,topn=10):
  """Look up the most similar words in the trained model for each seed word.

  Args:
    word_list: iterable of seed words to query.
    model: trained model exposing ``model.wv.most_similar``.
    topn: number of similar words requested per seed word (default 10,
      the value that was previously hard-coded).

  Returns:
    A list with one entry per seed word that could be looked up. Each entry
    is a list starting with the seed word itself, followed by the first
    element of every item returned by ``most_similar``. Seed words for which
    the lookup raises ``KeyError`` are skipped.
  """
  list_of_word2vec_lists = []
  for word in word_list:
    try:
      ## here is the crucial line - we are using the model that we trained to get the most similar words within our corpus
      list_vects = model.wv.most_similar([word],topn=topn)
    except KeyError:
      # best effort: a seed word the model cannot look up (presumably one
      # that never occurs in the corpus) is left out of the result
      continue

    # keep the seed word first, then only the word part of each returned item
    list_of_word2vec_lists.append([word] + [item[0] for item in list_vects])

  return list_of_word2vec_lists

In [None]:
## Open the saved vector space (the file written by word2vec_output.save)
model = Word2Vec.load('Bio_Loss_w2v_model.txt')


In [None]:
## For every word in the biodiversity bag of words, get the list of its 10 most similar words from word2vec
list_of_biodiversity_w2v = get_word2vec_list(biodiversity_list,model)

## Print each list next to its position, so that relevant lists can be picked by index afterwards
for index, w2v_list in enumerate(list_of_biodiversity_w2v):
  print(index, w2v_list)

0 ['area', 'area,', 'floor', 'area.', 'length', 'size', 'roof', 'shape', 'elemental', 'wall', 'height']
1 ['arena', 'Interstitial', '(Emission', 'water-intensive', 'NZEB', 'indication', 'Trading)', 'innocent', 'end-of-life', 'oxygen-steam', 'politic,”']
2 ['background', 'fragmentation', 'privacy', 'establishment', 'historical', 'employees', 'language', 'Omani', 'support.', 'object', 'SLR']
3 ['biodiversity', 'parks,', '140,', 'flood', 'pesticide', 'Japan).', 'extinguishing,', '150,', 'facilities,', 'contracts,', 'Mousazadeh']
4 ['born', 'EP/N010779/1,', 'Studio', 'Grave', 'Security,', 'Optics.', 'visions', 'N2ase', 'Jonathan', 'Tropics', 'Exposure']
5 ['circumstance', 'true', 'seemingly', 'efficient;', 'comparatively', 'perhaps', 'educated', 'lacking', 'valid.', 'draft”', 'indeed']
6 ['conservancy', 'Extractive', '5–Cooling', 'Milli-QTM', 'scarcity.', 'electrolysis;', 'reticulation,', 'drowning,', 'Chilled', '2004;4:41–', 'heater/chiller']
7 ['conservation', 'kinetic', 'embodied', 'con

In [None]:
## From the word2vec outputs, keep the lists whose clouds of meaning seem coherent with biodiversity
relevant_biodiversity_w2v_words = [
    list_of_biodiversity_w2v[position] for position in (0, 1, 2, 3, 4, 5, 7)
]

## Flatten the kept lists into one final bag of words
biodiversity_BoW = [
    word
    for neighbour_list in relevant_biodiversity_w2v_words
    for word in neighbour_list
]

print(biodiversity_BoW)

['area', 'area,', 'floor', 'area.', 'length', 'size', 'roof', 'shape', 'elemental', 'wall', 'height', 'arena', 'Interstitial', '(Emission', 'water-intensive', 'NZEB', 'indication', 'Trading)', 'innocent', 'end-of-life', 'oxygen-steam', 'politic,”', 'background', 'fragmentation', 'privacy', 'establishment', 'historical', 'employees', 'language', 'Omani', 'support.', 'object', 'SLR', 'biodiversity', 'parks,', '140,', 'flood', 'pesticide', 'Japan).', 'extinguishing,', '150,', 'facilities,', 'contracts,', 'Mousazadeh', 'born', 'EP/N010779/1,', 'Studio', 'Grave', 'Security,', 'Optics.', 'visions', 'N2ase', 'Jonathan', 'Tropics', 'Exposure', 'circumstance', 'true', 'seemingly', 'efficient;', 'comparatively', 'perhaps', 'educated', 'lacking', 'valid.', 'draft”', 'indeed', 'conservation', 'kinetic', 'embodied', 'conservation.', 'balance', 'storage—selection,', 'balances', 'conservation,', 'Net-zero', 'audit', 'undersupply).']


In [None]:
## For every word in the loss bag of words, get the list of its 10 most similar words from word2vec
list_of_loss_w2v = get_word2vec_list(loss_list,model)

## Print each list next to its position, so that relevant lists can be picked by index afterwards
for index, w2v_list in enumerate(list_of_loss_w2v):
  print(index, w2v_list)

0 ['break', 'root', 'þ208C', 'Sultan', 'stop', 'ml,', 'disorder.', 'online', '0.056', '908)', 'readers']
1 ['burn', 'NG', 'psc,', 'Easy', 'multimeter', 'ligands', 'CO2/kg', 'acetylene', 'transnational', 'grease', 'Adaption']
2 ['chance', 'necessity', 'stop', 'push', 'donor', 'lose', 'anything', 'disorder', 'population,', 'U.S.,', 'expand']
3 ['correct', 'accept', 'information,', 'thinking?', 'RL', 'agent', 'construct', 'choose', 'press', 'discuss', 'acknowledging']
4 ['corrode', 'unarguably', 'differ.', 'openair', 'away.”', 'consolidate', 'viruses,', 'Acetogenesis', 'ecosystems', 'supplant', 'be,']
5 ['damage', 'contributing', 'risks', 'degradation', 'prevent', 'substantially', 'harm', 'allergens', 'commitment', 'face', 'UHI']
6 ['decay', 'stratosphere', 'vapors', 'mining', 'blinds', 'ions', 'transitions', 'itself,', 'aircraft', 'gob', 'contaminants']
7 ['declination', 'altitude', 'distillation—', 'simulator', 'δis', 'cooker', '5840', 'location;', 'reflectance,', 'tracking;', 'lab,']
8

In [None]:
## From the word2vec outputs, keep the lists whose clouds of meaning seem coherent with loss.
## The positions kept in this example are the ones listed in the tuple below.
relevant_loss_w2v_words = [
    list_of_loss_w2v[position]
    for position in (
        0, 1, 4, 5, 6, 7, 8, 10, 11, 12,
        13, 14, 15, 16, 17, 18, 20, 21, 23, 27,
        28, 29, 30, 33, 34, 35, 36, 37, 38, 39,
        43, 44, 47, 56, 57, 62, 63, 64, 65, 66,
    )
]

## Flatten the kept lists into one final bag of words
loss_BoW = [
    word
    for neighbour_list in relevant_loss_w2v_words
    for word in neighbour_list
]

print(loss_BoW)

['break', 'root', 'þ208C', 'Sultan', 'stop', 'ml,', 'disorder.', 'online', '0.056', '908)', 'readers', 'burn', 'NG', 'psc,', 'Easy', 'multimeter', 'ligands', 'CO2/kg', 'acetylene', 'transnational', 'grease', 'Adaption', 'corrode', 'unarguably', 'differ.', 'openair', 'away.”', 'consolidate', 'viruses,', 'Acetogenesis', 'ecosystems', 'supplant', 'be,', 'damage', 'contributing', 'risks', 'degradation', 'prevent', 'substantially', 'harm', 'allergens', 'commitment', 'face', 'UHI', 'decay', 'stratosphere', 'vapors', 'mining', 'blinds', 'ions', 'transitions', 'itself,', 'aircraft', 'gob', 'contaminants', 'declination', 'altitude', 'distillation—', 'simulator', 'δis', 'cooker', '5840', 'location;', 'reflectance,', 'tracking;', 'lab,', 'decline', 'proximity', 'diverter', 'Gulf.', 'confidence', 'alteration', '23%', 'shift', 'root', 'subsequent', '8%', 'deformation', 'loading', 'hydraulic', 'rock', 'lateral', 'static', 'path', 'blade', 'porosity', 'compressive', 'displacement', 'degenerate', 'old

In [None]:
## A hypothetical bag of words was thus obtained for each concept, which can be used for frequency analyses
## See next script

print('biodiversity',biodiversity_BoW)
print('\n')
print('loss',loss_BoW)

biodiversity ['area', 'area,', 'floor', 'area.', 'length', 'size', 'roof', 'shape', 'elemental', 'wall', 'height', 'arena', 'Interstitial', '(Emission', 'water-intensive', 'NZEB', 'indication', 'Trading)', 'innocent', 'end-of-life', 'oxygen-steam', 'politic,”', 'background', 'fragmentation', 'privacy', 'establishment', 'historical', 'employees', 'language', 'Omani', 'support.', 'object', 'SLR', 'biodiversity', 'parks,', '140,', 'flood', 'pesticide', 'Japan).', 'extinguishing,', '150,', 'facilities,', 'contracts,', 'Mousazadeh', 'born', 'EP/N010779/1,', 'Studio', 'Grave', 'Security,', 'Optics.', 'visions', 'N2ase', 'Jonathan', 'Tropics', 'Exposure', 'circumstance', 'true', 'seemingly', 'efficient;', 'comparatively', 'perhaps', 'educated', 'lacking', 'valid.', 'draft”', 'indeed', 'conservation', 'kinetic', 'embodied', 'conservation.', 'balance', 'storage—selection,', 'balances', 'conservation,', 'Net-zero', 'audit', 'undersupply).']


loss ['break', 'root', 'þ208C', 'Sultan', 'stop', 'ml