# In this script we will learn how to extract bags-of-words to perform diachronic analysis of historical texts.

## Section 1
### First we use word lists from psychometric tools and obtain synonyms and hyponyms using WordNet

In [None]:
## The first time you use this script, uncomment the `!pip install nltk` and `nltk.download('all')` lines below

#!pip install nltk
import nltk
#nltk.download('all')
from nltk.corpus import wordnet as wn

# Languages of analysis; each entry is passed to WordNet's lemmas() call in
# generate_wordnet_list. Edit this list to change the language of analysis.
languages = ['eng']


In [None]:
### Function to Generate synonyms and hyponyms using Wordnet

def generate_wordnet_list(word_base,languages,exclude_list):
  """Expand seed words into a sorted list of WordNet synonyms and hyponyms.

  Every noun/verb/adjective synset of every seed word is looked up. Synsets
  whose name is in ``exclude_list`` are skipped; all others are printed with
  their definition, so that irrelevant senses can be spotted and added to the
  exclusion list on the next run. For each kept synset, the lemma names of the
  synset itself and of its direct hyponyms are collected in every requested
  language.

  Args:
    word_base: iterable of seed words to look up in WordNet.
    languages: iterable of language codes passed to ``lemmas()``,
      e.g. ``['eng']``.
    exclude_list: iterable of synset names (e.g. ``'flood.n.03'``) to skip.

  Returns:
    One alphabetically sorted list of unique lemma names covering all
    requested languages. The list is empty when no language is given or
    nothing is found.
  """
  languages = list(languages)
  if not languages:
    return []

  # sets give constant-time membership tests and de-duplicate for free
  excluded = set(exclude_list)
  collected = set()

  for word in word_base:

    ## iterate over the different meanings (synsets) of the seed word
    for meaning in wn.synsets(word, pos=wn.NOUN+wn.VERB+wn.ADJ):
      if meaning.name() in excluded:
        continue

      ## show the sense that is being kept so the user can review it
      print(meaning)
      print(meaning.definition())
      print('\n')

      ## the synset itself plus its direct hyponyms
      related_synsets = [meaning] + meaning.hyponyms()

      for language in languages:
        for synset in related_synsets:
          for lemma in synset.lemmas(language):
            collected.add(lemma.name())

  ## words of all languages together, in alphabetic order
  return sorted(collected)

In [None]:
## Example of seed words related to a target concept - for instance, climate.
## In this case, the seed word list is based on:
# Ishaya, S., & Abaje, I. B. (2008). Indigenous people's perception on climate change and adaptation strategies in Jema'a local government area of Kaduna State, Nigeria. Journal of geography and regional planning, 1(8), 138.
# Ayalew, M. S., Demissie, G. D., Muchie, K. F., Tadesse, S., & Alemu, K. PERCEPTION OF CLIMATE CHANGE AND ASSOCIATED FACTORS AMONG RURAL DWELLERS OF GONDAR ZURIA DISTRICT, NORTHWEST ETHIOPIA.
## Each seed word is listed once: a repeated seed only makes its synsets print twice
word_base_climate = ['Drought','typhoon', 'landslide','flood','Temperature','weather','environment','rainfall','dry','adaptation','season','natural','ecosystems']
## Using the function above, we can obtain the list of synonyms and hyponyms of the seed words.
## We can also exclude word meanings that are irrelevant for the concept we want to measure

## For instance, 'quilt' is not related to climate, and we can add it to the exclusion list
## We run the script until all the words in the list of synsets displayed below have relevant meanings 

## NOTE(review): 'weather.v.01' used to appear twice in this list - confirm whether 'weather.v.02' was meant
exclude_list_climate = ['landslide.n.01','flood.n.03','weather.v.01','weather.v.03','dry.n.01','dry.s.02','dry.a.04','dry.a.05','dry.a.06','dry.a.07','dry.s.08','dry.s.09','dry.s.10','dry.s.12','dry.s.15','dry.s.16',
                       'adaptation.n.01','adaptation.n.03','season.n.03','season.v.01','season.v.02','temper.v.04','natural.n.01','natural.n.02','natural.n.03','natural.a.05','natural.s.06','natural.s.07','natural.s.08','natural.s.09','lifelike.s.02',
                       'dry.s.13','dry.s.14']
climate_list = generate_wordnet_list(word_base_climate,languages,exclude_list_climate)


Synset('drought.n.01')
a shortage of rainfall


Synset('drought.n.02')
a prolonged shortage


Synset('typhoon.n.01')
a tropical cyclone occurring in the western Pacific or Indian oceans


Synset('landslide.n.02')
a slide of a large mass of dirt and rock down a mountain or cliff


Synset('flood.n.01')
the rising of a body of water and its overflowing onto normally dry land


Synset('flood.n.02')
an overwhelming number or amount


Synset('flood.n.04')
a large flow


Synset('flood.n.05')
the act of flooding; filling to overflowing


Synset('flood_tide.n.02')
the occurrence of incoming water (between a low tide and the following high tide);  -Shakespeare


Synset('deluge.v.01')
fill quickly beyond capacity; as with a liquid


Synset('flood.v.02')
cover with liquid, usually water


Synset('flood.v.03')
supply with an excess of


Synset('flood.v.04')
become filled to overflowing


Synset('temperature.n.01')
the degree of hotness or coldness of a body or environment (corresponding to its mole

In [None]:
## This is a very broad list which includes all possible synonyms and hyponyms 
## related to the seed words we chose for climate

## As one can verify, this list needs pruning:
## some of it is done by excluding irrelevant synsets a priori (cell above),
## but some of it can also be done manually a posteriori

print(climate_list)

['Curie_point', 'Curie_temperature', 'Noachian_deluge', "Noah's_flood", 'Noah_and_the_Flood', 'Whitsun', 'Whitsuntide', 'Whitweek', 'absolute_temperature', 'absolute_zero', 'acclimation', 'acclimatisation', 'acclimatization', 'adaptation', 'adaption', 'adjustment', 'air', 'air_current', 'alluvion', 'ambiance', 'ambience', 'area', 'arena', 'atmosphere', 'atmospheric_condition', 'atmospheric_state', 'autumn', 'background', 'bad_weather', 'baseball_season', 'basketball_season', 'blood_heat', 'blow-dry', 'body_temperature', 'boil', 'boiling_point', 'circumstance', 'cloudburst', 'cold', 'cold_weather', 'coldness', 'comfort_zone', 'conditions', 'context', 'current_of_air', 'debacle', 'dedifferentiation', 'dehumidify', 'dehydrate', 'deluge', 'desiccate', 'dew_point', 'differentiation', 'domain', 'domestication', 'downfall', 'downpour', 'drench', 'drip-dry', 'drizzle', 'drought', 'drouth', 'dry', 'dry_out', 'dry_season', 'dry_up', 'ecology', 'ecosystem', 'effusion', 'element', 'elements', 'env

In [None]:
## THREAT
## Seed words and excluded synsets for the concept of threat. The seed word list is based on:

#Ishaya, S., & Abaje, I. B. (2008). Indigenous people's perception on climate change and adaptation strategies in Jema'a local government area of Kaduna State, Nigeria. Journal of geography and regional planning, 1(8), 138.
#Pickson, R. B., & He, G. (2021). Smallholder Farmers’ Perceptions, Adaptation Constraints, and Determinants of Adaptive Capacity to Climate Change in Chengdu. SAGE Open. https://doi.org/10.1177/21582440211032638

## Synsets whose meaning is irrelevant for threat (each name listed once)
exclude_list_threat = ['vulnerable.s.02','decreasing.a.02','poor_people.n.01','hapless.s.01','poor.a.02','poor.a.03','incidence.n.02','annihilating.s.02','switch.v.03','change.v.05','exchange.v.01','transfer.v.06','deepen.v.04','change.v.10','challenge.n.01','challenge.n.02','challenge.n.03','challenge.n.04','challenge.n.05','challenge.v.01','challenge.v.02','challenge.v.03','challenge.v.04','diminish.v.02','waste.n.01','waste.n.02','waste.n.05','waste.v.01','waste.v.02','neutralize.v.04','consume.v.03','godforsaken.s.01','critical.a.01',
                      'critical.a.03','critical.s.04','battle.n.01','gamble.v.01','competitiveness.n.01','fight.n.04','fight.n.05','contend.v.06','pest.n.03','supplanting.n.01','translation.n.07','displacement.n.04','displacement.n.07','loss.n.05','loss.n.06','personnel_casualty.n.01','refuse.v.02','refuse.v.01','decline.v.07','obstruct.v.02','flimsy.s.03','constraint.n.01','constraint.n.03','negative.n.01',
                       'negative.n.02','veto.v.01','negative.a.04','negative.s.06','damaging.s.02','negative.s.08','change.v.06']

## Every seed word must be its own comma-separated string: two adjacent string
## literals without a comma are silently joined into a single (nonexistent) word
word_base_threat = ['unpredictable', 'risk', 'hindrances', 'Lack','vulnerable','decreasing','Poor','incidence','devastating','changing','challenges','repercussions', 'scarcity', 'diminishing', 'waste', 'critical','fight','disaster','exacerbated','severity','adverse','pest','depletion','endanger','displacement','loss','decline','impeded','fragile','inundation','constraints','negative','problem']
threat_list = generate_wordnet_list(word_base_threat,languages,exclude_list_threat)


Synset('unpredictable.a.01')
not capable of being foretold


Synset('unpredictable.s.02')
unknown in advance


Synset('irregular.s.02')
not occurring at expected times


Synset('hazard.n.01')
a source of danger; a possibility of incurring loss or misfortune


Synset('risk.n.02')
a venture undertaken without regard to possible loss or injury


Synset('risk.n.03')
the probability of becoming infected given that exposure to an infectious agent has occurred


Synset('risk.n.04')
the probability of being exposed to an infectious agent


Synset('risk.v.01')
expose to a chance of loss or damage


Synset('gamble.v.01')
take a risk in the hope of a favorable outcome


Synset('hindrance.n.01')
something immaterial that interferes with or delays action or progress


Synset('hindrance.n.02')
any obstruction that impedes or is burdensome


Synset('hindrance.n.03')
the act of hindering or obstructing or impeding


Synset('lack.n.01')
the state of needing something that is absent or unavailable


Syn

In [None]:
## Show the full list of synonyms and hyponyms generated for the threat seed words
print(threat_list)

['Americanise', 'Americanize', 'Armageddon', 'Europeanise', 'Europeanize', 'Frenchify', 'Gordian_knot', 'Islamise', 'Islamize', 'Noachian_deluge', "Noah's_flood", 'Noah_and_the_Flood', 'abate', 'abbreviate', 'abridge', 'absence', 'accelerate', 'accommodate', 'accustom', 'acerbate', 'acetylate', 'acetylise', 'acetylize', 'achromatise', 'achromatize', 'acquire', 'act_of_God', 'action', 'activate', 'adapt', 'add', 'adjust', 'adopt', 'adorn', 'adventure', 'adverse', 'aerate', "affaire_d'honneur", 'affect', 'affray', 'age', 'aggravate', 'agitate', 'air_bag', 'airbrake', 'albatross', 'alchemise', 'alchemize', 'alcoholise', 'alcoholize', 'alien', 'alienate', 'alkalinise', 'alkalinize', 'allegorise', 'allegorize', 'alluvion', 'alter', 'alternate', 'amalgamate', 'ameliorate', 'amend', 'amplitude', 'angulate', 'animalise', 'animalize', 'animate', 'animise', 'animize', 'annihilating', 'annihilative', 'antagonism', 'antiquate', 'antique', 'apocalypse', 'appeal', 'archaise', 'archaize', 'armed_comb

## Section 2: Generate a semantic vector map with word2vec

In [None]:
from gensim.models.word2vec import Word2Vec
import os
from os import path

## This cell organizes the corpus as a list of sentences, and each sentence as a list of words,
## which is the input format expected by Word2Vec

def load_corpus_sentences(root_folder):
    """Collect tokenised lines from every corpus .txt file under root_folder.

    Walks root_folder and its subfolders (in sorted order, so the result is
    reproducible), prints the name of each file it reads, and splits every
    non-blank line on whitespace.

    Files whose name contains 'model' are skipped, so that a saved model such
    as 'w2v_model.txt' is not read back in as corpus text. Note that this also
    skips any corpus document that has 'model' in its file name.

    Args:
        root_folder: path of the folder that holds the preprocessed texts.

    Returns:
        A list of sentences, each sentence being a non-empty list of words.
    """
    sentences = []

    ## This iterates over your path, folders and subfolders looking for txt files
    for folder, subdirs, files in os.walk(root_folder):
        ## sorting in place makes os.walk visit the subfolders in a stable order
        subdirs.sort()

        for file in sorted(files):
            if not file.endswith('.txt') or 'model' in file:
                continue

            print(file)
            name = os.path.join(folder, file)

            ## the with-statement closes the file even if reading fails
            with open(name, encoding = 'utf-8') as text_file:

                ## each line of the file is treated as one paragraph / sentence
                for paragraph in text_file:

                    ## split() without arguments drops \r, \n and repeated spaces,
                    ## so no empty "words" end up in the vocabulary
                    words = paragraph.split()

                    ## blank lines carry no words and are left out
                    if words:
                        sentences.append(words)

    return sentences


## Collect preprocessed texts in txt format
root_folder = os.getcwd()
print(root_folder)

## This will be a list of clean sentences
word2vec_input = load_corpus_sentences(root_folder)

C:\Users\azeez\Downloads\EM3 PROJECT - Copy\Revised Scripts 2to5and7
ABUBAKAR AND COMFORT-2021-corporate response to climate change.txt
Amanda Sabo-2016-impacts and responses knowledge community.txt
Amanda-2016-Water Sensitive Planning Practices.txt
Avil and David-2021-Carbon storage.txt
Benedict-2021-neocorporatism.txt
Djongmo et al-2021-Carbon management.txt
Dunia E. Santiago-2021-Energy use in hotels.txt
Jaeryoung et al-2021-Exploring the feasibility.txt
Liqiang etal-2021-frequency tracking.txt
Michel Gueldry-2018-Climate Change and public opinion.txt
RIA JHOANNA-2021-adaptations toward climate risk challenges.txt
Sikiru et al-2021-FarmerHerders’ Conflict and Climate Change.txt
Suyong et al-2021-Coevolutionary game of manufacturer's abatement behavior.txt
Tao et al-2021-multimodal transportation of medical aid.txt
Tianyi et al-2021-building thermal system.txt
Ufuk etal-2021-Intelligent energy optimization system.txt
Vivien etal-2021-Heat Waves.txt
Wang and Zhang-2021-Decision and co

In [None]:
## Here we build the vector space with Word2Vec

# word2vec_input is the list of tokenised sentences collected from the corpus above
SentenceCorpus = word2vec_input
# min_count=1 keeps every word in the vocabulary, even words that occur only once
word2vec_output = Word2Vec(SentenceCorpus, min_count=1)

In [None]:
## Save vector space so that it can be reloaded later without retraining
## NOTE(review): despite the .txt extension this file is written by the model's own save()
## and is presumably not plain text - the corpus loader skips file names containing 'model'

word2vec_output.save('w2v_model.txt')

## Section 3. Use the vector semantic map to evaluate if the bags of words created in section 1 are ecologically valid

In [None]:
### Function that uses word2vec to look up the 10 most semantically similar words for each seed word in word_list

def get_word2vec_list(word_list,model,topn=10):
  """Return each seed word known to the model followed by its most similar words.

  Args:
    word_list: iterable of seed words to look up.
    model: object exposing ``model.wv.most_similar``, e.g. the Word2Vec model
      trained on the corpus.
    topn: how many most-similar words to retrieve per seed word (default 10).

  Returns:
    A list of lists of the form ``[seed_word, similar_1, ..., similar_topn]``,
    in the order of ``word_list``. Seed words for which the lookup raises
    ``KeyError`` are left out.
  """
  list_of_word2vec_lists = []

  for word in word_list:
    try:
      ## here is the crucial line - we are using the model that we trained to get the most similar words within our corpus
      list_vects = model.wv.most_similar([word], topn=topn)
    except KeyError:
      ## the lookup fails for words the model does not know: skip them
      continue

    ## each item is a (word, similarity) pair; only the word is kept
    list_of_word2vec_lists.append([word] + [item[0] for item in list_vects])

  return list_of_word2vec_lists

In [None]:
## load the vector space that was saved to 'w2v_model.txt' in Section 2
model = Word2Vec.load('w2v_model.txt')


In [None]:
## get the word2vec list of the 10 most similar words for each word of the climate bag of words

list_of_climate_w2v = get_word2vec_list(climate_list,model)

## print every list with its index, so that the relevant ones can be selected by index below
for index, w2v_list in enumerate(list_of_climate_w2v):
  print(index, w2v_list)

0 ['adaptation', 'mitigation', 'strategies', 'impact', 'technology', 'risks', 'issues', 'policies', 'knowledge', 'effects', 'adverse']
1 ['adjustment', 'thermal', 'envelope', 'under', 'first', 'transfer', 'time', 'state', 'types', 'power', 'linear']
2 ['air', 'CC', 'along', 'around', 'rice', 'farming', 'over', 'include', 'specific', 'who', 'fracture']
3 ['area', 'data', 'each', 'process', 'land', 'was', 'risk', 'characteristics', 'developed', 'relevant', 'simulation']
4 ['arena', 'status,', 'Mapfori', 'USSR,', 'Maureen', '(25,', '(Dbh),', 'Society’s', '“BDMET:', 'Feeling', 'Care']
5 ['atmosphere', 'BIM', 'developed', 'water', 'local', 'internet', 'respondents', 'knowledge', 'issues', 'process', 'renewable']
6 ['background', 'aim', 'context', 'building', 'development', 'profits', 'combination', 'energy', 'objective', 'means', 'study']
7 ['cold', 'technical', 'per', 'embedded', 'including', 'relative', 'status', 'conflicts', 'prevailing', 'We', 'do']
8 ['conditions', 'sustainable', 'econ

In [None]:
## choose, from the word2vec outputs, the lists whose clouds of meanings seem coherent with climate
## in this example we choose, for instance (among others), indexes 3,6,9,11,13 and 14
selected_climate_indexes = [3,6,9,11,13,14]

relevant_climate_w2v_words = [list_of_climate_w2v[i] for i in selected_climate_indexes]


## Merge the selected lists into one final bag of words
climate_BoW = []
for selected_list in relevant_climate_w2v_words:
  climate_BoW.extend(selected_list)

print(climate_BoW)

['adjustment', 'some', 'specific', 'decision-making', 'sustainable', 'planning,', 'resources', 'plans', 'above', 'decision', 'models', 'cold', 'significantly', 'sustainable', 'stakeholders', 'lower', 'system.', 'specific', 'resources', 'plans', 'EU', 'software', 'drought', 'combination', 'chain', 'exchange', 'objective', 'profits', 'part', 'application', 'role', 'study.', 'context', 'elements', 'lack', 'profit', 'requirements', 'Delta', 'management', 'amount', 'architecture', 'two', 'service', 'function', 'fertilizer', '2017).', '2011).', '2009).', '(2017),', '2012).', 'Smith', 'regulation.', '(2017)', '[26]', '2019).', 'flood', '(1)', 'solar', 'Carbon', 'etc.', 'group', 'forest', 'sensors,', 'political', 'embedded', 'Their']


In [None]:
## get the word2vec list of the 10 most similar words for each word of the threat bag of words

list_of_threat_w2v = get_word2vec_list(threat_list,model)

## print every list with its index, so that the relevant ones can be selected by index below
for index, w2v_list in enumerate(list_of_threat_w2v):
  print(index, w2v_list)

0 ['argument', 'intended', '(islands', 'proof', 'hot,', 'equilibrium', 'import', 'Contentious', 'ancient', 'otherwise', '(ϕ1']
1 ['ask', 'Saarinen', 'Assis,', 'preserved', '“rise”', '2017)', 'welcome', 'additive', 'suggestions', 'transgression.', 'fits']
2 ['battle', 'that', 'used', 'single', 'increase', 'is', 'necessary', 'decrease', 'comes', 'full', 'play']
3 ['burn', 'plan.', 'k', 'CES', 'stocking', 'Zealand.', 'Lanka,', 'Adeyemi,', 'Seolgye-e', 'Unanticipated', 'Simultaneously,']
4 ['call', 'found', 'enhanced', 'explained', 'achieved', 'implemented', 'equal', 'likely', 'easily', 'calculated', 'Figures']
5 ['chance', 'brief', 'approach.', 'residential', '10.', 'stakeholder', 'low-carbon', 'compare', 'PMBOK', 'operations.', 'design.']
6 ['claim', 'national', 'all', 'strategy', 'integrating', 'rate', 'provided', 'access', 'were', 'mainly', 'temperature']
7 ['collide', '(Agricultural', 'desalination', 'automation', 'forum', 'Last', 'Factors', 'Korea', '(IPCC', 'Korea:', '[3].']
8 ['col

In [None]:
## choose, from the word2vec outputs, the lists whose clouds of meanings seem coherent with threat
## in this example we choose indexes 4,12,15,38 and 48
selected_threat_indexes = [4,12,15,38,48]

relevant_threat_w2v_words = [list_of_threat_w2v[i] for i in selected_threat_indexes]


## Merge the selected lists into one final bag of words
threat_BoW = []
for selected_list in relevant_threat_w2v_words:
  threat_BoW.extend(selected_list)

print(threat_BoW)

['call', 'found', 'enhanced', 'explained', 'achieved', 'implemented', 'equal', 'likely', 'easily', 'calculated', 'Figures', 'consume', 'Oxford', 'York', 'UTCI', '(6):', '[A', 'unrestricted', '26', '2021;', 'Anglia', 'Trinidad', 'cost', 'under', 'between', 'thermal', 'many', 'total', 'main', 'SLR', 'emission', 'value', 'state', 'govern', 'selection', 'savannah', 'example,', 'group', 'formal', 'WSN', 'mean', 'wireless', 'percent', 'seasons', 'rising', 'medical', 'selected', 'past', 'personal', 'year', 'pressure', 'regional', 'particular', 'wave', 'area,']


In [None]:
## A hypothetical bag of words was thus obtained for each concept, which can be used for frequency analyses
## See next script

print('climate',climate_BoW)
# print('\n') emits two line breaks (the '\n' itself plus print's own newline) to separate the two bags
print('\n')
print('threat',threat_BoW)

climate ['adjustment', 'some', 'specific', 'decision-making', 'sustainable', 'planning,', 'resources', 'plans', 'above', 'decision', 'models', 'cold', 'significantly', 'sustainable', 'stakeholders', 'lower', 'system.', 'specific', 'resources', 'plans', 'EU', 'software', 'drought', 'combination', 'chain', 'exchange', 'objective', 'profits', 'part', 'application', 'role', 'study.', 'context', 'elements', 'lack', 'profit', 'requirements', 'Delta', 'management', 'amount', 'architecture', 'two', 'service', 'function', 'fertilizer', '2017).', '2011).', '2009).', '(2017),', '2012).', 'Smith', 'regulation.', '(2017)', '[26]', '2019).', 'flood', '(1)', 'solar', 'Carbon', 'etc.', 'group', 'forest', 'sensors,', 'political', 'embedded', 'Their']


threat ['call', 'found', 'enhanced', 'explained', 'achieved', 'implemented', 'equal', 'likely', 'easily', 'calculated', 'Figures', 'consume', 'Oxford', 'York', 'UTCI', '(6):', '[A', 'unrestricted', '26', '2021;', 'Anglia', 'Trinidad', 'cost', 'under', 'b