# In this script we will learn how to extract bags-of-words to perform diachronic analysis of historical texts.

## Section 1
### First we use word lists from psychometric tools and obtain synonyms and hyponyms using WordNet

In [None]:
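## The first time you run this notebook, keep the `!pip install nltk` line (line 3 of this cell) active
## and uncomment the `nltk.download('all')` line (line 5) if you want the full set of NLTK data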

!pip install nltk
import nltk
#nltk.download('all')
from nltk.corpus import wordnet as wn

# Here we can change the language(s) of analysis; each entry is passed to
# Synset.lemmas() inside generate_wordnet_list.
# NOTE(review): languages other than 'eng' presumably need additional multilingual
# WordNet data to be downloaded as well -- confirm before adding codes here.
languages = ['eng']
# Fetch the WordNet corpus that `wn` reads from.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [None]:
### Function to Generate synonyms and hyponyms using Wordnet

def generate_wordnet_list(word_base,languages,exclude_list,all_languages=False):
  """Expand seed words into a sorted list of WordNet synonyms and hyponyms.

  For every seed word, each noun/verb/adjective synset whose name is not in
  `exclude_list` is printed (synset, definition, blank line) so that it can be
  reviewed, and the lemma names of that synset and of its direct hyponyms are
  collected.

  Parameters
  ----------
  word_base : iterable of str
      Seed words. Multi-word seeds may be written with spaces
      ('climate change'); they are looked up with underscores, which is how
      WordNet lemma names spell collocations (e.g. 'air_current').
  languages : iterable of str
      Language codes handed to `Synset.lemmas()`. The synset lookup itself is
      always done on the seed word as given.
  exclude_list : container of str
      Synset names (e.g. 'weather.v.03') to leave out.
  all_languages : bool, optional
      False (default) returns only the word list of the first language, as the
      function always did. True returns one sorted list per language, in the
      order of `languages`.

  Returns
  -------
  list
      Sorted, duplicate-free lemma names for the first language (an empty list
      when `languages` is empty), or a list of such lists when
      `all_languages` is True.
  """
  word_lists = []

  # iterate over languages
  for language in languages:
    # a set gives constant-time duplicate checks; it is sorted once at the end
    collected = set()

    # iterate over words in the word_base list
    for word in word_base:
      # 'climate change' -> 'climate_change'; a seed containing spaces would
      # otherwise silently match nothing
      lookup = '_'.join(word.split())

      # wn.NOUN+wn.VERB+wn.ADJ concatenates the single-letter POS tags
      for meaning in wn.synsets(lookup, pos=wn.NOUN+wn.VERB+wn.ADJ):
        if meaning.name() in exclude_list:
          continue

        # show the sense so that irrelevant ones can be added to exclude_list
        print(meaning)
        print(meaning.definition())
        print('\n')

        # synonyms of this sense
        collected.update(synonym.name() for synonym in meaning.lemmas(language))

        # synonyms of each direct hyponym of this sense
        for hyponym in meaning.hyponyms():
          collected.update(synonym.name() for synonym in hyponym.lemmas(language))

    # one alphabetically sorted list per language
    word_lists.append(sorted(collected))

  if all_languages:
    return word_lists

  # historical behaviour: only the first language's list is returned
  return word_lists[0] if word_lists else []

In [None]:
## Seed words for the target concept measured in this notebook: climate.
## NOTE(review): the reference below is about a prosociality scale and appears to be
## carried over from another notebook -- confirm the actual source of these seed words.

# R. Baumsteiger, J. T. Siegel, Measuring prosociality: The development of a prosocial
# behavioral intentions scale. J. Pers. Assess. 101, 305–314 (2019).

## Synset names listed here are skipped by generate_wordnet_list. Use it to drop
## meanings that are irrelevant for the concept we want to measure, then re-run the
## cell until every synset displayed below has a relevant meaning.
exclude_list_climate = ['weather.v.03']

word_base_climate = [
    'Drought',
    'climate change',
    'unpredictable',
    'typhoon',
    'landslide',
    'flood',
    'Temperature',
    'pests',
    'Drought tolerant',
    'fertilizer',
    'Unpredictable',
    'weather',
    'labor',
    'adaptation',
]

## Expand the seed words into their WordNet synonyms and hyponyms
climate_list = generate_wordnet_list(
    word_base_climate,
    languages,
    exclude_list_climate,
)


Synset('drought.n.01')
a shortage of rainfall


Synset('drought.n.02')
a prolonged shortage


Synset('unpredictable.a.01')
not capable of being foretold


Synset('unpredictable.s.02')
unknown in advance


Synset('irregular.s.02')
not occurring at expected times


Synset('typhoon.n.01')
a tropical cyclone occurring in the western Pacific or Indian oceans


Synset('landslide.n.01')
an overwhelming electoral victory


Synset('landslide.n.02')
a slide of a large mass of dirt and rock down a mountain or cliff


Synset('flood.n.01')
the rising of a body of water and its overflowing onto normally dry land


Synset('flood.n.02')
an overwhelming number or amount


Synset('flood.n.03')
light that is a source of artificial illumination having a broad beam; used in photography


Synset('flood.n.04')
a large flow


Synset('flood.n.05')
the act of flooding; filling to overflowing


Synset('flood_tide.n.02')
the occurrence of incoming water (between a low tide and the following high tide)


Synset('d

In [None]:
## This is a very broad list which includes all synonyms and hyponyms that WordNet
## returned for the seed words we chose for the climate concept

## As one can verify, this list needs pruning:
## some of it by excluding irrelevant synsets a priori (cell above), 
## but some of it can also be done manually a posteriori

print(climate_list)

['British_Labour_Party', 'Curie_point', 'Curie_temperature', 'Department_of_Labor', 'DoL', 'I.W.W.', 'IWW', 'Industrial_Workers_of_the_World', 'Labor', 'Labor_Department', 'Labour', 'Labour_Party', 'Noachian_deluge', "Noah's_flood", 'Noah_and_the_Flood', 'absolute_temperature', 'absolute_zero', 'acclimation', 'acclimatisation', 'acclimatization', 'adaptation', 'adaption', 'adjustment', 'adventure', 'air_current', 'alluvion', 'assignment', 'atmosphere', 'atmospheric_condition', 'atmospheric_state', 'baby', 'bad_weather', 'blighter', 'blood_heat', 'body_temperature', 'boil', 'boiling_point', 'brave', 'brave_out', 'breeze', 'bubonic_plague', "child's_play", 'childbed', 'cinch', 'cold', 'cold_weather', 'coldness', 'comfort_zone', 'conditions', 'confinement', 'corvee', 'current_of_air', 'cuss', 'dangerous_undertaking', 'dark_adaptation', 'debacle', 'dedifferentiation', 'deluge', 'dew_point', 'differentiation', 'dig', 'domestication', 'donkeywork', 'downfall', 'drench', 'drive', 'drought', '

In [None]:
## The same process is repeated for the second concept of this notebook: conflict.
## NOTE(review): the reference below is about measuring authoritarianism and appears to be
## carried over from another notebook -- confirm the actual source of these seed words.

#T. Toharudin, J. H. L. Oud, J. B. Billiet, H. Folmer, “Measuring authoritarianism with
#different sets of items in a longitudinal study” in Methods, Theories, and Empirical
#Applications in the Social Sciences, S. Salzborn, E. Davidov, J. Reinecke, Eds. (VS Verlag
#für Sozialwissenschaften, 2012), pp. 193–201.

word_base_conflict = [
    'conflict',
    'risk',
    'competition',
    'volatility',
    'scarcity',
    'demanding',
    'waste',
    'critical',
]

## No synsets are excluded yet; add synset names here to drop irrelevant meanings
exclude_list_conflict = []

conflict_list = generate_wordnet_list(
    word_base_conflict,
    languages,
    exclude_list_conflict,
)


Synset('conflict.n.01')
an open clash between two opposing groups (or individuals)


Synset('conflict.n.02')
opposition between two simultaneous but incompatible feelings


Synset('battle.n.01')
a hostile meeting of opposing military forces in the course of a war


Synset('conflict.n.04')
a state of opposition between persons or ideas or interests


Synset('conflict.n.05')
an incompatibility of dates or events


Synset('conflict.n.06')
opposition in a work of drama or fiction between characters or forces (especially an opposition that motivates the development of the plot)


Synset('dispute.n.01')
a disagreement or argument about something important


Synset('conflict.v.01')
be in conflict


Synset('conflict.v.02')
go against, as of rules and laws


Synset('hazard.n.01')
a source of danger; a possibility of incurring loss or misfortune


Synset('risk.n.02')
a venture undertaken without regard to possible loss or injury


Synset('risk.n.03')
the probability of becoming infected given th

In [None]:
## Show the full conflict word list so that it can be inspected and pruned
print(conflict_list)

['Armageddon', 'adventure', 'arguing', 'argument', 'armed_combat', 'ask', 'assault', 'athletic_competition', 'athletic_contest', 'athletics', 'barren', 'battle', 'bell_the_cat', 'blow', 'body_waste', 'boiling_point', 'boondoggle', 'bout', 'burn', 'call', 'call_for', 'call_in', 'challenger', 'champ', 'champion', 'championship', 'chance', 'chicken', 'claim', 'clamor', 'clamour', 'clash', 'class_struggle', 'class_war', 'class_warfare', 'cliffhanger', 'collide', 'collision', 'combat', 'comer', 'command', 'compel', 'competition', 'competitor', 'conflict', 'consume', 'contender', 'contention', 'contest', 'contestation', 'contravene', 'controversy', 'cost', 'counterinsurgency', 'crapshoot', 'critical', 'crud', 'cry_for', 'cry_out_for', 'danger', 'dearth', 'decisive', 'demand', 'demanding', 'desolate', 'devastate', 'difference', 'difference_of_opinion', 'disagreement', 'disceptation', 'disputation', 'dispute', 'dissension', 'dissipate', 'dissipation', 'dissonance', 'do_in', 'dogfight', 'draw',

## Section 2: Generate a semantic vector map with word2vec

In [None]:
from gensim.models.word2vec import Word2Vec
import os
from os import path

## This cell organizes the corpus as a list of sentences, and each sentence as a list of words,
## which is the input given to Word2Vec further down

def collect_sentences(folder):
    """Read every .txt file under `folder` and return its lines as token lists.

    Files whose name contains 'model' are skipped, so that the vector space saved
    by this notebook under a .txt name is not read back in as corpus text.
    Each file is decoded as UTF-8 and split into lines; every non-empty line
    becomes one list of whitespace-separated tokens. The name of each file read
    is printed.

    Parameters
    ----------
    folder : str
        Root directory; sub-folders are searched as well.

    Returns
    -------
    list of list of str
        One token list per non-blank line, files visited in sorted order.
    """
    sentences = []

    # `dirpath` rather than `path`, so that the `path` imported from os is not overwritten
    for dirpath, subdirs, files in os.walk(folder):
        # sorting in place makes os.walk descend in a stable order
        subdirs.sort()

        for file in sorted(files):
            # real extension test: '.txt' anywhere in the name is not enough
            if not file.endswith('.txt') or 'model' in file:
                continue
            print(file)

            # the with-block closes the file handle once the text is read
            with open(os.path.join(dirpath, file), encoding='utf-8') as handle:
                file_text = handle.read()

            ## each line is treated as one paragraph / sentence
            for paragraph in file_text.split('\n'):
                ## drop \r and split on any run of whitespace, so that
                ## no empty-string tokens are produced
                tokens = paragraph.replace('\r', '').split()

                ## blank lines carry no words and are left out
                if tokens:
                    sentences.append(tokens)

    return sentences


## Collect preprocessed texts in txt format
root_folder = os.getcwd()
print(root_folder)

## This will be a list of clean sentences
word2vec_input = collect_sentences(root_folder)

/content
heatwaves-2021.txt
Energy use in hotels-2021.txt
changeimpact-2016.txt
energy optimization-2021.txt
climaterisk-2021.txt
Carbon management-2021.txt
Fault Reactivation-2021.txt
neocorporatism-2021.txt
Herders’ Conflict-2021.txt
building thermal system-2021.txt
dead water effect-2021.txt
frequency tracking-2021.txt
exploring-2021.txt
luminescent coupling effect-2021.txt
corporateresponse-2021.txt
governance-2016.txt
multimodal transportation-2021.txt
Climate Change-2018.txt
Coevolutionary game-2021.txt
gamechanger-2021.txt
Decision-2021.txt


In [None]:
## Here we build the vector space with Word2Vec
## min_count=1 keeps every token, including those that occur only once in the corpus.
## The neighbour lists selected by index later in this notebook depend on the trained vectors,
## so training is made repeatable: a fixed seed and a single worker thread
## (with several workers the order in which threads update the vectors varies between runs).
## NOTE(review): repeatability across separate interpreter launches presumably also needs
## PYTHONHASHSEED to be fixed -- confirm against the gensim documentation.

SentenceCorpus = word2vec_input
word2vec_output = Word2Vec(SentenceCorpus, min_count=1, seed=1, workers=1)

In [None]:
## Save vector space
## The file name contains 'model'; the corpus-collection cell in Section 2 skips such
## files, so re-running the notebook does not read the saved model back in as text.
## NOTE(review): despite the .txt extension this is presumably gensim's own serialized
## format rather than plain text -- confirm.

word2vec_output.save('w2v_modelclimate.txt')

## Section 3. Use the vector semantic map to evaluate if the bags of words created in section 1 are ecologically valid

In [None]:
### Function that uses word2vec to look up the 10 most semantically similar words for each seed word in word_list

def get_word2vec_list(word_list,model,topn=10):
  """Return, for each word known to the model, the word followed by its nearest neighbours.

  Parameters
  ----------
  word_list : iterable of str
      Words to look up.
  model : object
      Trained model exposing `model.wv.most_similar(positive, topn=...)`, which
      yields items whose first element is the neighbouring word.
  topn : int, optional
      Number of neighbours requested per word (default 10).

  Returns
  -------
  list of list of str
      One list per word found: `[word, neighbour_1, ..., neighbour_n]`, in the
      order of `word_list`. Words for which the lookup raises KeyError are
      skipped.
  """
  list_of_word2vec_lists = []

  for word in word_list:
    try:
      ## here is the crucial line - we are using the model that we trained to get the most similar words within our corpus
      list_vects = model.wv.most_similar([word],topn=topn)
    except KeyError:
      ## the lookup failed for this word: leave it out and move on
      continue

    ## keep only the neighbouring word (first element of each item)
    list_of_word2vec_lists.append([word] + [item[0] for item in list_vects])

  return list_of_word2vec_lists

In [None]:
## Open the vector space stored in 'w2v_modelclimate.txt' (the file name used by the save cell above)
## NOTE(review): the original comment described this as the vector space "for english in the
## early modern period" -- confirm which corpus is intended.
model = Word2Vec.load('w2v_modelclimate.txt')


In [None]:
## Get the word2vec list of the 10 most similar words for each word of the climate word list
## (words that the model does not know are left out by get_word2vec_list)

list_of_climate_w2v = get_word2vec_list(climate_list,model)

## Print every list with its position; the positions are what the next cell selects by
for index, w2v_list in enumerate(list_of_climate_w2v):
  print(index, w2v_list)

0 ['Labor', 'Gökçe', 'III', '[2]', 'C.,', 'S,', 'Ngaoundere,', 'Nauels,', 'BRAZIL’S', 'Default', 'Bragança,']
1 ['Labour', 'Water', 'Social', 'Analysis', 'Assessment', 'Urban', 'Information', 'Public', 'Review', 'Food', 'Center']
2 ['adaptation', 'strategies', 'response', 'technology', 'field', 'getting', 'impact', 'risks', 'adapt', 'effects', 'mitigation']
3 ['adjustment', 'BIM', 'level', 'local', 'improving', 'water', 'effect', 'rate', 'current', 'each', 'Revit']
4 ['assignment', 'Hogan,', 'Dani,', 'Anthony', 'MA,', 'S251–64.', 'STANDARD', 'inquiries,', 'Oper', 'Hangzhou', 'vs']
5 ['atmosphere', 'relationship', 'perceived', 'hand,', 'system', 'field', 'significant', 'planning', 'pastoralists', 'line', 'estimated']
6 ['cold', 'uses', 'plan', 'Under', 'scientific', 'office', 'authors', 'example,', 'substantive', 'months', 'makes']
7 ['conditions', 'traditional', 'many', 'power', 'through', 'scenarios', 'percent', 'lower', 'functions', 'reservoir', 'cost']
8 ['drive', 'each', 'system', 

In [None]:
## Choose, from the word2vec outputs printed above, the lists whose clouds of meaning are
## coherent with the climate concept. The numbers are positions in list_of_climate_w2v and
## must be re-checked whenever the seed words, the corpus or the model change.
climate_indexes = [2,3,4,5,7,11,12,13,14,18,19,30,32,34,35,38,9]

## dict.fromkeys drops any index that is listed more than once while keeping the order,
## so that the same neighbour list cannot enter the bag of words twice
relevant_climate_w2v_words = [list_of_climate_w2v[i] for i in dict.fromkeys(climate_indexes)]


## Add all the words into one final bag of words
## NOTE(review): a word can still occur more than once when it appears in several
## selected lists -- confirm whether the frequency analysis expects unique words.
climate_BoW = [item for sublist in relevant_climate_w2v_words for item in sublist]

print(climate_BoW)

['adaptation', 'strategies', 'response', 'technology', 'field', 'getting', 'impact', 'risks', 'adapt', 'effects', 'mitigation', 'adjustment', 'BIM', 'level', 'local', 'improving', 'water', 'effect', 'rate', 'current', 'each', 'Revit', 'assignment', 'Hogan,', 'Dani,', 'Anthony', 'MA,', 'S251–64.', 'STANDARD', 'inquiries,', 'Oper', 'Hangzhou', 'vs', 'atmosphere', 'relationship', 'perceived', 'hand,', 'system', 'field', 'significant', 'planning', 'pastoralists', 'line', 'estimated', 'conditions', 'traditional', 'many', 'power', 'through', 'scenarios', 'percent', 'lower', 'functions', 'reservoir', 'cost', 'elements', 'included', 'greater', 'association', 'tool', 'correlation', 'objective', 'retailer', 'case', 'major', 'consumption', 'enterprise', 'main', 'suppliers,', 'average', 'coordination', 'retailers,', 'kinds', 'motor', 'aid', 'profit', 'optimal', 'fertilizer', '2011).', '2010).', '2008).', 'Integrating', '[7]', '2014;', '2013).', 'Fan', '2019).', '2016;', 'flood', 'China', 'African'

In [None]:
## Get the word2vec list of the 10 most similar words for each word of the conflict word list
## (words that the model does not know are left out by get_word2vec_list)

list_of_conflict_w2v = get_word2vec_list(conflict_list,model)

## Print every list with its position; the positions are what the next cell selects by
for index, w2v_list in enumerate(list_of_conflict_w2v):
  print(index, w2v_list)

0 ['argument', 'design-driven', 'severely', 'mapped', 'overlay', 'A),', 'assumed', 'roads', 'ultimately', 'tracking.', 'accessible']
1 ['ask', 'metal.', 'do.', 'stay,', 'contrasting', 'protected.', 'phenomenon,', 'believed', 'omissions', '“locally', 'improved,']
2 ['battle', 'analysed', 'China,', 'transformation', 'cities.', 'context', 'building', 'procedure', 'in', 'study', 'reference']
3 ['burn', 'susceptibility', 'threaten', 'awakening', 'Δeϕ', 'Holocene,', 'normally', 'Azzu,', 'plain', 'concave', 'deliberative']
4 ['call', 'been', 'obtain', 'necessary', 'should', 'incorporated', 'likely', 'UDA,', 'may', 'It', 'enough']
5 ['chance', 'process', 'respondents', 'cooperation', 'definition', 'case', 'production.', 'applicable', 'new', 'chain', 'regime']
6 ['claim', 'horizontal', 'consumer', 'signal', 'leakage', 'farm', 'especially', 'over', 'When', 'progressive', 'Korean']
7 ['collide', 'editors,', 'Hub', 'compressor.', 'Hybrid', 'Minimum', 'governments,', 'Practices', 'derivative', 'dis

In [None]:
## Pick, from the word2vec outputs printed above, the lists whose clouds of meaning are
## coherent with the conflict concept. The numbers are positions in list_of_conflict_w2v.
conflict_indexes = [0,3,5,6,8,10,15,16,17,18,20,21,23,25,26,28,30,31,36,37,38,39,40,41,43,44,48,52,53,54,56,58,64]
relevant_conflict_w2v_words = [list_of_conflict_w2v[i] for i in conflict_indexes]


## Flatten the selected lists into one final bag of words
conflict_BoW = []
for sublist in relevant_conflict_w2v_words:
  conflict_BoW.extend(sublist)

print(conflict_BoW)

['argument', 'design-driven', 'severely', 'mapped', 'overlay', 'A),', 'assumed', 'roads', 'ultimately', 'tracking.', 'accessible', 'burn', 'susceptibility', 'threaten', 'awakening', 'Δeϕ', 'Holocene,', 'normally', 'Azzu,', 'plain', 'concave', 'deliberative', 'chance', 'process', 'respondents', 'cooperation', 'definition', 'case', 'production.', 'applicable', 'new', 'chain', 'regime', 'claim', 'horizontal', 'consumer', 'signal', 'leakage', 'farm', 'especially', 'over', 'When', 'progressive', 'Korean', 'collision', 'time,', 'determined', 'shows', 'level,', 'generated', 'same', 'interface', 'selecting', 'if', 'adopting', 'competition', 'water', 'was', 'changes', 'each', 'developed', 'buildings', 'risk', 'policy', 'internet', 'estimated', 'cost', 'governments', 'public', 'social', 'increasing', 'traditional', 'environment', 'material', 'consumers', 'when', 'many', 'critical', 'flow', 'With', 'studies', 'practice', 'no', 'designed', 'However,', 'requires', 'showed', 'is,', 'danger', 'deriva

In [None]:
## Two hypothetical bags of words have now been obtained (climate and conflict);
## they can be used for frequency analyses -- see next script

## Banner lines that make each bag easy to find in the output
climate_banner = 'climate**************************************************************************************'
conflict_banner = 'conflict######################################################################################'

print(climate_banner, '\n', climate_BoW)
print('\n')
print(conflict_banner, '\n', conflict_BoW)

climate************************************************************************************** 
 ['adaptation', 'strategies', 'response', 'technology', 'field', 'getting', 'impact', 'risks', 'adapt', 'effects', 'mitigation', 'adjustment', 'BIM', 'level', 'local', 'improving', 'water', 'effect', 'rate', 'current', 'each', 'Revit', 'assignment', 'Hogan,', 'Dani,', 'Anthony', 'MA,', 'S251–64.', 'STANDARD', 'inquiries,', 'Oper', 'Hangzhou', 'vs', 'atmosphere', 'relationship', 'perceived', 'hand,', 'system', 'field', 'significant', 'planning', 'pastoralists', 'line', 'estimated', 'conditions', 'traditional', 'many', 'power', 'through', 'scenarios', 'percent', 'lower', 'functions', 'reservoir', 'cost', 'elements', 'included', 'greater', 'association', 'tool', 'correlation', 'objective', 'retailer', 'case', 'major', 'consumption', 'enterprise', 'main', 'suppliers,', 'average', 'coordination', 'retailers,', 'kinds', 'motor', 'aid', 'profit', 'optimal', 'fertilizer', '2011).', '2010).', '2008).'