In [1]:
import pandas as pd
import re
import json
from tqdm.auto import tqdm
from collections import Counter
import spacy
import nltk
from nltk.corpus import stopwords
from transformers import pipeline

nltk.download('stopwords')
nlp = spacy.load("en_core_web_sm")
stopwords = nltk.corpus.stopwords.words('english')
stopwords = stopwords + ['mr', 'mrs', 'sir', 'ho', 'tion', 'presi', 'ident', 'ing',
                        'corea', 'nation', 'country', 'government', 'man',
                        'time', 'day', 'year', 'part',  'month', 'position', 'point',
                        'event', 'week', 'question', 'news', 'show', 'see',  'understand',
                        'add', 'learn', 'think', 'look', 'get', 'want',
                        'make', 'say', 'take', 'give', 'know', 'ask','answer',
                        'tho','con', 'sia', 'ese', 'rus', 'first', 'last', 'many',
                         'number', 'un', 'ed', 'st', 'go', 'call', 'still', 'well',
                         'also', 'never', 'yet', 'back', 'almost', 'ever', 'just',
                         'thus', 'even', 'later', 'finally', 'way', 'land',
                         'however', 'soon', 'ago', 'cessfully', 'al', 'dr', 'gen', 'li',
                         'oct', 'jim', 'luno', 'nnd', 'jap', 'hay', 'n', 'pfd', 'u25a0',
                         'sec', 'state', 'house']

classifier = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Device set to use cpu


This function allows to clean text from extra punctuation, spaces and numbers.

In [2]:
def clean_text(data):
  texts = []
  for d in data:
    text = d[1].replace('\n', ' ')
    state = d[2]
    cleaned_text = re.sub(r'(.)\1{2,}', '', text)
    cleaned_text = re.sub(r'[^a-zA-Z0-9\s.,\'-]', '', cleaned_text)
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
    texts.append([cleaned_text, state])
  return texts

The following function extracts NERs from previously cleaned sentenses

In [3]:
def extract_ners(texts):
  ner_sents = []
  ner_ents = []

  for text in tqdm(texts):
    sents = re.split('[?.!]', text[0])
    state = text[1]
    for sent in sents:
      doc = nlp(sent)
      if doc.ents:
        entities = []
        for e in doc.ents:
          if e.label_ in ['PERSON', 'GPE', 'ORG', 'LOC', 'NORP']:
            entities.append((e.text, e.label_))
        ner_sents.append([sent, state])
        ner_ents.append(entities)
  return ner_sents, ner_ents

The following function extracts parts of speech from the sentence with corresponding NERs and counts the number of NERs of different types.

In [4]:
def find_dependencies(ner_sents, ner_ents):
  dependencies = {}
  for i in tqdm(range(len(ner_sents))):
    sent = ner_sents[i]
    ents = ner_ents[i]
    if 0 < len(ents) < 50:
      for ent in ents:
          text_ent = ent[0]
          if sent:
            sent_ner = sent[0].lower().strip().replace(text_ent, '')
            text_ent = text_ent.lower()
          if text_ent in dependencies:
            dependencies[text_ent]['counter'] += 1
          else:
            dependencies[text_ent] = {'counter': 1,
                                      'type': ent[1],
                                      'dependencies': {'nouns': [],
                                                      'verbs': [],
                                                      'adjectives': [],
                                                      'adverbs': [],
                                                      'propnouns': []}}
          if sent_ner:
            doc = nlp(sent_ner)
            for d in doc:
              text_d = d.lemma_.lower()
              pos_d = d.pos_
              if pos_d == 'NOUN':
                  dependencies[text_ent]['dependencies']['nouns'].append(text_d)
              if pos_d == 'VERB':
                dependencies[text_ent]['dependencies']['verbs'].append(text_d)
              if pos_d == 'PROPN':
                dependencies[text_ent]['dependencies']['propnouns'].append(text_d)
              if pos_d == 'ADJ':
                dependencies[text_ent]['dependencies']['adjectives'].append(text_d)
              if pos_d == 'ADV':
                dependencies[text_ent]['dependencies']['adverbs'].append(text_d)
  return dependencies

The following function creates a Counter for lists of different types of speech.

In [5]:
def sort_pos(data, word, pos_tag, stopwords):
  pos = Counter(data[pos_tag])
  pos = [(n, pos[n]) for n in pos if pos[n] > 3 and n != word]
  pos = [n for n in pos if n[0] not in stopwords and len(n[0]) > 1]
  pos_final = sorted(pos, key=lambda x: x[1], reverse=True)
  return pos_final

The following function sorts dicts of NERs in descending order and filters the lists of exttracted parts of speech.

In [6]:
def get_count(deps, stopwords):
  dependencies = []
  for d in deps:
    dependencies.append((d, deps[d]['counter'], deps[d]['type'], deps[d]['dependencies']))
  filtered_dependencies_desc = sorted(dependencies, key=lambda x: x[1], reverse=True)

  people = []
  gpe = []
  locations = []
  organisations = []
  norps = []

  for s in filtered_dependencies_desc:
    if s[1] > 20 and s[0] not in stopwords:
      pos_type = s[2]
      nouns_final = sort_pos(s[3], s[0], 'nouns', stopwords)
      verbs_final = sort_pos(s[3], s[0], 'verbs', stopwords)
      propnouns_final = sort_pos(s[3], s[0], 'propnouns', stopwords)
      adjectives_final = sort_pos(s[3], s[0], 'adjectives', stopwords)
      adverbs_final = sort_pos(s[3], s[0], 'adverbs', stopwords)
      if pos_type == 'LOC':
        locations.append((s[0], s[1], nouns_final, verbs_final, propnouns_final,
                          adjectives_final, adverbs_final))
      if pos_type == 'GPE':
        gpe.append((s[0], s[1], nouns_final, verbs_final, propnouns_final,
                          adjectives_final, adverbs_final))
      if pos_type == 'PERSON':
        people.append((s[0], s[1], nouns_final[:20], nouns_final, verbs_final, propnouns_final,
                          adjectives_final, adverbs_final))
      if pos_type == 'ORG':
        organisations.append((s[0], s[1], nouns_final, verbs_final, propnouns_final,
                          adjectives_final, adverbs_final))
      if pos_type == 'NORP':
        norps.append((s[0], s[1], nouns_final, verbs_final, propnouns_final,
                          adjectives_final, adverbs_final))
  return people, gpe, locations, organisations,norps

The following function classifies sentiments of sentences containing occurences of Japan and Russia and counts them.

In [7]:
def get_sentiments(clean_texts):
  japan_stats = {'positive': 0,
                        'negative': 0}
  russia_stats = {'positive': 0,
                        'negative': 0}
  for text in tqdm(clean_texts):
    sents = re.split('[?.!]', text[0])
    for s in sents:
      if len(s) < 500:
        if 'Japan' in s and 'Russia' not in s:
          tag = classifier(s)[0]['label']
          if tag == 'NEGATIVE':
            japan_stats['negative'] += 1
          if tag == 'POSITIVE':
            japan_stats['positive'] += 1
        if 'Russia' in s and 'Japan' not in s:
          tag = classifier(s)[0]['label']
          if tag == 'NEGATIVE':
            russia_stats['negative'] += 1
          if tag == 'POSITIVE':
            russia_stats['positive'] += 1
  return japan_stats, russia_stats

Here is a sample of working code on material of first 50 newspapers, published in january of 1904

In [8]:
with open('sample.json', 'r', encoding='utf-8') as json_file:
    sample = json.load(json_file)

In [9]:
clean_text = clean_text(sample)
ner_sents, ner_ents = extract_ners(clean_text)
sentiments = get_sentiments(clean_text)
dependencies = find_dependencies(ner_sents, ner_ents)
final_data_sample = get_count(dependencies, stopwords)

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/8204 [00:00<?, ?it/s]

In [10]:
japan_sent = sentiments[0]
japan_sent

{'positive': 357, 'negative': 484}

In [11]:
russia_sent = sentiments[1]
russia_sent

{'positive': 152, 'negative': 287}

These are the examples of the final data: word, word frequency, most frequent nouns, verbs, proper nouns, adjectives and adverbs.

In [12]:
result_1 = final_data_sample[1]

In [13]:
word = result_1[0][0]
word

'japan'

In [14]:
count = result_1[0][1]
count

1083

In [15]:
nouns = result_1[0][2][:20]
nouns

[('war', 122),
 ('right', 85),
 ('interest', 64),
 ('ship', 54),
 ('reply', 52),
 ('treaty', 43),
 ('power', 39),
 ('port', 39),
 ('negotiation', 37),
 ('sea', 36),
 ('concession', 30),
 ('proposal', 30),
 ('independence', 28),
 ('army', 28),
 ('commerce', 28),
 ('conflict', 26),
 ('peace', 26),
 ('cruiser', 26),
 ('condition', 25),
 ('troop', 24)]

In [16]:
verbs = result_1[0][3][:20]
verbs

[('recognize', 43),
 ('accept', 32),
 ('send', 31),
 ('receive', 31),
 ('leave', 26),
 ('reach', 25),
 ('force', 24),
 ('regard', 24),
 ('accord', 23),
 ('remain', 22),
 ('follow', 20),
 ('fight', 19),
 ('prevent', 19),
 ('acquire', 19),
 ('begin', 17),
 ('lead', 17),
 ('refuse', 16),
 ('come', 16),
 ('hold', 16),
 ('become', 16)]

In [17]:
propnouns = result_1[0][4][:20]
propnouns

[('russia', 454),
 ('korea', 134),
 ('china', 118),
 ('manchuria', 92),
 ('united', 61),
 ('states', 59),
 ('britain', 35),
 ('japanese', 24),
 ('baron', 22),
 ('churia', 14),
 ('minister', 14),
 ('france', 13),
 ('seoul', 12),
 ('east', 12),
 ('germany', 12),
 ('october', 11),
 ('de', 10),
 ('arthur', 10),
 ('london', 10),
 ('niuchwang', 9)]

In [18]:
adjectives = result_1[0][5][:20]
adjectives

[('russian', 101),
 ('japanese', 94),
 ('great', 75),
 ('commercial', 44),
 ('good', 32),
 ('special', 31),
 ('high', 27),
 ('naval', 25),
 ('little', 21),
 ('new', 20),
 ('willing', 19),
 ('third', 19),
 ('neutral', 18),
 ('foreign', 18),
 ('present', 17),
 ('certain', 17),
 ('territorial', 17),
 ('eastern', 16),
 ('considerable', 16),
 ('definite', 16)]

In [19]:
adverbs = result_1[0][6][:20]
adverbs

[('far', 39),
 ('alone', 15),
 ('immediately', 13),
 ('mutually', 13),
 ('reciprocally', 10),
 ('officially', 10),
 ('practically', 9),
 ('probably', 9),
 ('subsequently', 8),
 ('nearly', 7),
 ('possibly', 7),
 ('long', 7),
 ('fully', 6),
 ('merely', 6),
 ('highly', 6),
 ('symmetrically', 5),
 ('fairly', 5),
 ('rather', 5),
 ('simultaneously', 5),
 ('thereby', 5)]