In [57]:
import pandas as pd
import glob
import warnings
from pprint import pprint
from pathlib import Path
from tqdm.notebook import tqdm

import nltk

# Imports

In [58]:
DATA_DIR = Path("../../data")
MODEL_DIR = Path("../../model")
input_dir = DATA_DIR / "processed" / "nyt-data"

In [59]:
file_paths = glob.glob(str(input_dir / "*.csv"))

In [60]:
file_paths

['../../data/processed/nyt-data/2000_data.csv',
 '../../data/processed/nyt-data/2001_data.csv',
 '../../data/processed/nyt-data/1997_data.csv',
 '../../data/processed/nyt-data/1996_data.csv',
 '../../data/processed/nyt-data/2016_data.csv',
 '../../data/processed/nyt-data/corpus_all.csv',
 '../../data/processed/nyt-data/1990_data.csv',
 '../../data/processed/nyt-data/1991_data.csv',
 '../../data/processed/nyt-data/2010_data.csv',
 '../../data/processed/nyt-data/2011_data.csv',
 '../../data/processed/nyt-data/2007_data.csv',
 '../../data/processed/nyt-data/2006_data.csv',
 '../../data/processed/nyt-data/1994_data.csv',
 '../../data/processed/nyt-data/1995_data.csv',
 '../../data/processed/nyt-data/2014_data.csv',
 '../../data/processed/nyt-data/2015_data.csv',
 '../../data/processed/nyt-data/2009_data.csv',
 '../../data/processed/nyt-data/2008_data.csv',
 '../../data/processed/nyt-data/2003_data.csv',
 '../../data/processed/nyt-data/2002_data.csv',
 '../../data/processed/nyt-data/2004_da

In [61]:
dataframes = []
for file in file_paths:
    df = pd.read_csv(file)
    dataframes.append(df)

In [62]:
# used for training the Cade-Compass
df_all = pd.concat(dataframes, ignore_index=True)

In [63]:
df_all.head()

Unnamed: 0,id,date,text,year
0,4fd232cd8eb7c8105d7c2d56,2000-01-01,island latest pristine cuisine restaurant fash...,2000.0
1,4fd210ee8eb7c8105d785b4c,2000-01-01,moscow dec 31 newborn political party register...,2000.0
2,4fd21fb88eb7c8105d7a2ee3,2000-01-01,washington dec 31 vice president gore denounce...,2000.0
3,4fd233c08eb7c8105d7c43b7,2000-01-01,harris doesnt killers face saw walking street ...,2000.0
4,4fd233c08eb7c8105d7c43af,2000-01-01,tax court judge ruled one nations prominent ta...,2000.0


In [64]:
print(f"Imported {len(dataframes)} files")

for i, df in enumerate(dataframes):
    print(f"Data from Period {i+1}: {df.date.min()} - {df.date.max()}")

Imported 28 files
Data from Period 1: 2000-01-01 - 2000-12-31
Data from Period 2: 2001-01-01 - 2001-12-31
Data from Period 3: 1997-01-01 - 1997-09-10
Data from Period 4: 1996-01-01 - 1996-12-31
Data from Period 5: 2016-01-01 - 2016-06-05
Data from Period 6: 1990-01-01 - 2016-06-05
Data from Period 7: 1990-01-01 - 1990-10-31
Data from Period 8: 1991-01-01 - 1991-12-31
Data from Period 9: 2010-01-01 - 2010-12-23
Data from Period 10: 2011-01-01 - 2011-12-15
Data from Period 11: 2007-01-01 - 2007-12-20
Data from Period 12: 2006-01-01 - 2006-12-22
Data from Period 13: 1994-01-01 - 1994-12-31
Data from Period 14: 1995-01-01 - 1995-12-31
Data from Period 15: 2014-01-01 - 2014-12-31
Data from Period 16: 2015-01-01 - 2015-12-31
Data from Period 17: 2009-01-01 - 2009-12-20
Data from Period 18: 2008-01-01 - 2008-12-21
Data from Period 19: 2003-01-01 - 2003-12-31
Data from Period 20: 2002-01-01 - 2002-12-31
Data from Period 21: 2004-01-01 - 2004-12-31
Data from Period 22: 2005-01-01 - 2005-12-29
D

In [65]:
# convert date back to datetime object and sort them by date
for df in dataframes:
    df['date'] =  pd.to_datetime(df['date'])
    df.sort_values('date', inplace=True)

df_all['date'] =  pd.to_datetime(df_all['date'])
df_all.sort_values('date', inplace=True)

Check if any missing values are in the (processed) text column

In [66]:
df_all["text"].isna().sum()

0

In [67]:
df_all = df_all.dropna(subset=["text"])

# Create Corpora

In [68]:
corpus_all = df_all["text"].values.tolist()

corpora = [df["text"].values.tolist() for df in dataframes]

In [69]:
len(corpus_all)

196720

In [70]:
len(corpora)

28

In [71]:
pprint(corpus_all[0:500])

['third national basketball association season completed knicks 207 record '
 'tops eastern conference league los angeles lakers 216 knicks still best '
 'conference record next 15 games jackson coach east team 40th nba allstar '
 'game feb 11 miami coaches teams best conference records games jan 28 coach '
 'allstar teams saturdays games knicks best record east game half ahead '
 'chicago bulls indiana pacers bulls pacers tied 199 central division lead '
 'detroit pistons atlanta hawks counted jackson bulls phil jackson firstyear '
 'coaches three rookie coaches honor coaching allstar game easy ed st louis '
 'hawks accomplished feat 1959 billy cunningham philadelphia 76ers 1978 pat '
 'riley lakers 1982 knicks streak unlike jacksons none three started season '
 'teams instead replacements dismissed coaches took andy phillips 10 games '
 'cunningham replaced gene 6 games riley succeeded paul 11 games knicks '
 'extended winning streak nine games saturday beating orlando magic ran '
 '

# Create Tokens

In [72]:
tokenizer = nltk.tokenize.TreebankWordTokenizer()

tokens_all = tokenizer.tokenize(" ".join(str(text) for text in tqdm(corpus_all)))

tokens = [tokenizer.tokenize(" ".join(str(text) for text in corpus)) for corpus in tqdm(corpora)]

  0%|          | 0/196720 [00:00<?, ?it/s]

  0%|          | 0/28 [00:00<?, ?it/s]

In [73]:
len(tokens_all)

99443436

# Create Vocabularies

In [74]:
vocabulary_all = set(tokens_all)

vocabularies = [set(tokens_split) for tokens_split in tqdm(tokens)]

  0%|          | 0/28 [00:00<?, ?it/s]

In [75]:
vocabulary_size_all = len(vocabulary_all)

vocabulary_sizes = [len(vocabulary) for vocabulary in vocabularies]

In [76]:
print(f"Vocabulary size: {vocabulary_size_all}")
for i, vocab_size in enumerate(vocabulary_sizes):
    print(f"Vocabulary size of split {i+1}: {vocab_size}")

Vocabulary size: 22062
Vocabulary size of split 1: 21756
Vocabulary size of split 2: 21780
Vocabulary size of split 3: 21357
Vocabulary size of split 4: 21602
Vocabulary size of split 5: 21334
Vocabulary size of split 6: 22062
Vocabulary size of split 7: 21238
Vocabulary size of split 8: 21387
Vocabulary size of split 9: 21798
Vocabulary size of split 10: 21820
Vocabulary size of split 11: 21931
Vocabulary size of split 12: 19104
Vocabulary size of split 13: 21503
Vocabulary size of split 14: 21520
Vocabulary size of split 15: 21856
Vocabulary size of split 16: 21853
Vocabulary size of split 17: 21810
Vocabulary size of split 18: 21935
Vocabulary size of split 19: 21836
Vocabulary size of split 20: 21822
Vocabulary size of split 21: 21906
Vocabulary size of split 22: 21906
Vocabulary size of split 23: 21709
Vocabulary size of split 24: 21699
Vocabulary size of split 25: 21439
Vocabulary size of split 26: 21437
Vocabulary size of split 27: 21870
Vocabulary size of split 28: 21902


# Word Embeddings

1. Static: Word2Vec (Gensim)

2. Temporal: TWEC/CADE

## 1. Word2Vec (static)

In [77]:
from gensim.models import Word2Vec

In [78]:
from nltk import word_tokenize

tokenized_sentences_all = [word_tokenize(item) for item in tqdm(corpus_all)]

  0%|          | 0/196720 [00:00<?, ?it/s]

In [79]:
w2v_model = Word2Vec(tokenized_sentences_all, seed=1040)

In [80]:
test_word = "apple"
w2v_model.wv.most_similar(test_word)

[('apples', 0.81557297706604),
 ('samsung', 0.7545889616012573),
 ('dell', 0.7151619791984558),
 ('android', 0.7053136825561523),
 ('blackberry', 0.6898148059844971),
 ('intel', 0.68919837474823),
 ('macintosh', 0.684700608253479),
 ('iphone', 0.6785283088684082),
 ('compaq', 0.6783748865127563),
 ('microsoft', 0.6761517524719238)]

In [81]:
output_model_dir_static = MODEL_DIR / "nyt-data" / "static"

output_model_dir_static.mkdir(parents=True, exist_ok=True)

model_static_file_path = output_model_dir_static / "w2v_model.model"

# w2v_model.save(str(model_static_file_path))

## 2. Temporal: TWEC/CADE

***Temporal Word Embeddings with a Compass***

* [Source-Code](https://github.com/valedica/twec)

* [Paper](https://arxiv.org/abs/1906.02376)

* [Blogpost](https://fede-bianchi.medium.com/aligning-temporal-diachronic-word-embeddings-with-a-compass-732ab7427955)

Save the concatenated text to txt-files to make them usable for Cade

In [82]:
import smart_open
print(smart_open.__version__)

1.10.0


In [83]:
from cade.cade import CADE

In [84]:
# Declare Paths
split_dir = DATA_DIR / "processed" / "nyt-data"
cade_dir = MODEL_DIR / "nyt-data" / "cade" / "data"

cade_dir.mkdir(parents=True, exist_ok=True)

# Create filepaths/names
file_paths = [(cade_dir / csv_file.split("/")[-1].split(".")[0]).with_suffix(".txt") for csv_file in file_paths]

file_paths_and_corpora = {
    cade_dir / 'compass.txt': corpus_all
}

for key, value in zip(file_paths, corpora):
    file_paths_and_corpora[key] = value

In [85]:
# Write them to memory
for file_path, corpus in file_paths_and_corpora.items():
    with open(file_path, 'w') as file:
        for item in corpus:
            file.write("%s\n" % item)

Create & train the compass

This creates atemporal context and target word embeddings

In [86]:
import gensim
print(gensim.__version__)

3.8.0


In [87]:
warnings.filterwarnings("ignore")

path_to_cade_models = Path("../../model/nyt-data/cade/model")
path_to_cade_models.mkdir(parents=True, exist_ok=True)

aligner = CADE(size=50, min_count = 5, ns = 5, opath=path_to_cade_models)
aligner.train_compass(str((cade_dir / "compass").with_suffix(".txt")), overwrite=True);

Training the compass from scratch.


In [88]:
warnings.filterwarnings("ignore")

# train slices, they will be already aligned and subsequently saved
slices = [aligner.train_slice(file_path, save=True) for file_path in tqdm(file_paths)] # list of gensim word2vec objects

  0%|          | 0/28 [00:00<?, ?it/s]

Training embeddings: slice ../../model/nyt-data/cade/data/2000_data.txt.
Initializing embeddings from compass.
Training embeddings: slice ../../model/nyt-data/cade/data/2001_data.txt.
Initializing embeddings from compass.
Training embeddings: slice ../../model/nyt-data/cade/data/1997_data.txt.
Initializing embeddings from compass.
Training embeddings: slice ../../model/nyt-data/cade/data/1996_data.txt.
Initializing embeddings from compass.
Training embeddings: slice ../../model/nyt-data/cade/data/2016_data.txt.
Initializing embeddings from compass.
Training embeddings: slice ../../model/nyt-data/cade/data/corpus_all.txt.
Initializing embeddings from compass.
Training embeddings: slice ../../model/nyt-data/cade/data/1990_data.txt.
Initializing embeddings from compass.
Training embeddings: slice ../../model/nyt-data/cade/data/1991_data.txt.
Initializing embeddings from compass.
Training embeddings: slice ../../model/nyt-data/cade/data/2010_data.txt.
Initializing embeddings from compass.


In [89]:
print("done")

done
