# Data-Analysis

In [27]:
import pandas as pd
import glob
import warnings
from pathlib import Path
from tqdm import tqdm

import nltk

# Setup

Read data from processed csv files

In [28]:
current_split_dir = "colorado_springs"

DIR = Path("data/")
input_dir = DIR / "split" / current_split_dir
csv_files = glob.glob(str(input_dir / "*.csv"))

dataframes = []
for file in csv_files:
    df = pd.read_csv(file)
    dataframes.append(df)

In [29]:
df_all = pd.concat(dataframes, ignore_index=True)

In [30]:
print(f"Imported {len(dataframes)} files")

counter = 1
for df in dataframes:
    print(f"Data from Period {counter}: {df.date.min()} - {df.date.max()}")
    counter += 1

Imported 2 files
Data from Period 1: 2022-11-18 00:00:00 - 2023-04-28 07:44:34
Data from Period 2: 2022-06-01 23:00:00 - 2022-11-17 23:59:17


In [31]:
# convert date back to datetime object and sort them by date
for df in dataframes:
    df['date'] =  pd.to_datetime(df['date'])
    df.sort_values('date', inplace=True)

df_all['date'] =  pd.to_datetime(df_all['date'])
df_all.sort_values('date', inplace=True)

Check if any missing values are in the (processed) text column

In [32]:
df_all["text"].isna().sum()

109

If there are any but still amount to only an insignificant portion of the data, we delete them as to not cause problems with nltk's tokenizers.

In [33]:
df_all.dropna(subset=['text'], inplace=True)

# Sample-Engine

***CBS-News - The Hottest Topic of each Month:***

***2022***
* Roe / Wade / Abortion (June)
* Shinzo / Abe / Japan(July)
* Trump / Mar-a-Lago (August)
* Queen / Elizabeth / England (September)
* Elon / Musk / Twitter (October)
* Republicans / Red / Wave (November)
* Russia / Brittney / Griner / Prisoner (December)

***2023 (TODO)***
* xxx (January)
* xxx (February)
* xxx (March)
* xxx (April)

Timeframe: 2022-06-01 to 2023-04-28
source: [cbsnews](https://www.cbsnews.com/news/the-year-in-review-top-news-stories-of-2022-month-by-month/)

### Create Corpora

In [34]:
corpus_all = df_all["text"].values.tolist()

corpora = [df["text"].values.tolist() for df in dataframes]

### Create Tokens

In [35]:
tokenizer = nltk.tokenize.TreebankWordTokenizer()

tokens_all = tokenizer.tokenize(" ".join(str(text) for text in tqdm(corpus_all)))

tokens = [tokenizer.tokenize(" ".join(str(text) for text in corpus)) for corpus in tqdm(corpora)]

100%|██████████| 149891/149891 [00:00<00:00, 4847736.63it/s]
100%|██████████| 2/2 [00:08<00:00,  4.04s/it]


### Create Vocabularies

In [36]:
# unique vocabulary of each class
vocabulary_all = set(tokens_all)

vocabularies = [set(tokens_split) for tokens_split in tqdm(tokens)]

100%|██████████| 2/2 [00:00<00:00,  7.75it/s]


In [37]:
vocabulary_size_all = len(vocabulary_all)

vocabulary_sizes = [len(vocabulary) for vocabulary in vocabularies]

In [38]:
print(f"Vocabulary size: {vocabulary_size_all}")
counter = 1
for vocab_size in vocabulary_sizes:
    print(f"Vocabulary size of split {counter}: {vocab_size}")
    counter += 1

Vocabulary size: 187589
Vocabulary size of split 1: 110841
Vocabulary size of split 2: 118035


# Word Embeddings

1. Static: Word2Vec (Gensim)

2. Temporal: TWEC/CADE

## 1. Word2Vec (static)

In [39]:
from gensim.models import Word2Vec

In [40]:
from nltk import word_tokenize

tokenized_sentences_all = [word_tokenize(item) for item in tqdm(corpus_all)]

100%|██████████| 149891/149891 [00:13<00:00, 10883.08it/s]


In [41]:
w2v_model = Word2Vec(tokenized_sentences_all, min_count = 1)

In [42]:
test_word = "trump"
w2v_model.wv.most_similar(test_word)

[('biden', 0.742926299571991),
 ('donald', 0.7195136547088623),
 ('obama', 0.6923248171806335),
 ('egogener', 0.6524394750595093),
 ('impeach', 0.6318155527114868),
 ('presidenti', 0.6065165996551514),
 ('thenpresid', 0.5993049144744873),
 ('pompeo', 0.591389000415802),
 ('gop', 0.5850652456283569),
 ('crist', 0.5757563710212708)]

In [43]:
w2v_model.save("model/word2vec.model")

## 2. Temporal: TWEC/CADE

***Temporal Word Embeddings with a Compass***

* [Source-Code](https://github.com/valedica/twec)

* [Paper](https://arxiv.org/abs/1906.02376)

* [Blogpost](https://fede-bianchi.medium.com/aligning-temporal-diachronic-word-embeddings-with-a-compass-732ab7427955)

Save the concatenated text to txt-files to make them usable for Cade

In [44]:
from cade.cade import CADE

In [45]:
cade_dir = DIR / "cade"
cade_split_dir = cade_dir / current_split_dir
cade_split_dir.mkdir(parents=True, exist_ok=True)

file_paths = [(cade_split_dir / csv_file.split("/")[-1].split(".")[0]).with_suffix(".txt") for csv_file in csv_files]

file_paths_and_corpora = {
    cade_dir / 'compass.txt': corpus_all
}

for key, value in zip(file_paths, corpora):
    file_paths_and_corpora[key] = value

for file_path, corpus in file_paths_and_corpora.items():
    with open(file_path, 'w') as file:
        for item in corpus:
            file.write("%s\n" % item)

In [46]:
# only needed once for the installation & creation of a venv
''''%%capture
!pip install -U cade
!pip install git+https://github.com/vinid/gensim.git''';

Create & train the compass

This creates atemporal context and target word embeddings

In [47]:
warnings.filterwarnings("ignore")

aligner = CADE(size=30)
aligner.train_compass(str((cade_dir / "compass").with_suffix(".txt")), overwrite=True);

Training the compass from scratch.


In [48]:
file_paths

[PosixPath('data/cade/colorado_springs/18_Nov_to_28_Apr.txt'),
 PosixPath('data/cade/colorado_springs/01_Jun_to_18_Nov.txt')]

In [49]:
warnings.filterwarnings("ignore")

# train slices, they will be already aligned
slices = [aligner.train_slice(file_path, save=True) for file_path in file_paths] # list of gensim word2vec objects

Training embeddings: slice data/cade/colorado_springs/18_Nov_to_28_Apr.txt.
Initializing embeddings from compass.
Training embeddings: slice data/cade/colorado_springs/01_Jun_to_18_Nov.txt.
Initializing embeddings from compass.


Load the models

In [50]:
'''from gensim.models.word2vec import Word2Vec

model_dir = Path("model/")
model_files = glob.glob(str(model_dir / "*.model"))

models = [Word2Vec.load(model_file) for model_file in model_files]''';

In [51]:
'''
model_one = models[0]
model_two = models[1]
''';

In [52]:
counter = 1
for df in dataframes:
    print(f"Period {counter}: {df.date.min()} - {df.date.max()}")
    counter += 1

Data from Period 1: 2022-11-18 00:00:00 - 2023-04-28 07:44:34
Data from Period 2: 2022-06-01 23:00:00 - 2022-11-17 23:59:17
