# Data-Analysis

In [1]:
import pandas as pd
import glob
import warnings
from pathlib import Path
from tqdm import tqdm

import nltk

# Setup

Read data from processed csv files

In [2]:
DIR = Path("data")
split_dir = DIR / "split"
csv_files = glob.glob(str(split_dir / "*/*.csv"))

dataframes = []
for file in csv_files:
    df = pd.read_csv(file)
    dataframes.append(df)

In [3]:
# used for training the Cade-Compass
df_all = pd.concat(dataframes, ignore_index=True)

In [4]:
print(f"Imported {len(dataframes)} files")

counter = 1
for df in dataframes:
    print(f"Data from Period {counter}: {df.date.min()} - {df.date.max()}")
    counter += 1

Imported 8 files
Data from Period 1: 2022-06-01 23:00:00 - 2022-09-30 22:58:00
Data from Period 2: 2022-10-01 23:01:00 - 2023-04-28 07:44:34
Data from Period 3: 2022-10-07 00:00:00 - 2023-04-28 07:44:34
Data from Period 4: 2022-06-01 23:00:00 - 2022-10-06 23:56:45
Data from Period 5: 2022-06-01 23:00:00 - 2022-10-25 23:52:11
Data from Period 6: 2022-10-26 00:00:00 - 2023-04-28 07:44:34
Data from Period 7: 2022-11-18 00:00:00 - 2023-04-28 07:44:34
Data from Period 8: 2022-06-01 23:00:00 - 2022-11-17 23:59:17


In [5]:
# convert date back to datetime object and sort them by date
for df in dataframes:
    df['date'] =  pd.to_datetime(df['date'])
    df.sort_values('date', inplace=True)

df_all['date'] =  pd.to_datetime(df_all['date'])
df_all.sort_values('date', inplace=True)

Check if any missing values are in the (processed) text column

In [6]:
df_all["text"].isna().sum()

0

If there are any but still amount to only an insignificant portion of the data, we delete them as to not cause problems with nltk's tokenizers.

In [7]:
df_all.dropna(subset=['text'], inplace=True)

# Sample-Engine

***CBS-News - The Hottest Topic of each Month:***

***2022***
* Roe / Wade / Abortion (June)
* Shinzo / Abe / Japan(July)
* Trump / Mar-a-Lago (August)
* Queen / Elizabeth / England (September)
* Elon / Musk / Twitter (October)
* Republicans / Red / Wave (November)
* Russia / Brittney / Griner / Prisoner (December)

***2023 (TODO)***
* xxx (January)
* xxx (February)
* xxx (March)
* xxx (April)

Timeframe: 2022-06-01 to 2023-04-28
source: [cbsnews](https://www.cbsnews.com/news/the-year-in-review-top-news-stories-of-2022-month-by-month/)

### Create Corpora

In [8]:
corpus_all = df_all["text"].values.tolist()

corpora = [df["text"].values.tolist() for df in dataframes]

### Create Tokens

In [9]:
tokenizer = nltk.tokenize.TreebankWordTokenizer()

tokens_all = tokenizer.tokenize(" ".join(str(text) for text in tqdm(corpus_all)))

tokens = [tokenizer.tokenize(" ".join(str(text) for text in corpus)) for corpus in tqdm(corpora)]

100%|██████████| 599564/599564 [00:00<00:00, 4230558.02it/s]
100%|██████████| 8/8 [00:32<00:00,  4.10s/it]


### Create Vocabularies

In [10]:
# unique vocabulary of each class
vocabulary_all = set(tokens_all)

vocabularies = [set(tokens_split) for tokens_split in tqdm(tokens)]

100%|██████████| 8/8 [00:01<00:00,  7.37it/s]


In [11]:
vocabulary_size_all = len(vocabulary_all)

vocabulary_sizes = [len(vocabulary) for vocabulary in vocabularies]

In [12]:
print(f"Vocabulary size: {vocabulary_size_all}")
counter = 1
for vocab_size in vocabulary_sizes:
    print(f"Vocabulary size of split {counter}: {vocab_size}")
    counter += 1

Vocabulary size: 220439
Vocabulary size of split 1: 106720
Vocabulary size of split 2: 167134
Vocabulary size of split 3: 164093
Vocabulary size of split 4: 110498
Vocabulary size of split 5: 125418
Vocabulary size of split 6: 151338
Vocabulary size of split 7: 134497
Vocabulary size of split 8: 142440


# Word Embeddings

1. Static: Word2Vec (Gensim)

2. Temporal: TWEC/CADE

## 1. Word2Vec (static)

In [13]:
from gensim.models import Word2Vec

In [14]:
from nltk import word_tokenize

tokenized_sentences_all = [word_tokenize(item) for item in tqdm(corpus_all)]

100%|██████████| 599564/599564 [01:00<00:00, 9874.25it/s] 


In [15]:
w2v_model = Word2Vec(tokenized_sentences_all, min_count = 30, seed=1040)

In [16]:
test_word = "trump"
w2v_model.wv.most_similar(test_word)

[('trumps', 0.8276152014732361),
 ('donald', 0.6924834251403809),
 ('biden', 0.6235580444335938),
 ('bidens', 0.6000710725784302),
 ('obama', 0.5897787809371948),
 ('presidency', 0.5470191240310669),
 ('desantis', 0.5289232730865479),
 ('impeachment', 0.5264337658882141),
 ('marfia', 0.5250229239463806),
 ('putin', 0.5220085382461548)]

In [17]:
w2v_model.save("model/word2vec.model")



## 2. Temporal: TWEC/CADE

***Temporal Word Embeddings with a Compass***

* [Source-Code](https://github.com/valedica/twec)

* [Paper](https://arxiv.org/abs/1906.02376)

* [Blogpost](https://fede-bianchi.medium.com/aligning-temporal-diachronic-word-embeddings-with-a-compass-732ab7427955)

Save the concatenated text to txt-files to make them usable for Cade

In [18]:
from cade.cade import CADE

In [19]:
cade_dir = DIR / "cade"
cade_split_dir = cade_dir
cade_split_dir.mkdir(parents=True, exist_ok=True)

file_paths = [(cade_split_dir / csv_file.split("/")[-1].split(".")[0]).with_suffix(".txt") for csv_file in csv_files]

file_paths_and_corpora = {
    cade_dir / 'compass.txt': corpus_all
}

for key, value in zip(file_paths, corpora):
    file_paths_and_corpora[key] = value

for file_path, corpus in file_paths_and_corpora.items():
    with open(file_path, 'w') as file:
        for item in corpus:
            file.write("%s\n" % item)

In [20]:
# only needed once for the installation & creation of a venv
''''%%capture
!pip install -U cade
!pip install git+https://github.com/vinid/gensim.git''';

Create & train the compass

This creates atemporal context and target word embeddings

In [21]:
warnings.filterwarnings("ignore")

aligner = CADE(size=30, min_count = 30)
aligner.train_compass(str((cade_dir / "compass").with_suffix(".txt")), overwrite=True);

Training the compass from scratch.


In [22]:
file_paths

[PosixPath('data/cade/01_Jun_to_01_Oct.txt'),
 PosixPath('data/cade/01_Oct_to_28_Apr.txt'),
 PosixPath('data/cade/07_Oct_to_28_Apr.txt'),
 PosixPath('data/cade/01_Jun_to_07_Oct.txt'),
 PosixPath('data/cade/01_Jun_to_26_Oct.txt'),
 PosixPath('data/cade/26_Oct_to_28_Apr.txt'),
 PosixPath('data/cade/18_Nov_to_28_Apr.txt'),
 PosixPath('data/cade/01_Jun_to_18_Nov.txt')]

In [23]:
warnings.filterwarnings("ignore")

# train slices, they will be already aligned
slices = [aligner.train_slice(file_path, save=True) for file_path in file_paths] # list of gensim word2vec objects

Training embeddings: slice data/cade/01_Jun_to_01_Oct.txt.
Initializing embeddings from compass.
Training embeddings: slice data/cade/01_Oct_to_28_Apr.txt.
Initializing embeddings from compass.
Training embeddings: slice data/cade/07_Oct_to_28_Apr.txt.
Initializing embeddings from compass.
Training embeddings: slice data/cade/01_Jun_to_07_Oct.txt.
Initializing embeddings from compass.
Training embeddings: slice data/cade/01_Jun_to_26_Oct.txt.
Initializing embeddings from compass.
Training embeddings: slice data/cade/26_Oct_to_28_Apr.txt.
Initializing embeddings from compass.
Training embeddings: slice data/cade/18_Nov_to_28_Apr.txt.
Initializing embeddings from compass.
Training embeddings: slice data/cade/01_Jun_to_18_Nov.txt.
Initializing embeddings from compass.


Load the models

In [24]:
'''from gensim.models.word2vec import Word2Vec

model_dir = Path("model/")
model_files = glob.glob(str(model_dir / "*.model"))

models = [Word2Vec.load(model_file) for model_file in model_files]''';

In [25]:
'''
model_one = models[0]
model_two = models[1]
''';

In [26]:
print("done")

done
