# Data-Analysis

In [None]:
import pandas as pd
import glob
import warnings
from pathlib import Path
from tqdm.notebook import tqdm

import nltk

# Setup

Read data from processed csv files

In [None]:
# Root folder that holds all data used in this notebook.
DIR = Path("data")
# Folder below the data root from which the processed CSV files are read.
split_dir = DIR.joinpath("split")
# Sub-folder of split_dir to load; a falsy value makes the loading cell
# glob the CSV files of every sub-folder instead.
sub_dir = "monthly"

In [None]:
# Collect the CSV files to analyse: only those in split_dir/sub_dir when
# sub_dir is set, otherwise those in every direct sub-folder of split_dir.
csv_pattern = split_dir / sub_dir / "*.csv" if sub_dir else split_dir / "*/*.csv"

# glob.glob() returns its matches in arbitrary order. Sorting makes the order
# of the periods (and thus the "Period 1, 2, ..." numbering used further down)
# the same on every run and platform.
# NOTE(review): this is chronological only if the file names sort that way - confirm.
csv_files = sorted(glob.glob(str(csv_pattern)))

# Fail early with a clear message instead of a cryptic error in a later cell.
if not csv_files:
    raise FileNotFoundError(f"No CSV files found for pattern: {csv_pattern}")

In [None]:
# Load every CSV file into its own DataFrame, in the order of csv_files.
dataframes = [pd.read_csv(csv_path) for csv_path in csv_files]

In [None]:
# All periods stacked into a single frame with a fresh running index;
# used for training the CADE compass.
df_all = pd.concat(objs=dataframes, ignore_index=True)

In [None]:
# Report how many files were loaded and the date range covered by each one.
print(f"Imported {len(dataframes)} files")

# The date column is only converted to datetime in the following cell, so
# min()/max() operate on the values exactly as read from the CSV files.
for i in range(len(dataframes)):
    df = dataframes[i]
    print(f"Data from Period {i+1}: {df.date.min()} - {df.date.max()}")

In [None]:
# Parse the date column into datetime objects and order the rows by date.
# Both steps modify the frames in place, so the per-period frames held in
# `dataframes` and the combined frame can be handled by the same loop.
for frame in [*dataframes, df_all]:
    frame["date"] = pd.to_datetime(frame["date"])
    frame.sort_values(by="date", inplace=True)

Check if any missing values are in the (processed) text column

In [None]:
# Number of rows in the combined frame whose text is missing.
missing_text_count = df_all["text"].isna().sum()
missing_text_count

If there are any missing values, but they amount to only an insignificant portion of the data, we drop them so that they do not cause problems with NLTK's tokenizers.

In [None]:
# Remove rows without text from the combined frame.
df_all.dropna(subset=['text'], inplace=True)

# The per-period frames need the same clean-up: `corpora`, the per-period
# tokens and the per-period text files are all built from them, and a missing
# value would otherwise end up there as the literal string "nan".
for df in dataframes:
    df.dropna(subset=['text'], inplace=True)

# Sample-Engine

***CBS-News - The Hottest Topic of each Month:***

***2022***
* Roe / Wade / Abortion (June)
* Shinzo / Abe / Japan (July)
* Trump / Mar-a-Lago (August)
* Queen / Elizabeth / England (September)
* Elon / Musk / Twitter (October)
* Republicans / Red / Wave (November)
* Russia / Brittney / Griner / Prisoner (December)

***2023 (TODO)***
* xxx (January)
* xxx (February)
* xxx (March)
* xxx (April)

Timeframe: 2022-06-01 to 2023-04-28
source: [cbsnews](https://www.cbsnews.com/news/the-year-in-review-top-news-stories-of-2022-month-by-month/)

In [None]:
def sample_text(df, month = "06"):
    """Print one randomly chosen row of ``df``.

    The output consists of a "Stream 1" header, the date and sentiment of the
    row, a separator line and finally the text of the row.

    Args:
        df: DataFrame with the columns ``date`` (values must offer a
            ``.date()`` method, e.g. pandas timestamps), ``sentiment`` and
            ``text``.
        month: Currently unused; the row is drawn from all of ``df``.

    Returns:
        None. The result is only printed.
    """
    picked = df.sample(n=1)

    def first_value(column):
        # The sample holds exactly one row, so position 0 is that row.
        return picked[column].iloc[0]

    print("Stream 1")
    print(f"Date: {first_value('date').date()}")
    print(f"Sentiment: {first_value('sentiment')}")
    print("----------------")
    print(first_value("text"))

Notable Events:

* Teacher strike

In [None]:
# Print one random article from the combined data. sample_text() draws the row
# without a fixed random state, so the output differs from run to run.
sample_text(df_all)

### Create Corpora

In [None]:
# Every text of the combined frame as a plain Python list.
corpus_all = df_all["text"].values.tolist()

# One such list per period, in the same order as `dataframes`.
corpora = []
for period_df in dataframes:
    corpora.append(period_df["text"].values.tolist())

### Create Tokens

In [None]:
# One tokenizer instance shared by the combined corpus and the period corpora.
tokenizer = nltk.tokenize.TreebankWordTokenizer()

# All documents are joined into one space-separated string and tokenized in a
# single call; str() makes sure that non-string entries can be joined as well.
tokens_all = tokenizer.tokenize(" ".join(map(str, tqdm(corpus_all))))

# Same procedure per period, which yields one flat token list for each period.
tokens = []
for period_corpus in tqdm(corpora):
    tokens.append(tokenizer.tokenize(" ".join(map(str, period_corpus))))

### Create Vocabularies

In [None]:
# Distinct tokens over all periods combined.
vocabulary_all = set(tokens_all)

# Distinct tokens of each individual period, in the same order as `tokens`.
vocabularies = list(map(set, tqdm(tokens)))

In [None]:
# Number of distinct tokens overall ...
vocabulary_size_all = len(vocabulary_all)

# ... and per period, in the same order as `vocabularies`.
vocabulary_sizes = list(map(len, vocabularies))

In [None]:
# Spot check: how often the token "pelosis" occurs in the first period.
tokens[0].count("pelosis")

In [None]:
# Overall vocabulary size first, then one line per period (numbered from 1).
print(f"Vocabulary size: {vocabulary_size_all}")
for i in range(len(vocabulary_sizes)):
    vocab_size = vocabulary_sizes[i]
    print(f"Vocabulary size of split {i+1}: {vocab_size}")

# Word Embeddings

1. Static: Word2Vec (Gensim)

2. Temporal: TWEC/CADE

## 1. Word2Vec (static)

In [None]:
from gensim.models import Word2Vec

In [None]:
from nltk import word_tokenize

# Tokenize every document on its own; Word2Vec is trained on this list of
# token lists (one inner list per document).
tokenized_sentences_all = []
for document in tqdm(corpus_all):
    tokenized_sentences_all.append(word_tokenize(document))

In [None]:
# Train the static embedding on all documents. min_count=1 keeps every token,
# including those that occur only once.
# NOTE(review): a fixed seed alone may not make training fully reproducible
# when several worker threads are used - check the gensim documentation.
w2v_model = Word2Vec(
    sentences=tokenized_sentences_all,
    min_count=1,
    seed=1040,
)

In [None]:
# Quick plausibility check of the embedding: nearest neighbours of one word.
test_word = "trump"
word_vectors = w2v_model.wv
word_vectors.most_similar(test_word)

In [None]:
# Store the static model on disk. The target folder is created first, because
# saving fails if "model/static" does not exist yet (e.g. on a fresh checkout).
w2v_model_path = Path("model/static/word2vec.model")
w2v_model_path.parent.mkdir(parents=True, exist_ok=True)
w2v_model.save(str(w2v_model_path))

## 2. Temporal: TWEC/CADE

***Temporal Word Embeddings with a Compass***

* [Source-Code](https://github.com/valedica/twec)

* [Paper](https://arxiv.org/abs/1906.02376)

* [Blogpost](https://fede-bianchi.medium.com/aligning-temporal-diachronic-word-embeddings-with-a-compass-732ab7427955)

Save the texts to `.txt` files (one document per line) so that CADE can read them: one file with all texts for the compass and one file per period.

In [None]:
from cade.cade import CADE

In [None]:
# Folder layout for CADE: the compass text goes directly into cade_dir, the
# per-period texts go into a sub-folder named like the CSV sub-folder.
cade_dir = DIR / "cade"
cade_split_dir = cade_dir / sub_dir
cade_split_dir.mkdir(parents=True, exist_ok=True)

# One .txt file per CSV file, named after it. Path(...).stem replaces the
# manual split on "/" and ".", which does not work with Windows path
# separators and cuts off file names that contain additional dots.
file_paths = [cade_split_dir / f"{Path(csv_file).stem}.txt" for csv_file in csv_files]

# Map every output file to the documents it should contain: the compass file
# gets the combined corpus, each period file gets the corpus of its period
# (file_paths and corpora are in the same order).
file_paths_and_corpora = {cade_dir / 'compass.txt': corpus_all}
file_paths_and_corpora.update(zip(file_paths, corpora))

# Write one document per line. The encoding is fixed to UTF-8 so that the
# files do not depend on the platform's default encoding.
for file_path, corpus in file_paths_and_corpora.items():
    with open(file_path, 'w', encoding='utf-8') as file:
        file.writelines(f"{item}\n" for item in corpus)

In [None]:
# Installation commands, only needed once when the virtual environment is set up.
# They are wrapped in a string literal, so running this cell executes nothing;
# the trailing semicolon keeps the notebook from echoing the string as output.
'''%%capture
!pip install -U cade
!pip install git+https://github.com/vinid/gensim.git''';

Create & train the compass

This creates atemporal context and target word embeddings

In [None]:
# Silence all warnings from here on (this setting is global for the kernel).
warnings.filterwarnings("ignore")

# Text file with all documents, written by the export cell.
compass_path = cade_dir / "compass.txt"

# Train the compass on the combined text; overwrite=True is passed so that an
# already existing compass is not reused.
aligner = CADE(size=30, min_count=1)
aligner.train_compass(str(compass_path), overwrite=True);

In [None]:
# Show the per-period text files that the slices are trained on in the next cell.
file_paths

In [None]:
warnings.filterwarnings("ignore")

# train slices, they will be already aligned
# The paths are handed over as plain strings, like the compass path in the
# train_compass() call, instead of pathlib.Path objects.
slices = [aligner.train_slice(str(file_path), save=True) for file_path in file_paths] # list of gensim word2vec objects

Load the models

In [None]:
# Completion marker only; this cell does not load any model.
print("done")