# Data-Analysis

In [1]:
import pandas as pd
import glob
import warnings
from pathlib import Path
from tqdm.notebook import tqdm

import nltk

# Setup

Read data from processed csv files

In [2]:
DIR = Path("data")
split_dir = DIR / "split"
sub_dir = "monthly"

In [3]:
if sub_dir:
    csv_files = glob.glob(str(split_dir / sub_dir / "*.csv"))
else:
    csv_files = glob.glob(str(split_dir / "*/*.csv"))

In [4]:
dataframes = []
for file in csv_files:
    df = pd.read_csv(file)
    dataframes.append(df)

In [5]:
# used for training the Cade-Compass
df_all = pd.concat(dataframes, ignore_index=True)

In [6]:
print(f"Imported {len(dataframes)} files")

for i, df in enumerate(dataframes):
    print(f"Data from Period {i+1}: {df.date.min()} - {df.date.max()}")

Imported 11 files
Data from Period 1: 2023-03-01 00:00:00 - 2023-03-31 23:56:29
Data from Period 2: 2023-01-01 00:00:00 - 2023-01-31 23:58:10
Data from Period 3: 2023-02-01 00:00:00 - 2023-02-28 23:59:44
Data from Period 4: 2022-06-01 23:00:00 - 2022-06-30 23:52:16
Data from Period 5: 2022-09-01 00:05:10 - 2022-09-30 22:58:00
Data from Period 6: 2023-04-01 00:04:00 - 2023-04-28 07:44:34
Data from Period 7: 2022-10-01 23:01:00 - 2022-10-31 23:59:59
Data from Period 8: 2022-08-01 00:00:00 - 2022-08-31 23:54:58
Data from Period 9: 2022-12-01 00:00:00 - 2022-12-31 23:42:49
Data from Period 10: 2022-11-01 00:00:00 - 2022-11-30 23:58:00
Data from Period 11: 2022-07-01 00:00:00 - 2022-07-31 23:59:34


In [7]:
# convert date back to datetime object and sort them by date
for df in dataframes:
    df['date'] =  pd.to_datetime(df['date'])
    df.sort_values('date', inplace=True)

df_all['date'] =  pd.to_datetime(df_all['date'])
df_all.sort_values('date', inplace=True)

Check if any missing values are in the (processed) text column

In [8]:
df_all["text"].isna().sum()

0

If there are any but still amount to only an insignificant portion of the data, we delete them as to not cause problems with nltk's tokenizers.

In [9]:
df_all.dropna(subset=['text'], inplace=True)

# Sample-Engine

***CBS-News - The Hottest Topic of each Month:***

***2022***
* Roe / Wade / Abortion (June)
* Shinzo / Abe / Japan(July)
* Trump / Mar-a-Lago (August)
* Queen / Elizabeth / England (September)
* Elon / Musk / Twitter (October)
* Republicans / Red / Wave (November)
* Russia / Brittney / Griner / Prisoner (December)

***2023 (TODO)***
* xxx (January)
* xxx (February)
* xxx (March)
* xxx (April)

Timeframe: 2022-06-01 to 2023-04-28
source: [cbsnews](https://www.cbsnews.com/news/the-year-in-review-top-news-stories-of-2022-month-by-month/)

In [10]:
def sample_text(df, month = "06"):
    sample_stream_1 = df.sample(1)
    print("Stream 1")
    print(f"Date: {sample_stream_1['date'].iloc[0].date()}")
    print(f"Sentiment: {sample_stream_1['sentiment'].iloc[0]}")
    print("----------------")
    print(sample_stream_1["text"].iloc[0])

Notable Events:

* Teacherstrike

In [11]:
sample_text(df_all)

Stream 1
Date: 2023-01-23
Sentiment: Negatives
----------------
additionally gen z grew up in the digital age the first generation to only know a world with the internet however despite the infinite connectivity few members of gen z have the communication and interpersonal skills deemed necessary for successful careers much of this is due to the covid19 pandemic which forced gen z students to move their studies online hindering their abilities to foster formal and informal inperson interactions according to the workforce institute 34 of gen z americans blame educational barriers for their lack of skillsbased knowledge in the professional world 2 the pandemic also marked significant socioeconomic disparities from race to gender


### Create Corpora

In [12]:
corpus_all = df_all["text"].values.tolist()

corpora = [df["text"].values.tolist() for df in dataframes]

### Create Tokens

In [13]:
tokenizer = nltk.tokenize.TreebankWordTokenizer()

tokens_all = tokenizer.tokenize(" ".join(str(text) for text in tqdm(corpus_all)))

tokens = [tokenizer.tokenize(" ".join(str(text) for text in corpus)) for corpus in tqdm(corpora)]

  0%|          | 0/149998 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

### Create Vocabularies

In [14]:
# unique vocabulary of each class
vocabulary_all = set(tokens_all)

vocabularies = [set(tokens_split) for tokens_split in tqdm(tokens)]

  0%|          | 0/11 [00:00<?, ?it/s]

In [15]:
vocabulary_size_all = len(vocabulary_all)

vocabulary_sizes = [len(vocabulary) for vocabulary in vocabularies]

In [16]:
tokens[0].count("pelosis")

1

In [17]:
print(f"Vocabulary size: {vocabulary_size_all}")
for i, vocab_size in enumerate(vocabulary_sizes):
    print(f"Vocabulary size of split {i+1}: {vocab_size}")

Vocabulary size: 220582
Vocabulary size of split 1: 47890
Vocabulary size of split 2: 49578
Vocabulary size of split 3: 43668
Vocabulary size of split 4: 41172
Vocabulary size of split 5: 43809
Vocabulary size of split 6: 40843
Vocabulary size of split 7: 55486
Vocabulary size of split 8: 47271
Vocabulary size of split 9: 49526
Vocabulary size of split 10: 53728
Vocabulary size of split 11: 47945


# Word Embeddings

1. Static: Word2Vec (Gensim)

2. Temporal: TWEC/CADE

## 1. Word2Vec (static)

In [18]:
from gensim.models import Word2Vec

In [19]:
from nltk import word_tokenize

tokenized_sentences_all = [word_tokenize(item) for item in tqdm(corpus_all)]

  0%|          | 0/149998 [00:00<?, ?it/s]

In [20]:
w2v_model = Word2Vec(tokenized_sentences_all, min_count = 1, seed=1040)

In [21]:
test_word = "trump"
w2v_model.wv.most_similar(test_word)

[('donald', 0.7860180735588074),
 ('biden', 0.7639197111129761),
 ('trumps', 0.7628220319747925),
 ('obama', 0.6877856254577637),
 ('joe', 0.6639670729637146),
 ('thenpresident', 0.6630730628967285),
 ('putin', 0.6451755166053772),
 ('desantis', 0.6334975957870483),
 ('crist', 0.6087217330932617),
 ('bidens', 0.6021501421928406)]

In [23]:
w2v_model.save("model/static/word2vec.model")

## 2. Temporal: TWEC/CADE

***Temporal Word Embeddings with a Compass***

* [Source-Code](https://github.com/valedica/twec)

* [Paper](https://arxiv.org/abs/1906.02376)

* [Blogpost](https://fede-bianchi.medium.com/aligning-temporal-diachronic-word-embeddings-with-a-compass-732ab7427955)

Save the concatenated text to txt-files to make them usable for Cade

In [20]:
from cade.cade import CADE

In [21]:
cade_dir = DIR / "cade"
cade_split_dir = cade_dir / sub_dir
cade_split_dir.mkdir(parents=True, exist_ok=True)

file_paths = [(cade_split_dir / csv_file.split("/")[-1].split(".")[0]).with_suffix(".txt") for csv_file in csv_files]

file_paths_and_corpora = {
    cade_dir / 'compass.txt': corpus_all
}

for key, value in zip(file_paths, corpora):
    file_paths_and_corpora[key] = value

for file_path, corpus in file_paths_and_corpora.items():
    with open(file_path, 'w') as file:
        for item in corpus:
            file.write("%s\n" % item)

In [22]:
# only needed once for the installation & creation of a venv
'''%%capture
!pip install -U cade
!pip install git+https://github.com/vinid/gensim.git''';

Create & train the compass

This creates atemporal context and target word embeddings

In [23]:
warnings.filterwarnings("ignore")

aligner = CADE(size=30, min_count = 1)
aligner.train_compass(str((cade_dir / "compass").with_suffix(".txt")), overwrite=True);

Training the compass from scratch.


In [24]:
file_paths

[PosixPath('data/cade/monthly/01_Mar_to_01_Apr.txt'),
 PosixPath('data/cade/monthly/01_Jan_to_01_Feb.txt'),
 PosixPath('data/cade/monthly/01_Feb_to_01_Mar.txt'),
 PosixPath('data/cade/monthly/01_Jun_to_01_Jul.txt'),
 PosixPath('data/cade/monthly/01_Sep_to_01_Oct.txt'),
 PosixPath('data/cade/monthly/01_Apr_to_28_Apr.txt'),
 PosixPath('data/cade/monthly/01_Oct_to_01_Nov.txt'),
 PosixPath('data/cade/monthly/01_Aug_to_01_Sep.txt'),
 PosixPath('data/cade/monthly/01_Dec_to_01_Jan.txt'),
 PosixPath('data/cade/monthly/01_Nov_to_01_Dec.txt'),
 PosixPath('data/cade/monthly/01_Jul_to_01_Aug.txt')]

In [25]:
warnings.filterwarnings("ignore")

# train slices, they will be already aligned
slices = [aligner.train_slice(file_path, save=True) for file_path in file_paths] # list of gensim word2vec objects

Training embeddings: slice data/cade/monthly/01_Mar_to_01_Apr.txt.
Initializing embeddings from compass.
Training embeddings: slice data/cade/monthly/01_Jan_to_01_Feb.txt.
Initializing embeddings from compass.
Training embeddings: slice data/cade/monthly/01_Feb_to_01_Mar.txt.
Initializing embeddings from compass.
Training embeddings: slice data/cade/monthly/01_Jun_to_01_Jul.txt.
Initializing embeddings from compass.
Training embeddings: slice data/cade/monthly/01_Sep_to_01_Oct.txt.
Initializing embeddings from compass.
Training embeddings: slice data/cade/monthly/01_Apr_to_28_Apr.txt.
Initializing embeddings from compass.
Training embeddings: slice data/cade/monthly/01_Oct_to_01_Nov.txt.
Initializing embeddings from compass.
Training embeddings: slice data/cade/monthly/01_Aug_to_01_Sep.txt.
Initializing embeddings from compass.
Training embeddings: slice data/cade/monthly/01_Dec_to_01_Jan.txt.
Initializing embeddings from compass.
Training embeddings: slice data/cade/monthly/01_Nov_to_

Load the models

In [28]:
print("done")

done
