# Data-Analysis

In [3]:
import pandas as pd
import glob
import warnings
from pathlib import Path
from tqdm.notebook import tqdm

import nltk

# Setup

Read data from processed csv files

In [4]:
# Directory layout: the processed CSV splits live below data/split.
DIR = Path("data")
split_dir = DIR.joinpath("split")
# Split folder to load; a falsy value makes the loader pick up every sub-folder.
sub_dir = "quarter"

In [5]:
# Choose the glob pattern first, then expand it once: a single split folder
# when sub_dir is set, otherwise the CSV files of every folder below split_dir.
csv_pattern = split_dir / sub_dir / "*.csv" if sub_dir else split_dir / "*/*.csv"
csv_files = glob.glob(str(csv_pattern))

In [6]:
# Load every split file and keep `csv_files` and `dataframes` index-aligned.
# glob.glob() returns its matches in arbitrary, filesystem-dependent order, but
# the frames are later labelled "Period 1..n" and paired with `csv_files` by
# position, so both lists are put into chronological order here, using the
# earliest date found in each file.
loaded_splits = [(csv_path, pd.read_csv(csv_path)) for csv_path in csv_files]
loaded_splits.sort(key=lambda split: pd.to_datetime(split[1]["date"]).min())

csv_files = [csv_path for csv_path, _ in loaded_splits]
dataframes = [split_df for _, split_df in loaded_splits]

In [7]:
# All periods combined into a single frame (fresh 0..n-1 index). Its text column
# later becomes the corpus written to compass.txt, on which the CADE compass is
# trained.
df_all = pd.concat(dataframes, ignore_index=True)

In [8]:
# Report how many split files were loaded and the date span each one covers.
print(f"Imported {len(dataframes)} files")

for period_no, period_df in enumerate(dataframes, start=1):
    first_date, last_date = period_df.date.min(), period_df.date.max()
    print(f"Data from Period {period_no}: {first_date} - {last_date}")

Imported 4 files
Data from Period 1: 2022-06-01 23:00:00 - 2022-08-31 23:54:58
Data from Period 2: 2022-09-01 00:05:10 - 2022-11-30 23:58:00
Data from Period 3: 2022-12-01 00:00:00 - 2023-02-28 23:59:44
Data from Period 4: 2023-03-01 00:00:00 - 2023-04-28 07:44:34


In [9]:
# The CSV round trip leaves `date` as text: parse it back into datetimes and
# order the rows chronologically. The per-period frames and the combined frame
# get exactly the same in-place treatment.
for frame in [*dataframes, df_all]:
    frame["date"] = pd.to_datetime(frame["date"])
    frame.sort_values(by="date", inplace=True)

Check if any missing values are in the (processed) text column

In [10]:
# Number of rows in the combined frame whose processed text is missing (NaN).
df_all["text"].isna().sum()

0

If there are any missing values and they amount to only an insignificant portion of the data, we drop them so that they do not cause problems with nltk's tokenizers.

In [11]:
# Drop rows without text from the combined frame *and* from every per-period
# frame. The per-period corpora are built from `dataframes`, so cleaning only
# `df_all` would let missing values reach the tokenizer and the CADE text files
# as the literal string "nan".
df_all.dropna(subset=["text"], inplace=True)
for period_df in dataframes:
    period_df.dropna(subset=["text"], inplace=True)

# Sample-Engine

***CBS-News - The Hottest Topic of each Month:***

***2022***
* Roe / Wade / Abortion (June)
* Shinzo / Abe / Japan (July)
* Trump / Mar-a-Lago (August)
* Queen / Elizabeth / England (September)
* Elon / Musk / Twitter (October)
* Republicans / Red / Wave (November)
* Russia / Brittney / Griner / Prisoner (December)

***2023 (TODO)***
* xxx (January)
* xxx (February)
* xxx (March)
* xxx (April)

Timeframe: 2022-06-01 to 2023-04-28
source: [cbsnews](https://www.cbsnews.com/news/the-year-in-review-top-news-stories-of-2022-month-by-month/)

In [12]:
def sample_text(df, month = "06"):
    """Print the date, sentiment and text of one randomly drawn row of ``df``.

    Args:
        df: DataFrame providing the ``date``, ``sentiment`` and ``text`` columns.
            ``date`` must hold datetime values because ``.date()`` is called on it.
        month: Currently unused; kept so existing calls stay valid.

    Returns:
        None. The sample is only written to stdout.
    """
    picked = df.sample(n=1)

    def first(column):
        # The sample holds exactly one row, so position 0 is that row's value.
        return picked[column].iloc[0]

    print("Stream 1")
    print(f"Date: {first('date').date()}")
    print(f"Sentiment: {first('sentiment')}")
    print("----------------")
    print(first("text"))

Notable Events:

* Teacher strike
* libsoftiktok
* Paul Vallas
* Transgender

In [13]:
# Show one random row drawn from the complete data set.
sample_text(df_all)

Stream 1
Date: 2022-06-14
Sentiment: Neutrals
----------------
 joanntrejo impacting faculty diversity  sdiracda thrilled to announce 6 ucsandiego iracda fellows obtain tenuretrack faculty positions the new stewards of diversity inclusivity trainers of next gen stem students across the us nigmstraining nindsdiversity nihcoswd nihnhlbi pictwittercomsmwiutnz1v


### Create Corpora

In [14]:
def _text_list(frame):
    """Return the ``text`` column of ``frame`` as a plain Python list."""
    return frame["text"].values.tolist()


# One corpus for the whole data set, plus one per period (same order as `dataframes`).
corpus_all = _text_list(df_all)

corpora = [_text_list(period_df) for period_df in dataframes]

### Create Tokens

In [15]:
# A single Treebank tokenizer shared by all corpora.
tokenizer = nltk.tokenize.TreebankWordTokenizer()


def _tokenize_corpus(texts):
    """Join all texts with single spaces and tokenize the result in one call.

    str() is applied to every entry so that non-string values cannot break the join.
    """
    return tokenizer.tokenize(" ".join(str(text) for text in texts))


# The progress bar for the full corpus advances per text, the second one per period.
tokens_all = _tokenize_corpus(tqdm(corpus_all))

tokens = [_tokenize_corpus(corpus) for corpus in tqdm(corpora)]

  0%|          | 0/149998 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

### Create Vocabularies

In [16]:
# Distinct tokens of the whole data set ...
vocabulary_all = set(tokens_all)

# ... and of each split, in split order.
vocabularies = list(map(set, tqdm(tokens)))

  0%|          | 0/4 [00:00<?, ?it/s]

In [17]:
# Number of distinct tokens overall and per split.
vocabulary_size_all = len(vocabulary_all)

vocabulary_sizes = list(map(len, vocabularies))

In [18]:
# Spot check: occurrences of "shooting" among the tokens of the second split.
tokens[1].count("shooting")

386

In [19]:
# Summarise the vocabulary sizes: the overall figure first, then one line per split.
print(f"Vocabulary size: {vocabulary_size_all}")
for split_no, split_vocab_size in enumerate(vocabulary_sizes, start=1):
    print(f"Vocabulary size of split {split_no}: {split_vocab_size}")

Vocabulary size: 220582
Vocabulary size of split 1: 90027
Vocabulary size of split 2: 100378
Vocabulary size of split 3: 94680
Vocabulary size of split 4: 67276


# Word Embeddings

1. Static: Word2Vec (Gensim)

2. Temporal: TWEC/CADE

## 1. Word2Vec (static)

In [20]:
from gensim.models import Word2Vec

In [21]:
from nltk import word_tokenize

# Tokenize every text on its own so Word2Vec receives one token list per text.
tokenized_sentences_all = list(map(word_tokenize, tqdm(corpus_all)))

  0%|          | 0/149998 [00:00<?, ?it/s]

In [22]:
# Static Word2Vec embedding trained on the tokenized full corpus, with a
# minimum token frequency of 5 (min_count) and a fixed seed.
# NOTE(review): gensim documents that a seed alone does not make training
# reproducible when several worker threads are used — confirm whether
# workers=1 (and a fixed PYTHONHASHSEED) is wanted here.
w2v_model = Word2Vec(tokenized_sentences_all, min_count = 5, seed=1040)

In [23]:
# Sanity check of the static model: nearest neighbours of a probe word.
test_word = "trump"
w2v_model.wv.most_similar(test_word)

[('biden', 0.7552129626274109),
 ('donald', 0.7537271976470947),
 ('trumps', 0.7485966682434082),
 ('obama', 0.7285426259040833),
 ('thenpresident', 0.6505183577537537),
 ('joe', 0.6470562219619751),
 ('presidency', 0.6236420273780823),
 ('desantis', 0.6080209612846375),
 ('barack', 0.5927667617797852),
 ('bidens', 0.5920045971870422)]

In [24]:
# Persist the static model. The target folder is not created anywhere else in
# the notebook, so make sure it exists before saving; otherwise the save fails
# on a fresh checkout.
w2v_model_path = Path("model/static/word2vec.model")
w2v_model_path.parent.mkdir(parents=True, exist_ok=True)
w2v_model.save(str(w2v_model_path))

## 2. Temporal: TWEC/CADE

***Temporal Word Embeddings with a Compass***

* [Source-Code](https://github.com/valedica/twec)

* [Paper](https://arxiv.org/abs/1906.02376)

* [Blogpost](https://fede-bianchi.medium.com/aligning-temporal-diachronic-word-embeddings-with-a-compass-732ab7427955)

Save the texts of the full corpus and of each period to plain-text files so that CADE can read them

In [25]:
import smart_open
# Record the installed smart_open version in the notebook output.
print(smart_open.__version__)

1.10.0


In [26]:
from cade.cade import CADE

In [27]:
# Export every corpus as a plain-text file (one text per line) for CADE:
# the complete corpus becomes the compass, each period becomes one slice.
cade_dir = DIR / "cade"
# A falsy sub_dir means "all splits" when the CSV files are globbed; mirror
# that here instead of failing on `cade_dir / None`.
cade_split_dir = cade_dir / sub_dir if sub_dir else cade_dir
cade_split_dir.mkdir(parents=True, exist_ok=True)

# Path(...).name is independent of the path separator, unlike splitting on "/".
# Each slice file is named after its CSV file, cut at the first dot.
file_paths = [
    cade_split_dir / (Path(csv_file).name.split(".")[0] + ".txt")
    for csv_file in csv_files
]

# Compass first, then the slices; `file_paths` and `corpora` are paired by position.
file_paths_and_corpora = {cade_dir / "compass.txt": corpus_all}
file_paths_and_corpora.update(zip(file_paths, corpora))

for file_path, corpus in file_paths_and_corpora.items():
    # Explicit UTF-8 keeps the output independent of the platform's default encoding.
    with open(file_path, "w", encoding="utf-8") as handle:
        handle.writelines(f"{item}\n" for item in corpus)

In [28]:
# One-time environment setup: installs cade and gensim from the vinid/gensim
# repository. Kept as an inert string literal (output suppressed by the
# trailing semicolon) so the commands do not run with the notebook.
'''%%capture
!pip install -U cade
!pip install git+https://github.com/vinid/gensim.git''';

Create & train the compass

This creates atemporal context and target word embeddings

In [29]:
import gensim
# Record the installed gensim version in the notebook output.
print(gensim.__version__)

3.8.0


In [30]:
# Silence all warnings from here on.
warnings.filterwarnings(action="ignore")

# Train the compass on the text file holding the complete corpus; an existing
# compass is overwritten.
compass_path = cade_dir / "compass.txt"
aligner = CADE(size=30, min_count = 5)
aligner.train_compass(str(compass_path), overwrite=True);

Training the compass from scratch.


In [31]:
# Slice text files, in the order in which they are passed to the aligner below.
file_paths

[PosixPath('data/cade/quarter/01_Jun_to_01_Sep.txt'),
 PosixPath('data/cade/quarter/01_Sep_to_01_Dec.txt'),
 PosixPath('data/cade/quarter/01_Dec_to_01_Mar.txt'),
 PosixPath('data/cade/quarter/01_Mar_to_28_Apr.txt')]

In [32]:
warnings.filterwarnings(action="ignore")

# Train one embedding per slice file, in `file_paths` order. The slices come
# out already aligned; each entry is a gensim word2vec object.
slices = []
for slice_path in file_paths:
    slices.append(aligner.train_slice(slice_path, save=True))

Training embeddings: slice data/cade/quarter/01_Jun_to_01_Sep.txt.
Initializing embeddings from compass.
Training embeddings: slice data/cade/quarter/01_Sep_to_01_Dec.txt.
Initializing embeddings from compass.
Training embeddings: slice data/cade/quarter/01_Dec_to_01_Mar.txt.
Initializing embeddings from compass.
Training embeddings: slice data/cade/quarter/01_Mar_to_28_Apr.txt.
Initializing embeddings from compass.


Load the models

In [33]:
# Completion marker printed once the preceding cells have run.
print("done")

done
