## Test Elasticsearch Manager Class + Subreddit Data Extraction and Insertion to ES
1. Test ES manager basic functionality.
2. Test subreddit extraction and insertion to ES reddit crypto index.


In [1]:
# Import libraries
import os
import sys
import numpy as np
import pandas as pd
from tqdm import tqdm
from datetime import datetime

# Change dir
os.getcwd()
os.chdir("..")  # Change to root dir to detect other local libs

In [2]:
# Import local modules
from utils import timer
from es.manager import ESManager
from etl.schema.es_mappings import (
    REDDIT_CRYPTO_INDEX_NAME,
    reddit_crypto_mapping,
    test_reddit_crypto_mapping,
)

### 1. Test Basic ESManager Functionality

In [3]:
# Test Connection
es_conn = ESManager()
es_conn.get_status()

True

In [4]:
# Test Index Creation
TEST_REDDIT_CRYPTO_INDEX_NAME = "test-" + REDDIT_CRYPTO_INDEX_NAME

In [5]:
es_conn.create_index(
    index=TEST_REDDIT_CRYPTO_INDEX_NAME, mapping=test_reddit_crypto_mapping
)

In [6]:
es_conn.index_is_exist(index=TEST_REDDIT_CRYPTO_INDEX_NAME)

True

In [7]:
# Test Data: Dummy Reddit Comment and Submission for Testing


@timer
def gen_dummy_reddit_data(num: int = 1000):
    schema = reddit_crypto_mapping["mappings"]["properties"]
    for i in tqdm(range(num)):
        d = {
            "_id": str(i),
            "_index": TEST_REDDIT_CRYPTO_INDEX_NAME,
            "_type": "_doc",
            "_source": {key: "Hello Monkey Zoo Kangeroo" for key in schema},
        }
        yield d


# Gen dummy Data
dummy_data = gen_dummy_reddit_data(888)

Function `gen_dummy_reddit_data` took: 0.0 seconds


In [8]:
es_conn.bulk_insert_data(index=TEST_REDDIT_CRYPTO_INDEX_NAME, data=dummy_data)

  0%|          | 0/888 [00:00<?, ?it/s]

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 888/888 [00:00<00:00, 7933.39it/s]


In [9]:
# Test Querying
test_query = {"query": {"match_all": {}}}

res = es_conn.run_match_query(index=TEST_REDDIT_CRYPTO_INDEX_NAME, query=test_query)
res[:2]

[]

In [10]:
# Test Index Deletion
es_conn.delete_index(index=TEST_REDDIT_CRYPTO_INDEX_NAME)

### 2. Reddit Extraction and Insertion Pipeline Test

In [11]:
# Pull Test Data
from config.reddit_data_cfg import CRYPTO_SUBREDDITS
from data.extract.reddit_extract import *

{'author': typing.Union[str, NoneType],
 'created': <class 'datetime.datetime'>,
 'id': <class 'str'>,
 'link': typing.Union[str, NoneType],
 'selftext': typing.Union[str, NoneType],
 'subreddit': typing.Union[str, NoneType],
 'title': <class 'str'>,
 'url': <class 'str'>}
{'author': typing.Union[str, NoneType],
 'body': <class 'str'>,
 'created': <class 'datetime.datetime'>,
 'id': <class 'str'>,
 'parentId': typing.Union[str, NoneType],
 'subreddit': typing.Union[str, NoneType],
 'url': <class 'str'>}


In [12]:
# Pull out CryptoCurrency
test_subreddit = CRYPTO_SUBREDDITS[6]

# Pull Configs
test_start_date = datetime.strptime("2021-01-14", "%Y-%m-%d")
test_end_date = datetime.strptime("2021-01-19", "%Y-%m-%d")

print(
    f"""TEST DETAILS:
Subreddit={test_subreddit}
Start Date={test_start_date}
End Date={test_end_date}
Date Range={test_end_date - test_start_date}
"""
)

TEST DETAILS:
Subreddit=CryptoCurrency
Start Date=2021-01-14 00:00:00
End Date=2021-01-19 00:00:00
Date Range=5 days, 0:00:00



In [13]:
# Test get_all_crypto_subreddit_date
results = get_all_crypto_subreddit_data(
    subreddit=test_subreddit, start_date=test_start_date, end_date=test_end_date
)

results_fmt = {test_subreddit: results}

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 51441/51441 [00:03<00:00, 13380.60it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3294/3294 [00:00<00:00, 11425.36it/s]


Function `get_all_crypto_subreddit_data` took: 1080.2062118053436 seconds


In [14]:
print(f"Number of entries from subreddits: {len(results)}")

Number of entries from subreddits: 54735


In [15]:
# Test insert_reddit_to_es
insert_reddit_to_es(results_fmt)

Function `insert_reddit_to_es` took: 5.18245005607605 seconds


In [16]:
# Test Insertion by Querying
res = es_conn.run_match_query(index=REDDIT_CRYPTO_INDEX_NAME, query=test_query)
res[:3]

[{'_index': 'reddit-crypto',
  '_type': '_doc',
  '_id': 'K4REeX4Butm4XmG6kpnI',
  '_score': 1.0,
  '_source': {'id': 'gjatymp',
   'subreddit': 'CryptoCurrency',
   'create_datetime': '2021-01-15T10:20:02',
   'author': 'Buy_More_Bitcoin',
   'full_text': 'Love it\n\nCould you do snx next',
   'type': 'comment',
   'parent_id': 't3_kx45hq'}},
 {'_index': 'reddit-crypto',
  '_type': '_doc',
  '_id': 'LIREeX4Butm4XmG6kpnI',
  '_score': 1.0,
  '_source': {'id': 'gjatybb',
   'subreddit': 'CryptoCurrency',
   'create_datetime': '2021-01-15T10:19:58',
   'author': 'ccModBot',
   'full_text': 'Hello, your post was removed because your account is less than 60 days old or you do not have the required 500 **comment** karma to make post submissions.',
   'type': 'comment',
   'parent_id': 't3_kxkr6l'}},
 {'_index': 'reddit-crypto',
  '_type': '_doc',
  '_id': 'MIREeX4Butm4XmG6kpnI',
  '_score': 1.0,
  '_source': {'id': 'gjatvt9',
   'subreddit': 'CryptoCurrency',
   'create_datetime': '2021-01-

### Test Serialization of Data

In [17]:
from utils.serializer import write_to_pkl, load_fr_pkl
from dataclasses import asdict
import pickle

In [18]:
# Convert to list of dict
results_dict = [asdict(d) for d in results]

In [19]:
# Test serialization
dir_path = "/Users/christopherliew/Desktop/Y4S1/HT/crypto_uncertainty_index/data/raw_data_dump/reddit"
file_path = os.path.join(
    dir_path,
    f"TestRun_{test_subreddit}_{test_start_date.date()}_{test_end_date.date()}.pkl",
)
print(file_path)

/Users/christopherliew/Desktop/Y4S1/HT/crypto_uncertainty_index/data/raw_data_dump/reddit/TestRun_CryptoCurrency_2021-01-14_2021-01-19.pkl


In [20]:
write_to_pkl(file_path, results_dict)

In [22]:
res_from_pkl = load_fr_pkl(file_path)
res_from_pkl

[{'author': 'whatwhatwhichuser',
  'body': 'How does it generate yield?',
  'created': datetime.datetime(2021, 1, 14, 23, 59, 57),
  'id': 'gj8lqvg',
  'parentId': 't1_gj0ezr7',
  'subreddit': 'CryptoCurrency',
  'url': '/r/CryptoCurrency/comments/kvre5l/what_is_ethereum_why_is_ethereum_how_is_ethereum/gj8lqvg/'},
 {'author': 'notaselfdrivingcar',
  'body': 'only Sleeping?',
  'created': datetime.datetime(2021, 1, 14, 23, 59, 56),
  'id': 'gj8lqru',
  'parentId': 't1_gj8ggsf',
  'subreddit': 'CryptoCurrency',
  'url': '/r/CryptoCurrency/comments/kx6p08/san_francisco_man_who_cant_remember_bitcoin/gj8lqru/'},
 {'author': 'siddharta0',
  'body': 'Wasnt btc dead two days ago? üòÇüçª',
  'created': datetime.datetime(2021, 1, 14, 23, 59, 49),
  'id': 'gj8lq6i',
  'parentId': 't3_kx69v0',
  'subreddit': 'CryptoCurrency',
  'url': '/r/CryptoCurrency/comments/kx69v0/good_news_everyone_lindsay_lohan_predicts_100000/gj8lq6i/'},
 {'author': 'GoldenRain99',
  'body': "As you said, spending crypto