In [1]:
import unittest
import pandas as pd 
from glob import glob

import sys
sys.path.append('python')

from localDataAccess import *
from datedFiles import *
from dataLoaderMethods import *
from dataLoader import DataLoader




In [22]:
path = 'python/tests/unit/'

class TestDataLoader(unittest.TestCase):

    @classmethod
    def setUpClass(cls):
        files_original = glob(f'{path}fixtures/lambda_func/original/*')
        files_wordindex = glob(f'{path}fixtures/lambda_func/wordindex/*')
        localdata = LocalDataAccess(f'{path}fixtures/lambda_func/original')
        cls.data1 =  pd.read_json(files_original[0], orient='records', lines=True)
        cls.dateddatafiles = DatedDataFiles(files_original, id_varname = 'id', date_prefix='records_')
        base_timestamp = pd.Timestamp('2020-08-25')
        cls.datedfilenamefilter = DatedFilenameFilter(base_timestamp, days=2, no_newer=True)
        cls.datedwordindexfilter = DatedWordindexFilter(files_wordindex, 'protest')
        cls.dataloader = DataLoader(localdata)
        cls.df = cls.dataloader.load(cls.dateddatafiles, cls.datedfilenamefilter, cls.datedwordindexfilter,
            orient='records', lines=True, float_dtype='float64')

    def test_DataLoader_dtypes(self):
        dtypes = self.data1.dtypes
        self.assertTrue(
            all([self.df.dtypes[var] == dtypes[var] for var in self.data1.columns]),
            'data types should be:' + str(dtypes))

    def test_DataLoader_len(self):
        self.assertEqual(len(self.df), 94220, 'len() should be' + str(94220))


In [23]:
# Drive the test case by hand instead of through a unittest runner:
# create an instance, then run the class-level fixture setup explicitly.
test1 = TestDataLoader()
test1.setUpClass()

In [18]:
# Manual run of the dtype check; raises AssertionError on a mismatch.
test1.test_DataLoader_dtypes()

In [19]:
# Inspect the file list held by the DatedDataFiles fixture object.
test1.dateddatafiles.files

['python/tests/unit/fixtures/lambda_func/original/records_2020-08-23.json',
 'python/tests/unit/fixtures/lambda_func/original/records_2020-08-24.json',
 'python/tests/unit/fixtures/lambda_func/original/records_2020-08-25.json']

In [20]:
# Manual run of the row-count check (asserts len(df) == 94220).
test1.test_DataLoader_len()

In [24]:
# Actual row count of the loaded frame.
# NOTE(review): the recorded output (140078) differs from the 94220 asserted
# in test_DataLoader_len -- confirm which value is stale.
len(test1.df)

140078

In [7]:
class TestDataLoaderMethods(unittest.TestCase):
    
    @classmethod
    def setUpClass(cls):
        files_original = glob(f'{path}fixtures/lambda_func/original/*')
        files_wordindex = glob(f'{path}fixtures/lambda_func/wordindex/*')
        cls.localdata = LocalDataAccess('python/tests/unit/fixtures/lambda_func/original')
        cls.data1 =  pd.read_json(files_original[0], orient='records', lines=True)
        cls.dateddatafiles = DatedDataFiles(files_original, id_varname = 'id', date_prefix='records_')
        cls.datedwordindexfilter = DatedWordindexFilter(files_wordindex, 'protest')
        cls.dateddatafiles.apply_id_filter(cls.datedwordindexfilter)
        cls.dataloaderjson = DataLoaderJson(orient='records', lines=True, float_dtype='float64')
        cls.dataloaderjson.load_data(cls.dateddatafiles, cls.localdata)
        cls.dataloadercsv = DataLoaderCSV(float_dtype='float64')

        files_sentiments = glob(f'{path}fixtures/lambda_func/sentiments/*')
        cls.localdata2 = LocalDataAccess('python/tests/unit/fixtures/lambda_func/sentiments')
        cls.data2 =  pd.read_csv(files_sentiments[0])
        cls.dateddatafiles2 = DatedDataFiles(files_sentiments, id_varname = 'id', date_prefix='records_')
        cls.dateddatafiles2.apply_id_filter(cls.datedwordindexfilter)
        cls.dataloadercsv.load_data(cls.dateddatafiles2, cls.localdata2)

    def test_DataLoaderJson_dtypes(self):
        dtypes = self.data1.dtypes
        self.assertTrue(
            all([self.dataloaderjson.df.dtypes[var] == dtypes[var] for var in self.data1.columns]),
            'data types should be:' + str(dtypes))

    def test_DataLoaderJson_len(self):
        self.assertEqual(len(self.dataloaderjson.df), 210122, 'len() should be' + str(210122))


    def test_DataLoaderCSV_dtypes(self):
        dtypes = self.data2.dtypes
        self.assertTrue(
            all([self.dataloadercsv.df.dtypes[var] == dtypes[var] for var in self.data2.columns]),
            'data types should be:' + str(dtypes))

    def test_DataLoaderCSV_len(self):
        self.assertEqual(len(self.dataloadercsv.df), 309546, 'len() should be' + str(309546))


In [8]:
# Drive the test case by hand: create an instance, then run the class-level
# fixture setup explicitly.
test2=TestDataLoaderMethods()
test2.setUpClass()

In [9]:
# Manual run of the CSV dtype check; raises AssertionError on a mismatch.
test2.test_DataLoaderCSV_dtypes()

In [72]:
# dtypes of the reference frame read directly with pd.read_csv.
test2.data2.dtypes

id                int64
created_at_h     object
neg             float64
neu             float64
pos             float64
compound        float64
dtype: object

In [73]:
# dtypes of the frame produced by DataLoaderCSV, for comparison with the
# reference above. NOTE(review): the recorded output shows float32 columns
# where the reference has float64.
test2.dataloadercsv.df.dtypes

id                int64
created_at_h     object
neg             float32
neu             float32
pos             float32
compound        float32
dtype: object

In [28]:
import unittest
import pandas as pd
from pandas.testing import assert_frame_equal
from collections import Counter
from glob import glob

import sys
sys.path.append('..')

from utilities import *
from tweetRetweetData import *

path_root = 'python/tests/unit/'


class TestTweetRetweetStats(unittest.TestCase):
    
    @classmethod
    def setUpClass(cls):
        print('setting up TestTweetRetweetStats')
        path = f'{path_root}fixtures/lambda_func/'
        cls.path = path
        df_sentiments1 = pd.read_csv(glob(path + 'sentiments/*')[0])
        df_emotions1 = pd.read_csv(glob(path + 'emotions/*')[0])
        df_original1 = pd.read_json(glob(path + 'original/*')[0], orient='records', lines=True)
        df_retweet1 = pd.read_json(glob(path + 'retweet/*')[0], orient='records', lines=True)
        df_words1 = pd.read_json(glob(path + 'words/*')[0], orient='records', lines=True)
        base_timestamp = pd.Timestamp('2020-08-24-22')
        cls.stat_sentiments1 = calc_stat_sentiments(df_sentiments1)
        cls.stat_emotions1 = calc_stat_emotions(df_emotions1)
        tweet_retweet_data = TweetRetweetData(df_original1, df_retweet1, df_words1, now=base_timestamp)
        cls.stat_words1 = tweet_retweet_data.stat_words
        cls.top_tweets1 = tweet_retweet_data.top_tweets
        cls.top_users1 = tweet_retweet_data.top_users

    def test_stat_sentiments(self):
        expected = pd.read_csv(self.path + 'validation_objects/' + 'stat_sentiments1.csv')
        assert_frame_equal(self.stat_sentiments1, expected)

    def test_stat_sentiments(self):
        expected = pd.read_csv(self.path + 'validation_objects/' + 'stat_emotions1.csv')
        assert_frame_equal(self.stat_emotions1, expected)

    def test_stat_words(self):
        expected = pd.read_json(self.path + 'validation_objects/' + 'stat_words1.json', orient='records',lines=True)
        self.assertTrue(
            all(list(self.stat_words1.columns == expected.columns) + \
                [len(self.stat_words1) == len(expected)]), 'should be' + str(len(expected)))

    def test_top_tweets(self):
        expected = pd.read_json(self.path + 'validation_objects/' + 'top_tweets1.json', orient='records',lines=True)
        expected['RT_id'] = expected['RT_id'].astype(str)
        self.top_tweets1['followers_count'] = self.top_tweets1['followers_count'].astype(int)
        self.top_tweets1['retweet_timespan'] = self.top_tweets1['retweet_timespan'].astype(int)
        self.top_tweets1['retweet_total'] = self.top_tweets1['retweet_total'].astype(int)
        assert_frame_equal(self.top_tweets1, expected)

    def test_top_users(self):
        expected = pd.read_json(self.path + 'validation_objects/' + 'top_users1.json', orient='records',lines=True)
        expected['user_id'] = expected['user_id'].astype(str)
        expected['RT_id'] = expected['RT_id'].astype(str)
        self.top_users1['followers_count'] = self.top_users1['followers_count'].astype(int)
        self.top_users1['following_count'] = self.top_users1['following_count'].astype(int)
        self.top_users1['retweeted'] = self.top_users1['retweeted'].astype(int)
        assert_frame_equal(self.top_users1, expected)



In [29]:
# Drive the test case by hand: create an instance, then run the class-level
# fixture setup explicitly (prints a setup message).
test3 = TestTweetRetweetStats()
test3.setUpClass()


setting up TestTweetRetweetStats


In [38]:
# Manual run of the word-statistics check; raises AssertionError on failure.
test3.test_stat_words()

In [42]:
# Inspect the computed word statistics row for the 'today' subset.
test3.stat_words1.set_index('subset').loc['today']

token_counter    {'black': 1840, 'say': 1851, 'man': 1641, 'bre...
count                                                        10000
Name: today, dtype: object

In [35]:
# Load the stored reference word statistics for side-by-side inspection.
expected = pd.read_json(test3.path + 'validation_objects/' + 'stat_words1.json', orient='records',lines=True)

In [43]:
# Stored reference row for the 'today' subset, to compare with the computed one.
expected.set_index('subset').loc['today']

token_counter    {'say': 1937, 'car': 1660, 'get': 1609, 'break...
count                                                        10000
Name: today, dtype: object

In [45]:
test3.stat_words1.set_index('subset').loc['today']
# Repeat of the earlier inspection of the computed 'today' row.

token_counter    {'black': 1840, 'say': 1851, 'man': 1641, 'bre...
count                                                        10000
Name: today, dtype: object

In [46]:
# Repeat of the earlier inspection of the stored 'today' row.
# NOTE(review): the recorded token_counter values differ from the computed
# ones shown for the same subset.
expected.set_index('subset').loc['today']

token_counter    {'say': 1937, 'car': 1660, 'get': 1609, 'break...
count                                                        10000
Name: today, dtype: object