In [1]:
import numpy as np
import pandas as pd
import benedict

import config
import pinky

# Database handles built from the local `config` module: the client, the
# database selected by config.database, and the collection selected by
# config.weathers_collection. `col` is what the cells below query.
client = config.client
db = client[config.database]
col = db[config.weathers_collection]

In [2]:
# Notebook display settings: render floats with a thousands separator and two
# decimals, and never truncate the contents of wide cells.
pd.set_option('display.float_format', '{:,.2f}'.format)
pd.set_option('display.max_colwidth', None)

In [3]:
def update_keys(ref, check, as_kv_list=False):
    ''' Make sure that the keys for check are the same as those in ref.

    ``check`` is modified in place: every entry whose key does not occur in
    ``ref`` is removed, and every key of ``ref`` that ``check`` lacks is added
    with the value ``None``. Values already present in ``check`` are kept.

    Each argument may be a dict or a list of ``(key, value)`` tuples; the
    container type is detected per argument. Missing keys are added in the
    order in which they appear in ``ref``.

    :param ref: the dictionary whose keys are to be referenced
    :type ref: dict or list of (key, value) tuples
    :param check: the dict to have its keys checked and updated
    :type check: dict or list of (key, value) tuples
    :param as_kv_list: kept for backward compatibility. Lists of tuples are
        recognised by their type, so the flag no longer changes the result.
    :type as_kv_list: bool
    :returns: None
    '''

    def _keys(container):
        ''' Return the keys of a dict or of a list of (key, value) tuples. '''
        if isinstance(container, dict):
            return list(container.keys())
        return [pair[0] for pair in container]

    ref_keys = _keys(ref)
    ref_key_set = set(ref_keys)

    if isinstance(check, dict):
        # Collect the surplus keys first; a dict must not change size while
        # it is being iterated.
        for key in [k for k in check if k not in ref_key_set]:
            check.pop(key)
        for key in ref_keys:
            if key not in check:
                check[key] = None
    else:
        # Slice assignment keeps the caller's list object, so the update is
        # visible to the caller just as it is for a dict.
        check[:] = [pair for pair in check if pair[0] in ref_key_set]
        present = {pair[0] for pair in check}
        for key in ref_keys:
            if key not in present:
                check.append((key, None))
                present.add(key)
    return

def strip_keys(df):
    ''' Replace every dict held in a pandas.DataFrame with the list of its
    values; entries that are not dicts are kept as they are.

    Each row of ``df`` becomes one column of the result, named after the
    row's index label, and the result's rows are numbered positionally.
    '''

    def values_only(entry):
        ''' Return a dict's values as a list; pass anything else through. '''
        return list(entry.values()) if isinstance(entry, dict) else entry

    stripped = [
        pd.Series(
            [values_only(entry) for entry in record],
            name=record.name,
            dtype=object,
        )
        for _, record in df.iterrows()
    ]
    return pd.concat(stripped, axis=1, ignore_index=False)

def compare_dicts(one, the_other, return_type='dict', as_kv_list=False):
    ''' Compare the values of two dicts, key by key, and collect the results.

    For every key ``k`` of ``one`` the result depends on the type of
    ``one[k]``:

    * int or float: ``one[k] - the_other[k]`` when the other value is also an
      int or float; otherwise the key is left out of the result
    * str: 0 when the two strings are equal, 1 when they differ
    * dict: the comparison of the two sub-dicts, returned as a list
    * list: one result dict per pair of elements (the elements are expected
      to be dicts)
    * anything else (None, bool, ...): the flag value 0, set without looking
      at ``the_other``

    A key that has to be looked up in ``the_other`` but is missing there is
    reported with a printed message and left out of the result.

    :params one, the_other: dictionaries with the same set of keys and sub-keys
    :type one, the_other: dict (or list of (key, value) tuples, see below)
    :param return_type: 'dict' returns the results keyed like ``one``; 'list'
        returns only the result values. Any other value returns None.
    :type return_type: str
    :param as_kv_list: when True, ``one`` and ``the_other`` may be given as
        lists of (key, value) tuples instead of dicts
    :type as_kv_list: bool
    '''

    if as_kv_list:
        # A list of (key, value) tuples cannot be indexed by key, so build
        # dicts for the lookups below. Arguments that already are dicts are
        # used unchanged.
        if not isinstance(one, dict):
            one = dict(one)
        if not isinstance(the_other, dict):
            the_other = dict(the_other)

    delta = {}  # The delta document. Contains all the forecast errors

    for k, v in one.items():
        try:
            # Exact type checks on purpose: bool and dict subclasses fall
            # through to the flag value, as they always have.
            if type(v) in (int, float):
                if type(the_other[k]) in (int, float):
                    delta[k] = v - the_other[k]
            elif type(v) == dict:
                delta[k] = compare_dicts(v, the_other[k], return_type='list')
            elif type(v) == str:
                delta[k] = 0 if v == the_other[k] else 1
            elif type(v) == list:
                delta[k] = [
                    compare_dicts(item, other_item)
                    for item, other_item in zip(v, the_other[k])
                ]
            else:
                delta[k] = 0
        except KeyError as e:
            print(f'missing key..... {e}')

    if return_type == 'dict':
        return delta
    if return_type == 'list':
        return list(delta.values())

def tups_to_dict(tups):
    ''' Build a dictionary from an iterable of (key, value) tuples.

    When a key occurs more than once, the value of its first occurrence is
    the one that is kept.
    '''
    result = {}
    for key, value in tups:
        if key not in result:
            result[key] = value
    return result


In [4]:
def read_mongo_to_df(collection, filters={}, limit=None):
    ''' Query a MongoDB collection and return the matching documents as a
    pandas DataFrame.

    :param collection: a MongoDB client.database.collection object
    :param filters: a well formed mongo query, passed to ``find()``
    :type filters: dict
    :param limit: the number of documents to take from the cursor; ``None``
        takes all of them
    :type limit: int
    '''

    cursor = collection.find(filters)
    records = list(cursor[:limit])
    return pd.DataFrame.from_records(records)

def records_to_rows(col, filters={}, limit=100):
    ''' Request records from the database collection and convert them to a
    pandas.DataFrame with one row per document.

    Each document is flattened with benedict, its keys are sorted, and the
    flattened keys become the column names; the document's '_id' is used as
    the row's index label. Cursor items that are not dicts are skipped.

    :param col: the collection to query; must provide ``find()``
    :param filters: a well formed MongoDB query
    :type filters: dict
    :param limit: the number of documents to take from the cursor; ``None``
        takes all of them
    :type limit: int
    :returns: the combined DataFrame, or an empty DataFrame when the query
        yields no documents
    '''

    docs = col.find(filters, batch_size=100)[:limit]
    frames = []
    for row in docs:
        if not isinstance(row, dict):
            continue
        # Only the 'weather' entry is unwrapped, and only when it really is
        # a list. Triggering on any list value (as before) crashed on
        # documents holding a second list or no 'weather' key at all.
        weather = row.get('weather')
        if isinstance(weather, list):
            row['weather'] = weather[0]
        # These next lines convert the dict to a benedict before flattening,
        # sorting by keys, and then converting back to a dict.
        bene = benedict.benedict.flatten(row)
        flat_bene = benedict.benedict(bene)
        sorted_flat_bene = flat_bene.items_sorted_by_keys()
        sorted_flat_dict = tups_to_dict(sorted_flat_bene)
        frames.append(pd.DataFrame(sorted_flat_dict, index=[row['_id']]))
    if not frames:
        # pd.concat() raises on an empty list; hand back an empty frame.
        return pd.DataFrame()
    return pd.concat(frames)

def read_mongo_a(col, filters={}, limit=None):
    ''' Retrieve data from the Mongo database and transform it to a pandas
    DataFrame; return the DataFrame.

    Each document is flattened with benedict and its keys are sorted before
    it becomes one row of the result. A message is printed when ``limit`` is
    given.

    :param col: the MongoDB collection to be read
    :type col: pymongo.collection.Collection
    :param filters: a well formed MongoDB query
    :type filters: dict
    :param limit: optional limiter to the number of documents retrieved
    :type limit: int
    :returns: the combined DataFrame, or an empty DataFrame when the query
        yields no documents
    '''

    # Shorten the cursor length if limit is given, otherwise get everything.
    docs = col.find(filters)[:limit]
    weathers = []
    for doc in docs:
        if isinstance(doc, dict):
            # Only the 'weather' entry is unwrapped, and only when it really
            # is a list. Triggering on any list value (as before) crashed on
            # documents holding a second list or no 'weather' key at all.
            weather = doc.get('weather')
            if isinstance(weather, list):
                doc['weather'] = weather[0]
        # Convert the dict to a benedict, flatten it, sort it, convert it back
        # to a dict, and finally transform the dict to a one-row DataFrame
        # that is concatenated with the others at the end.
        bene = benedict.benedict(doc).flatten().items_sorted_by_keys()
        dic = tups_to_dict(bene)
        df = pd.DataFrame.from_dict(dic, orient='index')
        weathers.append(df.transpose())
    if limit:
        print(f'The length of your df has been limited to {limit}.')
    if not weathers:
        # pd.concat() raises on an empty list; hand back an empty frame.
        return pd.DataFrame()
    return pd.concat(weathers)


In [5]:
def find_item_with_kv_pair(series, key, value):
    '''Find and return the first item in a given pandas Series that has the
    given key-value pair.

    Dict items match when ``item[key] == value``. List items match when one
    of their elements is a pair (tuple or list of at least two entries) whose
    first entry equals ``key`` and whose second entry equals ``value``.
    Items of any other type, and list elements that are not pairs, are
    skipped.

    :param series: a pandas series
    :type series: pandas.Series
    :param key: the key the function should search for
    :type key: str
    :param value: the value the function should compare to
    :type value: anything that supports '=='

    :returns: the matching item (the whole dict or list), or None
    :raises TypeError: when ``series`` is not a pandas.Series
    '''

    if not isinstance(series, pd.Series):
        raise TypeError("find_item_with_kv_pair() wants a pandas.Series.")

    for item in series:
        if isinstance(item, dict):
            if key in item and item[key] == value:
                return item
        elif isinstance(item, list):
            for elem in item:
                # Guard against elements that cannot be a (key, value) pair;
                # indexing them would raise, or compare single characters
                # when the element is a string.
                is_pair = isinstance(elem, (tuple, list)) and len(elem) >= 2
                if is_pair and elem[0] == key and elem[1] == value:
                    return item
    return None
    
def flatten_to_series(df):
    ''' Lay all the cells of a DataFrame out in one long column.

    The cells are visited row by row. Each cell is labelled with the string
    concatenation of its column name and its row label (column name first).
    Despite the function's name, the result is a pandas.DataFrame built from
    the list of cell values with those labels as its index.

    :param df: the dataframe to be flattened
    :type df: pandas.DataFrame
    '''

    labels, values = [], []
    for row_label, record in df.iterrows():
        for column, cell in record.items():
            labels.append(str(column) + str(row_label))
            values.append(cell)
    return pd.DataFrame(values, index=labels)

def flatten_to_single_row(df):
    ''' A function to convert a DataFrame to a single row DataFrame.

    The index of ``df`` is first turned into ordinary columns. Every cell is
    then visited row by row and becomes one column of the result, named by
    the string concatenation of its column name and its (new, positional) row
    number. ``df`` itself is left unchanged.

    :param df: the dataframe to be flattened
    :type df: pandas.DataFrame
    :returns: the flattened data, transposed so the labels become columns
    :rtype: pandas.DataFrame
    '''

    # reset_index() returns a new frame. The previous in-place reset altered
    # the caller's DataFrame as a side effect.
    flat = df.reset_index()
    labels = []
    values = []
    for row_number, record in flat.iterrows():
        for column, cell in zip(record.index, record):
            labels.append(str(column) + str(row_number))
            values.append(cell)
    return pd.DataFrame(values, index=labels).transpose()

def flat_and_concat(flist):
    ''' Flatten a list of DataFrames and concat the flattened versions
    together and return as a single DataFrame.

    This is useful when you have a list of DataFrames, each of them
    representing a collection of related data. Every DataFrame in the list is
    flattened with flatten_to_single_row(); list entries that are not
    DataFrames are ignored.

    :param flist: A list of pandas.DataFrames.
    :type flist: list
    :returns: the flattened frames stacked on top of each other, or an empty
        DataFrame when the list holds no DataFrame
    :raises TypeError: when ``flist`` is not a list
    '''
    if not isinstance(flist, list):
        raise TypeError('flat_and_concat() has to have a list of DataFrames.')
    # The module is imported as `pd`; the bare name `pandas` is not defined.
    flattened = [
        flatten_to_single_row(item)
        for item in flist
        if isinstance(item, pd.DataFrame)
    ]
    if not flattened:
        # pd.concat() raises on an empty list; hand back an empty frame.
        return pd.DataFrame()
    return pd.concat(flattened)

def make_instants(df, _return=True):
    ''' Turn the weathers collection DataFrame into a DataFrame of instants.

    ``df`` must carry an index level named 'timeplace'. For every unique
    timeplace the matching rows are selected and flattened into a single row
    with flatten_to_single_row(); the single rows are then concatenated.

    :param df: DataFrame of raw documents, indexed by (at least) 'timeplace'
    :type df: pandas.DataFrame
    :param _return: when true the combined DataFrame is returned; otherwise
        it is written to 'instants.npy' with numpy.save and None is returned
    '''

    flattened = [
        flatten_to_single_row(df.loc[timeplace])
        for timeplace in df.index.unique(level='timeplace')
    ]
    combined = pd.concat(flattened)
    if not _return:
        np.save('instants.npy', combined)
        return
    return combined

def make_inst(df, min_count=38):
    ''' Create instant Series from the DataFrame.

    Step through each row of the DataFrame and check the row's count of
    non-null items. Rows with fewer than ``min_count`` of them are skipped.
    For the remaining rows the null items are dropped, each dict is flattened
    and key-sorted, and the resulting Series is collected. The collected rows
    are combined into a new DataFrame, which is saved to 'instants.npy' and
    returned.

    :param df: DataFrame whose cells hold the documents (dicts) of an instant
    :type df: pandas.DataFrame
    :param min_count: minimum number of non-null items a row needs in order
        to be used; the default of 38 keeps the original "count <= 37 is
        skipped" rule
    :type min_count: int
    :raises ValueError: from pandas.concat when no row qualifies
    '''

    instants = []
    for _, row in df.iterrows():
        if row.count() < min_count:
            continue
        row = row.dropna()
        obs = find_item_with_kv_pair(row, 'type', 'obs')
        # Series.items() replaces iteritems(), which pandas 2.0 removed.
        for _, item in row.items():
            if isinstance(item, dict):
                # Only the 'weather' entry is unwrapped, and only when it
                # really is a list. Triggering on any list value (as before)
                # failed when a dict held a second list or no 'weather' key.
                weather = item.get('weather')
                if isinstance(weather, list):
                    item['weather'] = weather[0]
                if obs is not None and item.get('type') == 'cast':
                    update_keys(item, obs)
            # NOTE(review): list items count as casts when their first
            # element is 'cast'; whether update_keys() can meaningfully align
            # such a list with obs is not clear from this file. Confirm.
            if isinstance(item, list) and obs is not None:
                if item[0] == 'cast':
                    update_keys(item, obs)
        # These next lines convert the dicts to benedicts before flattening,
        # sorting by keys, and then converting back to dicts.
        flat_data = row.apply(benedict.benedict.flatten)
        sorted_items = flat_data.apply(benedict.benedict.items_sorted_by_keys)
        flat_sorted_data = sorted_items.apply(tups_to_dict)
        instants.append(flat_sorted_data)
    instants = pd.concat(instants, axis=1, ignore_index=False).transpose()
    np.save('instants.npy', instants)
    return instants

def make_data(series):
    ''' Collect the values of every dict held in a pandas.Series.

    Each dict item is replaced by the list of its values; items that are not
    dicts are left out of the result.

    :param series: the Series whose dict items are to be stripped of keys
    :type series: pandas.Series
    :returns: a Series of value lists (dtype object) with the same name as
        ``series`` and a fresh positional index
    :rtype: pandas.Series
    '''

    # Series.items() replaces iteritems(), which pandas 2.0 removed.
    data = [
        list(item.values())
        for _, item in series.items()
        if isinstance(item, dict)
    ]
    return pd.Series(data, name=series.name, dtype=object)

def make_data_df(df):
    ''' Create the DataFrame that will contain the data to be used as the
    Data dataset to go along with the Target dataset.

    Go through ``df`` row by row. When the row contains an observation (an
    item with 'type' == 'obs'), the first dict item of that type is removed
    from the row. The remaining dict items are reduced to their value lists
    with make_data(). The per-row results are combined into one DataFrame,
    which is saved to 'forecast_values.npy' and returned.

    :param df: DataFrame whose cells hold the documents (dicts) of an instant
    :type df: pandas.DataFrame
    '''

    data = []
    for _, row in df.iterrows():
        obs = find_item_with_kv_pair(row, 'type', 'obs')
        if obs is not None:
            # Series.items() replaces iteritems(), which pandas 2.0 removed.
            for label, item in row.items():
                # .get() tolerates dicts without a 'type' key.
                if isinstance(item, dict) and item.get('type') == 'obs':
                    row.pop(label)
                    # Stop right away: the row must not be iterated further
                    # after an item has been removed from it.
                    break
        data.append(make_data(row))
    data_df = pd.concat(data, axis=1, ignore_index=False).transpose()
    np.save('forecast_values.npy', data_df)
    return data_df
    

def make_deltas(series):
    ''' Compare every forecast in a pandas.Series with the observation found
    in the same Series and return the comparison results.

    The observation is the first item with 'type' == 'obs'. Each dict item
    whose 'type' is 'cast', and each list item whose first element is 'cast',
    is passed to update_keys() together with the observation and then
    compared against it with compare_dicts(). When the Series holds no
    observation the result is empty.

    NOTE(review): update_keys() and compare_dicts() are called on list items
    without their as_kv_list flag; confirm that list items are handled as
    intended.

    :returns: pandas.Series of result lists, named like ``series``
    '''

    obs = find_item_with_kv_pair(series, 'type', 'obs')
    results = []
    for entry in series:
        if obs is None:
            continue
        if isinstance(entry, dict):
            is_cast = entry['type'] == 'cast'
        elif isinstance(entry, list):
            is_cast = entry[0] == 'cast'
        else:
            is_cast = False
        if is_cast:
            update_keys(entry, obs)
            results.append(compare_dicts(obs, entry, return_type='list'))
    return pd.Series(results, name=series.name, dtype=object)

def make_deltas_df(df):
    ''' Build the complete deltas DataFrame.

    make_deltas() is applied to every row of ``df``. The resulting Series are
    placed side by side and the whole is transposed, so that each row of
    ``df`` yields one row of deltas. The result is saved to
    'delta_values.npy' and returned.
    '''

    # The list starts with an empty DataFrame, so there is always at least
    # one object to concatenate, even when df has no rows.
    pieces = [pd.DataFrame()]
    pieces.extend(make_deltas(record) for _, record in df.iterrows())
    deltas_df = pd.concat(pieces, axis=1, ignore_index=False).transpose()
    np.save('delta_values.npy', deltas_df)
    return deltas_df


In [29]:
import pymongo

# Index creation on 'timeplace', left disabled.
# col.create_index([('timeplace', pymongo.ASCENDING)])
# Fetch every document stored for one timeplace and show how many there are.
tpd = [doc for doc in col.find({'timeplace': 'dnhzms0000001599091200'})]
len(tpd)

30

In [38]:
# Load the documents of a single timeplace as a DataFrame, one row per
# document. limit=None lifts records_to_rows()' default limit of 100.
filters = {'timeplace': 'dnhzms0000001599091200'}

rdf = records_to_rows(col, filters, limit=None)

In [None]:
import pinky

# Columns removed from rdf before it is indexed.
drop_cols = ['_id',
             'base',
             'dt',
             'cod',
             'coord_lat',
             'coord_lon',
             'sys_type',
             'dt_txt',
             'pop',
             'sys_pod',
             'weather_description',
             'weather_icon',
             'weather_id',
             'weather_main'
            ]
# NOTE(review): every step below changes rdf in place, so this cell cannot be
# run a second time without first rebuilding rdf (the dropped columns would
# no longer be found).
rdf.drop(columns=drop_cols, inplace=True)
# pinky.favor is applied to each 'tt_inst' value with trans=False; what it
# computes is not visible in this notebook.
rdf['tt_inst'] = rdf.loc[:, 'tt_inst'].apply(pinky.favor, trans=False)
# Index the rows by timeplace, tt_inst and type, then sort by that index.
rdf.set_index(['timeplace', 'tt_inst', 'type'], inplace=True)
rdf.sort_index(inplace=True)
rdf

In [96]:
# NOTE(review): this module-level `instants` list is never filled in the
# visible cells; sort_out_inst() works on its own local list of that name.
instants = []
# Read the timeplaces to process from legits.txt, one per line, without the
# trailing newline.
tps = []
with open('legits.txt', 'r') as tp:
    for row in tp:
        tps.append(row.strip('\n'))

def sort_out_inst(tps, limit=None, expected_rows=40):
    ''' Load the records of each timeplace and sort the timeplaces into
    complete ("legit") and incomplete ones.

    For every timeplace the matching documents are read with
    records_to_rows(). A timeplace whose frame has exactly ``expected_rows``
    rows is legit: unwanted columns are dropped, 'tt_inst' is passed through
    pinky.favor, and the frame is indexed and sorted by
    (timeplace, tt_inst, type). Any other timeplace is recorded as not legit
    when its frame contains an 'obs' row, and as a "wtf" otherwise.

    The three lists of timeplaces are written to 'legits.txt',
    'not_legits.txt' and 'wtfs.txt', one timeplace per line. The files are
    rewritten after every timeplace, as before.

    NOTE(review): 'legits.txt' is opened in 'w' mode, so any earlier contents
    of that file are replaced. Confirm this is intended when the input list
    was read from the same file.

    :param tps: the timeplaces to process; a trailing newline is stripped
    :type tps: iterable of str
    :param limit: only the first ``limit`` timeplaces are processed; ``None``
        processes all of them
    :type limit: int
    :param expected_rows: the number of rows a timeplace needs in order to
        count as legit
    :type expected_rows: int
    :returns: the frames of all legit timeplaces concatenated, or an empty
        DataFrame when there are none
    :rtype: pandas.DataFrame
    '''

    # 'base', 'cod', 'coord_lat', 'coord_lon' and 'sys_type' are deliberately
    # kept; they were commented out of this list.
    drop_cols = ['_id',
                 'dt',
                 'dt_txt',
                 'pop',
                 'sys_pod',
                 'weather_description',
                 'weather_icon',
                 'weather_id',
                 'weather_main'
                ]

    def _write_lines(path, lines):
        ''' Overwrite the file at path with the given lines. '''
        with open(path, 'w') as handle:
            for line in lines:
                handle.write(line + '\n')

    instants = []
    legits = []
    wtfs = []
    not_legits = []
    for row in list(tps)[:limit]:
        # Strip once and use the clean value everywhere, so that entries
        # which still carry a newline are not written back with blank lines.
        timeplace = row.strip('\n')
        rdf = records_to_rows(col, {'timeplace': timeplace}, limit=None)
        if len(rdf.index) == expected_rows:
            rdf = rdf.drop(columns=drop_cols)
            rdf['tt_inst'] = rdf['tt_inst'].apply(pinky.favor, trans=False)
            rdf = rdf.set_index(['timeplace', 'tt_inst', 'type']).sort_index()
            instants.append(rdf)
            legits.append(timeplace)
        elif 'type' in rdf.columns and 'obs' in rdf['type'].values:
            not_legits.append(timeplace)
        else:
            wtfs.append(timeplace)
        _write_lines('legits.txt', legits)
        _write_lines('not_legits.txt', not_legits)
        _write_lines('wtfs.txt', wtfs)
    if not instants:
        # pd.concat() raises on an empty list; hand back an empty frame.
        return pd.DataFrame()
    # Return the collected frames. `legits` only holds the timeplace strings
    # and cannot be concatenated.
    return pd.concat(instants)


In [97]:
from itertools import zip_longest

# Filled with the raw lines of legits.txt further down in this cell.
lst = []
def grouper(iterable, n, fillvalue=None):
    ''' Collect data into fixed-length chunks or blocks.

    grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx
    '''
    # The same iterator is handed to zip_longest n times, so each output
    # tuple consumes n consecutive items.
    shared = iter(iterable)
    return zip_longest(*[shared for _ in range(n)], fillvalue=fillvalue)

# Append every line of legits.txt to lst (newline characters included) and
# print the list.
# NOTE(review): the recorded output is [], i.e. the file held no lines when
# this ran. sort_out_inst() opens legits.txt in 'w' mode, which replaces its
# contents. Confirm that this is intended.
with open('legits.txt', 'r') as l:
    for i in l:#range(2):
        lst.append(i)
print(str(lst))
# grouper(str(lst), 22)


[]


In [None]:
# Process up to 10000 timeplaces and show the combined result.
# NOTE(review): `timeplaces` is not assigned in any visible cell (the list
# read from legits.txt is named `tps`). Confirm where it comes from.
inst = sort_out_inst(timeplaces, limit=10000)
inst

In [99]:
# Show the module-level `instants` list. sort_out_inst() collects its frames
# in a local list of the same name, so this one is not filled by it.
instants

[]

In [33]:
# (rows, columns) of rdf. NOTE(review): the execution counts in this notebook
# are out of order, so which version of rdf this refers to should be
# confirmed.
rdf.shape

(30, 25)

In [55]:
# NOTE(review): make_instants() reads df.index.unique(level='timeplace') and
# therefore needs a DataFrame indexed by 'timeplace', while `instants` is
# created as a list in this notebook. Confirm what was passed here.
inst_df = make_instants(instants)

In [56]:
# Preview the first rows of the flattened instants.
inst_df.head()

Unnamed: 0,tt_inst0,type0,clouds_all0,instant0,location_lat0,location_lon0,main_feels_like0,main_grnd_level0,main_humidity0,main_pressure0,...,coord_lon40,id40,name40,sys_country40,sys_id40,sys_sunrise40,sys_sunset40,sys_type40,timezone40,rain_1h40
0,0,obs,25,1599091200,35.05,-83.08,297.55,,70,1013,...,,,,,,,,,,
0,0,cast,99,1600916400,34.83,-84.35,288.46,954.0,90,1017,...,,,,,,,,,,
0,151200,cast,8,1601089200,33.99,-83.39,294.81,994.0,97,1016,...,,,,,,,,,,
0,151200,cast,100,1598626800,34.04,-83.12,305.22,997.0,67,1017,...,,,,,,,,,,
0,0,cast,35,1600797600,34.34,-84.05,293.0,981.0,47,1025,...,,,,,,,,,,


In [15]:
# Summary statistics per column. NOTE(review): this cell's execution count
# (15) is lower than that of the cell that builds inst_df (55), so the
# recorded output may stem from an earlier inst_df.
inst_df.describe()

Unnamed: 0,tt_inst0,base0,clouds_all0,cod0,coord_lat0,coord_lon0,id0,instant0,location_lat0,location_lon0,...,timezone0,type0,visibility0,wind_deg0,wind_speed0,main_grnd_level0,main_sea_level0,main_temp_kf0,rain_3h0,rain_1h0
count,1000,25,1000,25.0,25.0,25.0,25.0,1000,1000.0,1000.0,...,25.0,1000,1000,1000,1000.0,975.0,975.0,975.0,461.0,2.0
unique,41,1,81,1.0,4.0,8.0,17.0,41,4.0,8.0,...,1.0,2,26,167,333.0,20.0,9.0,62.0,248.0,2.0
top,140400,stations,100,200.0,33.82,-84.22,4218165.0,1598119200,33.82,-84.22,...,-14400.0,cast,10000,0,1.5,983.0,1016.0,0.0,0.31,0.15
freq,25,25,325,25.0,8.0,4.0,2.0,25,328.0,164.0,...,25.0,975,975,20,21.0,111.0,234.0,876.0,10.0,1.0


In [None]:
# Print every column name of inst_df on its own line.
for row in inst_df.columns:
    print(row)

In [None]:
# Build the forecast-values DataFrame from inst; make_data_df() also writes
# it to 'forecast_values.npy'.
make_data_df(inst)

In [None]:
# Build the deltas DataFrame from inst; make_deltas_df() also writes it to
# 'delta_values.npy'.
make_deltas_df(inst)