In [1]:
import requests as rq
from datetime import timedelta, datetime, date
import io
import tarfile as tf
import gzip as gz
import pandas as pd

from ApiHandler import ApiHandler

In [2]:
class ApiMarketHistory(ApiHandler):
    """Download one day of EVEKit market history and shape it into a data frame.

    The daily archive is a tarball whose members are gzip-compressed, header-less
    CSV files. `get_raw_data` downloads the archive bytes and `build_data` turns
    them into a single pandas DataFrame.

    `self.verbose`, `self._verbose` and `self._email` are not defined in this
    class; they are expected to come from `ApiHandler` (not shown here).
    """
    settings = {
        **ApiHandler.settings,
        'day_delay': 2,           # days subtracted from today to get the default pull date
        'record_warning': 40000   # frames with fewer rows than this trigger a warning e-mail
    }
    url = {
        **ApiHandler.url,
        'data': 'https://storage.googleapis.com/evekit_md/{year:04}/{month:02}/{day:02}/market_{year:04}{month:02}{day:02}.tgz',
    }
    # Not referenced in this class; presumably consumed by ApiHandler -- TODO confirm.
    sql = {
        **ApiHandler.sql,
        'date_check': 'SELECT DISTINCT record_date FROM MarketHistory;'
    }
    # Not referenced in this class; presumably consumed by ApiHandler -- TODO confirm.
    script_vals = {
        **ApiHandler.script_vals,
        'table': 'MarketHistory'
    }
    name = 'EVEKit Market History API'

    # Column names applied to every (header-less) CSV member of the archive.
    _csv_columns = (
        'type_id', 'region_id', 'order_count', 'low_price',
        'avg_price', 'high_price', 'volume', 'record_date'
    )

    @property
    def data_date(self):
        """Default pull date: today minus ``settings['day_delay']`` days."""
        return date.today() - timedelta(days=self.settings['day_delay'])

    def get_raw_data(self, date=None):
        """Download the market history archive for one day.

        Args:
            date: Object exposing ``year``, ``month`` and ``day`` (e.g. a
                ``datetime.date``). Defaults to ``self.data_date``.

        Returns:
            The raw bytes of the downloaded ``.tgz`` archive.

        Raises:
            Exception: If the server answers with a status code other than 200.
            requests.exceptions.Timeout: If the server stops responding for longer
                than ``settings['request_timeout']`` seconds (default 60).
        """
        if date is None:
            date = self.data_date
        if self.verbose:
            self._verbose('get_raw_data', 'Getting raw data for date {date}...'.format(date=date))

        # Without a timeout a stalled connection would block the process forever.
        data_conn = rq.get(
            self.url['data'].format(year=date.year, month=date.month, day=date.day),
            timeout=self.settings.get('request_timeout', 60)
        )

        if data_conn.status_code != 200:
            raise Exception("""\
                Connection returned a {status} code on pull for {date}.
                Message body:
                {body}\
            """.format(
                status=data_conn.status_code,
                date=date,
                # errors='replace' keeps a non-UTF-8 body from masking the HTTP error.
                body=data_conn.content.decode('utf-8', errors='replace')
            ))

        if self.verbose:
            self._verbose('get_raw_data', 'Raw data acquired.')
        return data_conn.content

    def build_data(self, raw_data):
        """Turn the raw archive bytes into a single cleaned data frame.

        Every readable member of the tar archive is parsed with `_parse_data`
        and the results are concatenated. ``record_date`` is converted from
        epoch milliseconds to a ``YYYY-MM-DD`` string. A warning e-mail is sent
        when the frame has fewer rows than ``settings['record_warning']``, and
        another when rows with ``avg_price > high_price`` are found; in those
        rows the two values are swapped.

        Args:
            raw_data: Bytes of a tar archive as returned by `get_raw_data`.

        Returns:
            pandas.DataFrame with the columns listed in ``_csv_columns``.

        Raises:
            ValueError: If the archive contains no readable file members.
        """
        if self.verbose:
            self._verbose('build_data', 'Building data frame...')

        frames = []
        # The context manager closes the archive; members are read eagerly inside it.
        with tf.open(fileobj=io.BytesIO(raw_data)) as tar_file:
            for member in tar_file:
                member_file = tar_file.extractfile(member)
                # extractfile() returns None for members that are not files (e.g. directories).
                if member_file is None:
                    continue
                frames.append(self._parse_data(member_file.read()))

        if not frames:
            raise ValueError('Market history archive contains no data files.')

        data_frame = pd.concat(frames).reset_index(drop=True)
        # record_date arrives as epoch milliseconds.
        data_frame['record_date'] = pd.to_datetime(data_frame['record_date'], unit='ms').dt.strftime('%Y-%m-%d')

        record_count = len(data_frame)
        # Guard against an empty frame so the warning e-mail itself cannot fail.
        record_dates = data_frame['record_date'].unique()
        record_date = record_dates[0] if len(record_dates) else None

        if self.verbose:
            self._verbose('build_data', 'Data frame built. %s records.' % record_count)

        if record_count < self.settings['record_warning']:
            if self.verbose:
                self._verbose(
                    'build_data',
                    'Data frame is {records}, which is under warning threshold of {record_warning} records. Sending e-mail.'.format(
                        records=record_count,
                        record_warning=self.settings['record_warning']
                    )
                )
            self._email(
                'warning',
                """\
                Data extracted for date {date} has {records}, which is below the threshold of {record_warning} records.
                This is a warning, not an error. Barring no other problems, the process will complete sucessfully.\
                """.format(
                    date=record_date,
                    records=record_count,
                    record_warning=self.settings['record_warning']
                )
            )

        error_rows = data_frame['avg_price'] > data_frame['high_price']
        flipped_count = error_rows.sum()
        if flipped_count > 0:
            if self.verbose:
                self._verbose(
                    'build_data',
                    'Data frame has {records} with flipped avg_price and high_price. Correcting & sending e-mail.'.format(
                        records=flipped_count
                    )
                )
            self._email(
                'warning',
                """\
                Data extracted for date {date} has {records} with flipped avg_price and high_price. Values are corrected.
                This is a warning, not an error. Barring no other problems, the process will complete sucessfully.\
                """.format(
                    date=record_date,
                    records=flipped_count
                )
            )
            # Swap via raw values: assigning a DataFrame through .loc aligns on column
            # labels, so the swap must bypass alignment to be explicit and reliable.
            data_frame.loc[error_rows, ['avg_price', 'high_price']] = (
                data_frame.loc[error_rows, ['high_price', 'avg_price']].values
            )

        return data_frame

    def _parse_data(self, type_file):
        """Parse one gzip-compressed, header-less CSV member into a data frame.

        Args:
            type_file: Gzip-compressed CSV content as bytes.

        Returns:
            pandas.DataFrame with the columns listed in ``_csv_columns``.
        """
        # Decompress inside a context manager so the gzip handle is always closed.
        with gz.open(io.BytesIO(type_file)) as gz_file:
            csv_bytes = gz_file.read()
        return pd.read_csv(
            io.BytesIO(csv_bytes),
            header=None, index_col=None,
            names=self._csv_columns
        )

In [6]:
# Instantiate the market history handler with its default arguments.
api = ApiMarketHistory()
# Full run left disabled. run_process is not defined in this notebook
# (presumably inherited from ApiHandler -- TODO confirm).
#api.run_process()

In [30]:
# Disabled backfill: the commented-out lines loop over a date range, then build
# and insert each day's frame. Only the single-day download below is active.
#import tqdm
#for date_val in tqdm.tqdm_notebook(pd.date_range(date(2019,7,27), date(2019,8,23))):
# Download the raw archive for 2019-09-25 for manual inspection in the cells below.
raw_data = api.get_raw_data(date(2019,9,25))
#data_frame = api.build_data(raw_data)
#api.insert_data(data_frame)

In [31]:
# Rebuild the combined frame by hand (parse every archive member, concatenate,
# reset the index) so the unconverted values can be inspected.
# The context manager closes the archive once every member has been read.
with tf.open(fileobj=io.BytesIO(raw_data)) as tar_file:
    member_files = (tar_file.extractfile(member) for member in tar_file)
    data_files = pd.concat([
        api._parse_data(member_file.read())
        for member_file in member_files
        # extractfile() returns None for members that are not files (e.g. directories).
        if member_file is not None
    ]).reset_index(drop=True)

In [32]:
# Rows whose reported average price is above the reported high price.
mask = data_files['avg_price'].gt(data_files['high_price'])
data_files[mask]

Unnamed: 0,type_id,region_id,order_count,low_price,avg_price,high_price,volume,record_date
1,18,10000016,27,3.980000e+01,4.828000e+01,4.800000e+01,533047,1569369600000
2,18,10000064,43,3.671000e+01,4.500000e+01,3.720000e+01,1206728,1569369600000
4,18,10000002,63,3.501000e+01,5.000000e+01,4.996000e+01,3305782,1569369600000
22,20,10000016,24,7.501000e+01,3.739900e+02,1.900000e+02,170029,1569369600000
32,20,10000067,8,1.210000e+02,3.889900e+02,2.790000e+02,64664,1569369600000
...,...,...,...,...,...,...,...,...
43286,52694,10000032,10,2.500000e+06,4.499000e+06,4.230714e+06,22,1569369600000
43289,52694,10000035,4,1.696000e+06,1.698759e+06,1.696345e+06,8,1569369600000
43292,52694,10000039,15,1.900000e+06,2.500000e+06,1.920690e+06,29,1569369600000
43294,52694,10000069,5,2.489000e+06,2.500000e+06,2.490571e+06,7,1569369600000
