In [1]:
import numpy as np
import pandas as pd

from pandas.core.frame import DataFrame
from pandas.core.indexes.datetimes import DatetimeIndex

from pathlib import Path, PosixPath

from tqdm.notebook import tqdm

In [2]:
class WashData:
    """Base class that reshapes raw Excel sheets into a time-by-date grid.

    Subclasses supply the input/output folders and the list of dates; this
    class provides the shared 5-minute time axis, the 'MMDD' -> date lookup
    and the per-file restructuring routine.
    """

    def __init__(self) -> None:
        # Root folder under which subclasses place their raw/cleaned folders.
        self.data_path = Path('./data/')

        # Row labels ('HH:MM', one per 5 minutes) used by every output table.
        self.time_list = self._get_time_list()

    def _get_time_list(self) -> list:
        """Return the 288 'HH:MM' labels of one day in 5-minute steps.

        The labels run from '00:00' to '23:55'. They are built arithmetically
        instead of through ``pd.date_range(..., closed='left')`` because the
        ``closed`` keyword was removed from ``date_range`` in pandas 2.0, which
        made the constructor fail with ``TypeError`` on current pandas.

        Returns:
            list[str]: zero-padded 'HH:MM' strings in chronological order.
        """
        return [
            '{:02d}:{:02d}'.format(hour, minute)
            for hour in range(24)
            for minute in range(0, 60, 5)
        ]

    def _get_date_dict(self, date_list: DatetimeIndex) -> dict:
        """Map 'MMDD' strings to the matching timestamps of ``date_list``.

        Args:
            date_list: the dates to index; a ``DatetimeIndex`` or anything
                ``pd.DatetimeIndex`` can be built from.

        Returns:
            dict[str, pd.Timestamp]: e.g. ``{'0901': Timestamp('2020-09-01')}``.
        """
        dates = pd.DatetimeIndex(date_list)
        return dict(zip(dates.strftime('%m%d'), dates))

    def _structured_data(
        self,
        data_path: Path,
        file: Path,
        date_list: DatetimeIndex,
        date_dict: dict
    ) -> None:
        """Rewrite one raw sheet as a (time of day x date) table and save it.

        The raw sheet is read with ``pd.read_excel``; its 'Unnamed: 0' column
        is dropped, 'time' becomes the index and zeros are replaced by NaN.
        Every cell whose row label formats to 'HH:MM' and whose column header
        is a key of ``date_dict`` is copied into an empty template indexed by
        ``self.time_list`` with ``date_list`` as columns. Cells that cannot be
        mapped are skipped. The result is written to
        ``data_path / '<file stem>.xlsx'``.

        Args:
            data_path: folder receiving the cleaned workbook (created if
                missing).
            file: path of the raw workbook to read.
            date_list: column labels of the output table.
            date_dict: lookup from raw column header to output column label.
        """
        template = pd.DataFrame(columns=date_list, index=self.time_list)

        df = (
            pd.read_excel(file)
            .drop(['Unnamed: 0'], axis=1)
            .set_index(['time'])
            .replace(0, np.nan)
        )

        for raw_row in df.index:
            # The row label depends only on the row, so format it once per row
            # rather than once per cell. A label that is not an integer-like
            # number cannot be mapped; skip the whole row (best effort).
            try:
                digits = '%04d' % raw_row
            except Exception:
                continue
            row = '{}:{}'.format(digits[:2], digits[2:])

            for raw_col in df.columns:
                # Best effort per cell: unknown column headers or ambiguous
                # lookups are skipped. ``Exception`` (not a bare ``except``)
                # keeps KeyboardInterrupt/SystemExit able to stop the loop.
                try:
                    col = date_dict[raw_col]
                    template.loc[row, col] = df.loc[raw_row, raw_col]
                except Exception:
                    continue

        # ``to_excel`` does not create missing folders.
        data_path.mkdir(parents=True, exist_ok=True)
        template.to_excel(data_path / '{}.xlsx'.format(file.stem))

In [3]:
class WashSepNov(WashData):
    """Restructures the raw September-November workbooks."""

    def __init__(self) -> None:
        super().__init__()

        # Raw and cleaned folders differ only in the parentheses used in the
        # name (full-width for the raw input, half-width for the output).
        self.sep_nov_raw_data_path = self.data_path / '速度（9-11月）'
        self.sep_nov_data_path = self.data_path / '速度(9-11月)'

        self.sep_nov_date_list = self._get_sep_nov_date()
        self.date_dict = self._get_date_dict(self.sep_nov_date_list)

    def _get_sep_nov_date(self) -> DatetimeIndex:
        """Return every day from 2020-09-01 to 2020-11-30 except 10-01..10-08.

        NOTE(review): the excluded week is presumably a holiday period with no
        usable data - confirm with the data owner.
        """
        all_days = pd.date_range(start='2020-09-01', end='2020-11-30', freq='D')
        excluded = pd.date_range(start='2020-10-01', end='2020-10-08', freq='D')
        return all_days.drop(excluded)

    def run(self) -> None:
        """Restructure every '.xls' file found in the raw folder.

        The work list is collected up front so the progress bar total matches
        the number of files actually processed, instead of relying on a
        hard-coded count that also ticked for non-'.xls' directory entries.
        Files are handled in sorted order for reproducible runs.
        """
        print('正在清洗9-11月的数据，请稍候…')

        xls_files = sorted(
            path for path in self.sep_nov_raw_data_path.iterdir()
            if path.suffix == '.xls'
        )
        for file in tqdm(xls_files):
            self._structured_data(
                self.sep_nov_data_path,
                file,
                self.sep_nov_date_list,
                self.date_dict
            )

In [4]:
class WashMar(WashData):
    """Restructures the raw March workbooks."""

    def __init__(self) -> None:
        super().__init__()

        # Raw and cleaned folders differ only in the parentheses used in the
        # name (full-width for the raw input, half-width for the output).
        self.mar_raw_data_path = self.data_path / '速度（3月）'
        self.mar_data_path = self.data_path / '速度(3月)'

        self.mar_date_list = self._get_mar_date()
        self.date_dict = self._get_date_dict(self.mar_date_list)

    def _get_mar_date(self) -> DatetimeIndex:
        """Return the 31 days of March 2020 as a daily ``DatetimeIndex``.

        An explicit inclusive end date replaces the former
        ``end='2020-04-01', closed='left'``: the ``closed`` keyword was removed
        from ``pd.date_range`` in pandas 2.0 and raised ``TypeError`` there.
        """
        return pd.date_range(start='2020-03-01', end='2020-03-31', freq='D')

    def run(self) -> None:
        """Restructure every '.xls' file found in the raw folder.

        The work list is collected up front so the progress bar total matches
        the number of files actually processed, instead of relying on a
        hard-coded count that also ticked for non-'.xls' directory entries.
        Files are handled in sorted order for reproducible runs.
        """
        print('正在清洗3月的数据，请稍候…')

        xls_files = sorted(
            path for path in self.mar_raw_data_path.iterdir()
            if path.suffix == '.xls'
        )
        for file in tqdm(xls_files):
            self._structured_data(
                self.mar_data_path,
                file,
                self.mar_date_list,
                self.date_dict
            )

In [5]:
def main() -> None:
    """Run both cleaning jobs: September-November first, then March."""
    # Each job is constructed right before it runs, one after the other.
    for job_cls in (WashSepNov, WashMar):
        job_cls().run()


if __name__ == '__main__':
    main()

正在清洗9-11月的数据，请稍候…


  0%|          | 0/231 [00:00<?, ?it/s]

正在清洗3月的数据，请稍候…


  0%|          | 0/232 [00:00<?, ?it/s]