<a href="https://colab.research.google.com/github/Angorith/acorn_python/blob/main/0623_33%EC%9D%BC%EC%B0%A8_%EA%B3%B5%ED%86%B5%ED%94%84%EB%A1%9C%EC%A0%9D%ED%8A%B82%EB%B2%88%EC%8B%9C%EA%B3%84%EC%97%B4_%EA%B8%B0%EC%98%A8%EC%9E%90%EB%A3%8C%EC%B2%98%EB%A6%AC%ED%95%A8%EC%88%98.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
%%writefile timeseries_module.py
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime as dt
import glob

def get_filelist(paths):
	"""Split the files matching ``paths + '*'`` into two groups.

	Args:
		paths: Path prefix handed to ``glob`` with ``'*'`` appended.

	Returns:
		A dict with two keys: ``'TIM'`` lists every matched path whose text
		contains ``'TIM'``; ``'MI'`` lists every other matched path. Both
		lists keep the order in which ``glob`` returned the paths.
	"""
	matches = glob.glob(paths + '*')
	# The substring test runs against the whole path, not just the file name.
	return {
		'TIM': [name for name in matches if 'TIM' in name],
		'MI': [name for name in matches if 'TIM' not in name],
	}

def get_data(file_list, index_column=2, parse_date=[2]):
	"""Read each CSV file into a DataFrame that keeps only the ``temp`` column.

	Every file is read as cp949 text, its first row is skipped, and the
	columns are named ``site``, ``name``, ``time`` and ``temp``. The ``site``
	and ``name`` columns are then dropped.

	Args:
		file_list: Iterable of CSV paths. It is not modified.
		index_column: Column position passed to ``read_csv`` as ``index_col``.
		parse_date: Column positions passed to ``read_csv`` as ``parse_dates``.
			The default list is shared between calls, but this function
			never mutates it.

	Returns:
		A dict mapping each path to its DataFrame, inserted in sorted path
		order.
	"""
	data = {}
	# sorted() gives the same processing order as the former in-place
	# file_list.sort() without reordering the caller's list as a side effect.
	for fn in sorted(file_list):
		df = pd.read_csv(
			fn,
			index_col=index_column,
			parse_dates=parse_date,
			encoding='cp949',
			skiprows=[0],
			names=['site', 'name', 'time', 'temp'],
		)
		data[fn] = df.drop(columns=['site', 'name'])
	return data

def missing_check(df, freqs):
	"""Reindex ``df`` onto a regular time grid spanning its first to last label.

	Args:
		df: Frame whose index supplies the start (first label) and end (last
			label) of the grid.
		freqs: Frequency passed to ``pd.date_range``.

	Returns:
		A new frame reindexed on the full grid; grid points that were absent
		from ``df`` hold NaN.
	"""
	first, last = df.index[0], df.index[-1]
	full_grid = pd.date_range(start=first, end=last, freq=freqs)
	return df.reindex(full_grid)

def physical_check(df, lower=-33.0, upper=40):
	"""Replace values outside ``[lower, upper]`` with NaN, in place.

	Args:
		df: Numeric frame to screen. It is modified in place.
		lower: Smallest accepted value; anything strictly below becomes NaN.
		upper: Largest accepted value; anything strictly above becomes NaN.
			The defaults are the limits this check was originally hard-coded
			with (presumably air temperature in degrees Celsius; confirm
			against the data source).

	Returns:
		The same ``df`` object, after masking.
	"""
	out_of_range = (df < lower) | (df > upper)
	df[out_of_range] = np.nan
	return df

def step_check1(df):
	"""Blank rows whose change from the previous row exceeds 3.0 in size.

	A ``step_check`` column holding the row-to-row difference of the first
	column is added. Every row whose difference is below -3.0 or above 3.0
	is set to NaN across all columns. Rows whose difference cannot be
	computed (the first row, and rows next to a missing value) receive a
	large negative sentinel and are therefore blanked as well. The first
	row's original reading is then written back.

	Args:
		df: Frame whose first column holds the readings. It is modified in
			place.

	Returns:
		The same ``df`` object, with the extra ``step_check`` column.

	Raises:
		IndexError: If ``df`` has no rows.
	"""
	first_value = df.iloc[0, 0]
	# The sentinel guarantees that rows without a computable difference fail
	# the limit test below.
	df['step_check'] = df.iloc[:, 0].diff().fillna(-999999.9)
	df[df.step_check.abs() > 3.0] = np.nan
	# The first row is always blanked by the sentinel, so write its reading
	# back unconditionally. A truthiness test here would lose a legitimate
	# first reading of exactly 0.0.
	df.iloc[0, 0] = first_value
	return df

def persistence_check(df):
	"""Blank every hour in which the readings barely changed.

	A ``persis`` column holding the absolute value of ``step_check`` is
	added and summed per clock hour. The last hourly bin is ignored. Every
	row belonging to an hour whose sum is below 0.1 is set to NaN across all
	columns.

	Args:
		df: Frame with a DatetimeIndex and a ``step_check`` column. It is
			modified in place.

	Returns:
		The same ``df`` object, with the extra ``persis`` column.
	"""
	df['persis'] = df.step_check.abs()
	# 'h' is the hourly alias; the upper-case 'H' spelling is deprecated
	# since pandas 2.2.
	hourly_change = df['persis'].resample('h').sum()
	# Leave the final bin out of the test; slicing also copes with an empty frame.
	hourly_change = hourly_change.iloc[:-1]
	stuck_hours = hourly_change[hourly_change < 0.1].index
	if len(stuck_hours):
		# Match on the full hour timestamp rather than the hour of day, so a
		# flagged hour on one day does not blank the same clock hour on
		# every other day covered by the frame.
		in_stuck_hour = df.index.floor('h').isin(stuck_hours)
		df.loc[in_stuck_hour, :] = np.nan
	return df

def keep_data(df, file_name='bird'):
	"""Write ``df`` to a CSV file and return the file name used.

	Args:
		df: Frame to save. Its first and last index labels are formatted as
			``YYYYmmddHHMMSS`` timestamps.
		file_name: Destination path. The default value ``'bird'`` is a
			placeholder meaning "build the name from the first and last
			timestamps".

	Returns:
		The path the CSV was written to.
	"""
	stamp_format = '%Y%m%d%H%M%S'
	first_stamp = dt.datetime.strftime(df.index[0], stamp_format)
	last_stamp = dt.datetime.strftime(df.index[-1], stamp_format)
	target = file_name
	if target == 'bird':
		target = f'OBS_108_AirTemp_{first_stamp}_{last_stamp}.csv'
	df.to_csv(target)
	return target

def resample_hour(m_data):
	"""Aggregate each frame's ``temp`` column to hourly size and mean, then stack.

	For every frame: rows containing NaN are dropped, ``temp`` is resampled
	per hour into ``size`` (row count) and ``mean``, hours with fewer than
	48 rows are discarded, and the result is reindexed onto a continuous
	hourly grid via ``missing_check`` so discarded hours reappear as NaN
	rows.

	Args:
		m_data: Mapping whose values are frames with a DatetimeIndex and a
			``temp`` column. The keys are not used.

	Returns:
		The per-frame hourly results concatenated in mapping order, with
		columns ``size`` and ``mean``. If ``m_data`` is empty, or no frame
		has an hour with at least 48 rows, an empty frame with those
		columns is returned.
	"""
	frames = []
	for df in m_data.values():
		# 'h' is the hourly alias; the upper-case 'H' spelling is deprecated
		# since pandas 2.2.
		hourly = df.dropna().resample('h').agg({'temp': ['size', 'mean']})
		hourly = hourly.droplevel(level=0, axis=1)
		hourly.loc[hourly['size'] < 48, 'mean'] = np.nan
		hourly = hourly.dropna()
		if hourly.empty:
			# Nothing survived the 48-row threshold; there is no first or
			# last label to build a grid from, so skip this frame.
			continue
		frames.append(missing_check(hourly, 'h'))
	if not frames:
		return pd.DataFrame(columns=['size', 'mean'], index=pd.DatetimeIndex([]))
	return pd.concat(frames)

def resample_day(df, col_name='mean'):
	"""Aggregate one column to daily size and mean.

	Args:
		df: Frame with a DatetimeIndex. Rows containing NaN are dropped
			before resampling.
		col_name: Name of the column to aggregate.

	Returns:
		A frame indexed by day with columns ``size`` (row count) and
		``mean``. ``mean`` is NaN for days with fewer than 20 rows.
	"""
	complete_rows = df.dropna()
	daily = complete_rows.resample('D').agg({col_name: ['size', 'mean']})
	# Drop the outer column level (the source column name), keeping size/mean.
	daily.columns = daily.columns.droplevel(0)
	too_few = daily['size'] < 20
	daily.loc[too_few, 'mean'] = np.nan
	return daily

def resample_month(df, col_name='mean'):
	"""Aggregate one column to month-end size and mean.

	Args:
		df: Frame with a DatetimeIndex. Rows containing NaN are dropped
			before resampling.
		col_name: Name of the column to aggregate.

	Returns:
		A frame labelled by month end with columns ``size`` (row count) and
		``mean``. ``mean`` is NaN for months with fewer than 24 rows.
	"""
	# An offset object selects month-end bins without a string alias: 'M' is
	# deprecated since pandas 2.2 and its replacement 'ME' is not understood
	# by older releases.
	month_end = pd.offsets.MonthEnd()
	monthly = df.dropna().resample(month_end).agg({col_name: ['size', 'mean']})
	monthly = monthly.droplevel(level=0, axis=1)
	monthly.loc[monthly['size'] < 24, 'mean'] = np.nan
	return monthly

def timeseries_plot(df):
	"""Blank three rows, gap-fill them three ways, and save a plot and two CSVs.

	Rows 11, 20 and 21 of ``df`` are set to NaN and its columns are renamed
	to ``sizes`` and ``means``; both changes are made in place on the
	caller's frame. The ``means`` values up to ``'2021-09-01'`` are then
	interpolated with the linear, quadratic and cubic methods.

	Files written to the working directory:
		gf.png: plot of the three interpolated series.
		pre_gap_filling.csv: the ``means`` series before filling, rounded to
			2 decimals.
		gap_filled.csv: the three interpolated series, rounded to 2 decimals.

	Args:
		df: Two-column frame with a DatetimeIndex and at least 22 rows.
	"""
	# Create artificial gaps for the interpolation to fill.
	df.iloc[[11, 20, 21], :] = np.nan
	df.columns = ['sizes', 'means']
	data = df.loc[:'2021-09-01', 'means']

	filled_by_method = {}
	for method in ('linear', 'quadratic', 'cubic'):
		filled_by_method[method] = data.interpolate(method=method)
	df_gapfilled = pd.DataFrame(filled_by_method)

	df_gapfilled.plot()
	plt.grid()
	plt.savefig('gf.png')

	data.round(2).to_csv('pre_gap_filling.csv')
	df_gapfilled.round(2).to_csv('gap_filled.csv')

Writing timeseries_module.py


In [6]:
%%writefile client.py
import timeseries_module as tm

# Directory prefix that holds both the per-minute and the hourly CSV files.
paths = './drive/MyDrive/data/'

file_list_dict = tm.get_filelist(paths)

# Per-minute files: load, run the quality-control chain, save each result.
data = tm.get_data(file_list_dict['MI'])

m_data = {}
for key, value in data.items():
	# 'min' is the minute alias; the old 'T' spelling is deprecated since
	# pandas 2.2.
	df = tm.missing_check(value, 'min')
	df = tm.physical_check(df)
	df = tm.step_check1(df)
	print(df)
	df = tm.persistence_check(df)
	key_f = tm.keep_data(df)
	m_data[key_f] = df

# Aggregate the checked per-minute data to hourly, then daily, and save both.
hourly = tm.resample_hour(m_data)
tm.keep_data(hourly, 'OBS_108_AirTemp_hourly_data')
diurnal = tm.resample_day(hourly)
tm.keep_data(diurnal, 'OBS_108_AirTemp_diurnal_data')

# Hourly files: aggregate each one to daily values.
h_data = tm.get_data(file_list_dict['TIM'])

for i in h_data:
	df = h_data[i]
	df = tm.resample_day(df, 'temp')
	# NOTE(review): every hourly file is written to the same output name, so
	# only the last file's daily data remains on disk - confirm this is intended.
	tm.keep_data(df, 'OBS_108_AirTemp_2021_data')

# The monthly aggregate and the gap-filling plot use the daily frame of the
# last hourly file. Skip them when there were no hourly files: otherwise `df`
# would still be the last per-minute frame from the loop further up.
if h_data:
	df_month = tm.resample_month(df)
	tm.timeseries_plot(df)

Overwriting client.py


In [None]:
%run client.py