# 作成処理の確認

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Use this font to avoid encoding errors (presumably for Japanese text in plots).
# NOTE(review): "MS Gothic" is presumably a Windows font — confirm availability on other platforms.
plt.rcParams['font.family'] = "MS Gothic"
import os

import sys
# Make modules under ../src/ importable from this notebook.
sys.path.append('../src/')
sys.dont_write_bytecode = True  # prevent __pycache__ generation

from vege_preprocess import Weather

# 気象データ処理の確認

### チュートリアルのコードの確認

In [2]:
# Read only the first data row; the dtypes inferred from it are used by the
# next cell to build the dtype mapping for the full load.
init_df = pd.read_csv('../data/weather.csv', nrows =1)

In [3]:
# Load the full CSV with every "64" in the dtype names inferred from the
# one-row sample (init_df) replaced by "32" (e.g. int64 -> int32).
dtype_map = init_df.head(1).dtypes.astype('str').str.replace('64', '32').to_dict()
wea_df = pd.read_csv('../data/weather.csv', dtype=dtype_map)
print(wea_df.shape)

# Extract year / month: characters 0-3 of the stringified `date` are the year
# and characters 4-5 the month. The string conversion is done once for the
# whole column instead of calling a Python lambda per row.
date_str = wea_df['date'].astype(str)
wea_df['year'] = date_str.str[:4].astype('int64')
wea_df['month'] = date_str.str[4:6].astype('int64')

# =========================================================================================================
# Aggregation step 1: mean / max / min of each weather column per (area, year, month).
agg_cols = ['mean_temp', 'max_temp', 'min_temp', 'sum_rain', 'sun_time', 'mean_humid']
gb_df = wea_df.groupby(['area', 'year', 'month'])[agg_cols].agg(['mean','max','min']).reset_index()

# Flatten the (column, statistic) MultiIndex into "<statistic>_<column>".
# The group keys restored by reset_index() have an empty second level and
# keep their plain name.
new_cols = [f'{stat}_{name}' if stat else name for name, stat in gb_df.columns]
gb_df.columns = new_cols

# =========================================================================================================
# Aggregation step 2: average the per-area statistics over all areas for each (year, month).
agg_cols = [c for c in gb_df.columns if c not in ['year', 'month', 'area']]
tmp_df = gb_df.groupby(['year', 'month'])[agg_cols].agg(['mean']).reset_index()

# Only one statistic ('mean') was requested, so the first level alone is kept.
new_cols = [name for name, _stat in tmp_df.columns]
tmp_df.columns = new_cols
tmp_df['area'] = '全国'
# Reorder so the columns line up with gb_df before concatenating.
tmp_df = tmp_df[gb_df.columns]

# =========================================================================================================
# Concatenation step: per-area rows followed by the all-area ('全国') rows.
out_df = pd.concat([gb_df, tmp_df])

(204320, 10)


In [4]:
# Reproduce both aggregation stages through the Weather class.
# Stage 1 relies on add_agg_features' default scope / agg_types (defined in
# vege_preprocess, not visible here); stage 2 averages over (year, month).
test = Weather()
gb_df_test = test.add_agg_features(wea_df, head_name='各地')
tmp_df_test = test.add_agg_features(
    gb_df_test,
    scope=['year', 'month'],
    agg_types=['mean'],
    head_name='全国',
)

In [5]:
# Show the shapes, then the column labels, of the hand-written result and the
# two Weather.add_agg_features results, one item per line.
print(*(frame.shape for frame in (gb_df, gb_df_test, tmp_df_test)), sep='\n')
print(*(frame.columns for frame in (gb_df, gb_df_test, tmp_df_test)), sep='\n')

(6720, 21)
(6720, 21)
(210, 20)
Index(['area', 'year', 'month', 'mean_mean_temp', 'max_mean_temp',
       'min_mean_temp', 'mean_max_temp', 'max_max_temp', 'min_max_temp',
       'mean_min_temp', 'max_min_temp', 'min_min_temp', 'mean_sum_rain',
       'max_sum_rain', 'min_sum_rain', 'mean_sun_time', 'max_sun_time',
       'min_sun_time', 'mean_mean_humid', 'max_mean_humid', 'min_mean_humid'],
      dtype='object')
Index(['area', 'year', 'month', '各地_mean_temp_mean', '各地_mean_temp_max',
       '各地_mean_temp_min', '各地_max_temp_mean', '各地_max_temp_max',
       '各地_max_temp_min', '各地_min_temp_mean', '各地_min_temp_max',
       '各地_min_temp_min', '各地_sum_rain_mean', '各地_sum_rain_max',
       '各地_sum_rain_min', '各地_sun_time_mean', '各地_sun_time_max',
       '各地_sun_time_min', '各地_mean_humid_mean', '各地_mean_humid_max',
       '各地_mean_humid_min'],
      dtype='object')
Index(['year', 'month', '全国_各地_mean_temp_mean_mean',
       '全国_各地_mean_temp_max_mean', '全国_各地_mean_temp_min_mean',
       '全国_各

In [6]:
# Output check: the result should be 0 (no mismatching elements).
tutorial_values = tmp_df.drop('area', axis = 1).to_numpy()
class_values = tmp_df_test.to_numpy()
# The comparison below is positional; without this guard a shape difference
# could be broadcast (or collapsed to a single boolean) and give a meaningless count.
assert class_values.shape == tutorial_values.shape, (class_values.shape, tutorial_values.shape)
(class_values != tutorial_values).sum()

0

In [7]:
# Run Weather.preprocess() on a fresh instance and display the result; the
# following cell compares it with the hand-written aggregation (tmp_df).
# NOTE(review): preprocess() is called without arguments, so it presumably
# loads the weather data itself — confirm in vege_preprocess.
test2 = Weather()
res = test2.preprocess()
res

Unnamed: 0,year,month,全国_各地_mean_temp_mean_mean,全国_各地_mean_temp_max_mean,全国_各地_mean_temp_min_mean,全国_各地_max_temp_mean_mean,全国_各地_max_temp_max_mean,全国_各地_max_temp_min_mean,全国_各地_min_temp_mean_mean,全国_各地_min_temp_max_mean,全国_各地_min_temp_min_mean,全国_各地_sum_rain_mean_mean,全国_各地_sum_rain_max_mean,全国_各地_sum_rain_min_mean,全国_各地_sun_time_mean_mean,全国_各地_sun_time_max_mean,全国_各地_sun_time_min_mean,全国_各地_mean_humid_mean_mean,全国_各地_mean_humid_max_mean,全国_各地_mean_humid_min_mean
0,2004,11,13.046750,18.481251,8.406250,17.580376,22.950001,11.609375,9.046250,15.787500,4.287500,3.456875,42.578125,0.0,5.446625,9.478125,0.000000,67.877502,87.96875,49.59375
1,2004,12,8.011492,14.731250,1.662500,12.469456,21.259375,4.171875,4.086089,10.281250,-1.271875,3.554940,54.796875,0.0,4.952319,9.059375,0.000000,65.214722,85.18750,48.43750
2,2005,1,4.076210,7.975000,0.912500,8.123286,14.028125,3.909375,0.388609,4.356250,-3.300000,2.198085,26.328125,0.0,4.467498,8.940625,0.006250,64.005043,84.96875,46.59375
3,2005,2,4.199888,9.156250,0.143750,8.256920,14.599999,2.921875,0.549442,5.568750,-3.390625,2.989955,25.125000,0.0,4.314951,9.321875,0.000000,63.796875,88.56250,46.25000
4,2005,3,7.202218,12.068750,1.228125,11.891432,18.268749,4.409375,2.865927,8.950000,-2.656250,3.122984,29.046875,0.0,5.416835,10.934375,0.000000,61.677418,87.09375,41.53125
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
205,2021,12,6.865927,12.515625,0.768750,11.254233,17.865625,3.565625,2.880746,8.868750,-2.609375,2.730847,30.562500,0.0,5.176210,9.109375,0.062500,68.081650,89.53125,49.62500
206,2022,1,4.046069,7.256250,0.859375,8.298690,12.337500,3.906250,0.276109,4.681250,-3.653125,1.477319,20.171875,0.0,5.263206,9.231250,0.003125,64.570564,89.18750,49.43750
207,2022,2,4.209933,8.478125,0.787500,8.639286,14.621875,3.737500,0.331696,4.337500,-3.053125,1.676897,17.187500,0.0,5.717634,10.218750,0.012500,62.570312,87.65625,46.65625
208,2022,3,9.924496,15.990625,4.509375,15.028427,22.528126,7.868750,5.475907,11.756250,0.087500,3.296371,38.281250,0.0,5.885484,11.046875,0.009375,66.190521,89.40625,43.00000


In [8]:

# Per-column count of elements where the preprocess() result differs from the
# hand-written aggregation (tmp_df without its 'area' column); all should be 0.
reference_values = tmp_df.drop('area', axis = 1).to_numpy()
res_values = res.to_numpy()
# The comparison is positional; fail loudly on a shape difference instead of
# letting it be broadcast into a meaningless count.
assert res_values.shape == reference_values.shape, (res_values.shape, reference_values.shape)
(res_values != reference_values).sum(axis = 0)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [9]:
# Values from `res` on the rows where `res` and `tmp_df_test` disagree in this column.
# The mask uses the absolute difference so rows where `res` is the larger value
# are selected too (a plain `> 0` on the signed difference would skip them).
a = res['全国_各地_mean_temp_mean_mean'][(tmp_df_test['全国_各地_mean_temp_mean_mean'] - res['全国_各地_mean_temp_mean_mean']).abs() > 0]

In [10]:
# Values from `tmp_df_test` on the rows where `tmp_df_test` and `res` disagree in this column.
# The mask uses the absolute difference so rows where `res` is the larger value
# are selected too (a plain `> 0` on the signed difference would skip them).
b = tmp_df_test['全国_各地_mean_temp_mean_mean'][(tmp_df_test['全国_各地_mean_temp_mean_mean'] - res['全国_各地_mean_temp_mean_mean']).abs() > 0]

In [11]:
# Difference between the values selected from `res` (a) and from `tmp_df_test` (b);
# an empty Series means the masks selected no rows.
a - b

Series([], Name: 全国_各地_mean_temp_mean_mean, dtype: float32)