In [None]:
!pip install meteocalc

In [None]:
# Import the required modules
import numpy as np
import pandas as pd
# Show up to 150 columns when a DataFrame is displayed.
# The fully qualified key is used because short keys are resolved by pattern
# matching and fail once more than one option name contains "max_columns".
pd.set_option('display.max_columns', 150)
import gc

import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.patches as patches

import seaborn as sns
from plotly import tools, subplots
import plotly.offline as py
py.init_notebook_mode(connected=True)
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.express as px
import plotly.graph_objs as go
import datetime

from meteocalc import feels_like, Temp
import os, random, math, psutil, pickle

# This notebook assumes it runs inside a Kaggle kernel.
# When working elsewhere (e.g. locally), point INPUT_DIR at the directory that
# holds the competition files; every path below is derived from it.
INPUT_DIR = '../input/ashrae-energy-prediction'
print(os.listdir(INPUT_DIR))

# Only the training-side tables are used in this notebook.
train_df = pd.read_csv(os.path.join(INPUT_DIR, 'train.csv'))
weather_train_df = pd.read_csv(os.path.join(INPUT_DIR, 'weather_train.csv'))
building_meta_df = pd.read_csv(os.path.join(INPUT_DIR, 'building_metadata.csv'))

# The 'timestamp' columns are intentionally left as strings (no pd.to_datetime
# here): fill_weather_dataset parses them with datetime.strptime, and the later
# merge joins train_df and weather_train_df on the raw 'timestamp' values.

In [None]:
# Replace meter_reading with log(1 + meter_reading); np.expm1 recovers the
# original scale.
train_df['meter_reading'] = train_df['meter_reading'].pipe(np.log1p)

In [None]:
def fill_weather_dataset(weather_df, site_ids=None):
    """Add rows for missing hourly timestamps and impute missing weather values.

    Steps:
      1. Build every hourly timestamp between the smallest and largest
         ``timestamp`` in ``weather_df`` and, for each site, append a row for
         each hour that site does not have (all weather columns NaN).
      2. For each weather column, fill NaN values with the mean of that column
         over the same ``(site_id, day-of-month, month)`` group.
      3. For ``cloud_coverage``, ``sea_level_pressure`` and
         ``precip_depth_1_hr`` the group means are forward-filled first, in
         sorted ``(site_id, day, month)`` order, so a group with no
         observations at all borrows the mean of the preceding group (which
         can belong to a different site).

    Parameters
    ----------
    weather_df : pandas.DataFrame
        Must contain ``site_id``, ``timestamp`` (strings formatted as
        ``"%Y-%m-%d %H:%M:%S"``) and the seven weather columns imputed below.
        The frame passed in is not modified.
    site_ids : iterable, optional
        Sites for which missing hours are added. Defaults to every distinct
        ``site_id`` present in ``weather_df``, in ascending order.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index, ``site_id`` as the first column
        and no helper columns. Values can remain NaN where neither the group
        nor (for forward-filled columns) any preceding group has data.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    TypeError, ValueError
        From ``datetime.strptime`` if the minimum/maximum timestamp is not a
        string in the expected format (including an empty frame).
    """
    time_format = "%Y-%m-%d %H:%M:%S"
    group_keys = ["site_id", "day", "month"]
    # (column, forward-fill the group means before using them?)
    fill_plan = [
        ("air_temperature", False),
        ("cloud_coverage", True),
        ("dew_temperature", False),
        ("sea_level_pressure", True),
        ("wind_direction", False),
        ("wind_speed", False),
        ("precip_depth_1_hr", True),
    ]

    # Every hourly timestamp from the first to the last one present, as strings.
    start_date = datetime.datetime.strptime(weather_df["timestamp"].min(), time_format)
    end_date = datetime.datetime.strptime(weather_df["timestamp"].max(), time_format)
    total_hours = int(((end_date - start_date).total_seconds() + 3600) / 3600)
    hours_list = [
        (end_date - datetime.timedelta(hours=offset)).strftime(time_format)
        for offset in range(total_hours)
    ]

    if site_ids is None:
        site_ids = np.sort(weather_df["site_id"].unique())

    # Collect the gap rows per site and concatenate once instead of growing the
    # frame inside the loop. Concatenating also yields a new frame, so the
    # caller's object is never mutated.
    frames = [weather_df]
    for site_id in site_ids:
        site_hours = weather_df.loc[weather_df["site_id"] == site_id, "timestamp"].to_numpy()
        missing = np.setdiff1d(hours_list, site_hours)
        if len(missing) == 0:
            continue
        frames.append(pd.DataFrame({"timestamp": missing, "site_id": site_id}))
    weather_df = pd.concat(frames, ignore_index=True)

    # Helper grouping keys; removed again before returning.
    timestamps = pd.to_datetime(weather_df["timestamp"])
    weather_df["day"] = timestamps.dt.day
    weather_df["month"] = timestamps.dt.month

    for column, forward_fill in fill_plan:
        group_means = weather_df.groupby(group_keys)[column].mean()
        if forward_fill:
            group_means = group_means.ffill()
        # Broadcast each group's mean back onto its rows, then fill only the
        # NaN cells; existing observations are never overwritten.
        per_row = weather_df[group_keys].join(group_means, on=group_keys)[column]
        weather_df[column] = weather_df[column].fillna(per_row)

    weather_df = weather_df.drop(columns=["day", "month"])
    ordered = ["site_id"] + [name for name in weather_df.columns if name != "site_id"]
    return weather_df[ordered]

In [None]:
# Usage is as follows: the weather table is replaced by the result of fill_weather_dataset
weather_train_df = fill_weather_dataset(weather_train_df)

In [None]:
def get_meteorological_features(data):
    """Add ``relative_humidity`` and ``feels_like`` columns to ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``air_temperature``, ``dew_temperature`` and
        ``wind_speed``. The frame is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with the two new columns added.

    Notes
    -----
    Rows are processed positionally, so any index (not only 0..n-1) and an
    empty frame are supported.
    """
    def calculate_rh(df):
        # Relative humidity in percent from dew point and air temperature
        # (Magnus-style approximation with coefficients 17.625 and 243.04).
        dew = df['dew_temperature']
        air = df['air_temperature']
        df['relative_humidity'] = 100 * (
            np.exp((17.625 * dew) / (243.04 + dew))
            / np.exp((17.625 * air) / (243.04 + air))
        )

    def calculate_fl(df):
        # Feels-like temperature per row via meteocalc; air temperature is
        # passed as Celsius and the `.f` attribute of the result is stored.
        # NOTE(review): meteocalc documents feels_like's wind_speed in mph and
        # `.f` as Fahrenheit - confirm the units of df['wind_speed'].
        df['feels_like'] = [
            feels_like(Temp(at, unit='C'), rh, ws).f
            for at, rh, ws in zip(
                df['air_temperature'].to_numpy(),
                df['relative_humidity'].to_numpy(),
                df['wind_speed'].to_numpy(),
            )
        ]

    calculate_rh(data)
    calculate_fl(data)
    return data

In [None]:
# Calling the function as below generates the features
# (the last two columns are the generated features)
weather_train_df = get_meteorological_features(weather_train_df)
weather_train_df.head()

In [None]:
# Join "building_id" and "meter" as strings to form the "building_meter" column.
# Example: building_id=1111, meter=0 -> building_meter="1111_0"
# Vectorized string concatenation avoids a Python-level call per row.
train_df['building_meter'] = (
    train_df['building_id'].astype(str) + '_' + train_df['meter'].astype(str)
)

# Distinct "building_meter" values, in order of first appearance.
all_building_meter = train_df['building_meter'].unique().tolist()
# Dictionary tying each "building_meter" value to an integer code.
# Example: "1111_0" -> 50 (the number itself is arbitrary)
building_meter_codes = np.arange(len(all_building_meter))
building_meter_map = dict(zip(all_building_meter, building_meter_codes))

# Map every "building_meter" value to its integer code.
train_df['building_meter_category'] = train_df['building_meter'].map(building_meter_map)



In [None]:
# The 'building_meter' column holds the string concatenation of 'building_id' and 'meter',
# and 'building_meter_category' holds the number that corresponds one-to-one to that value
train_df.head()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace the primary_use labels with integer codes. The fitted encoder stays
# available as `le` so the codes can be mapped back to the original labels.
le = LabelEncoder()
primary_use_codes = le.fit_transform(building_meta_df["primary_use"])
building_meta_df["primary_use"] = primary_use_codes

In [None]:
# Join the tables up front: building metadata by building_id first, then the
# weather table by (site_id, timestamp). Left joins keep every training row.
train_df = (
    train_df
    .merge(building_meta_df, on="building_id", how="left")
    .merge(weather_train_df, on=["site_id", "timestamp"], how="left")
)

In [None]:
# "meter_reading" (the target) already holds log1p of the raw reading: the
# transform is applied once, right after the CSV files are loaded. Applying
# np.log1p a second time here would store log1p(log1p(reading)), so the column
# is deliberately left untouched in this cell.

# Drop the columns that are not used for training. "meter_reading" itself is
# kept so that the saved table still contains the target.
train_df = train_df.drop(
    columns=[
        "timestamp",
        "year_built",
        "floor_count",
        "sea_level_pressure",
        "wind_direction",
        "wind_speed",
        "building_meter",
    ]
)

In [None]:
# Write the prepared training table to train_df.csv in the working directory,
# without the DataFrame index.
train_df.to_csv('train_df.csv', index=False)