In [56]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from datetime import datetime
from datetime import timedelta

from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from xgboost import XGBRegressor

import warnings

# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation notices from the libraries used below;
# consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [57]:
# Fix broken Korean glyphs in plots by choosing a Korean-capable font per OS.
import platform

# OS name as reported by platform.system() -> matplotlib font family.
korean_fonts = {
    'Darwin': 'AppleGothic',     # macOS
    'Windows': 'Malgun Gothic',  # Windows
    # Linux (Google Colab). The steps below were left commented out; presumably
    # they install the font when it is missing — TODO confirm before relying on them.
    #   !wget "https://www.wfonts.com/download/data/2016/06/13/malgun-gothic/malgun.ttf"
    #   !mv malgun.ttf /usr/share/fonts/truetype/
    #   import matplotlib.font_manager as fm
    #   fm._rebuild()
    'Linux': 'Malgun Gothic',
}

font_family = korean_fonts.get(platform.system())
if font_family is not None:
    # Any other OS keeps matplotlib's default font.
    plt.rc('font', family=font_family)

# Keep the minus sign rendering correctly while a Korean font is active.
plt.rcParams['axes.unicode_minus'] = False

In [2]:
# Load the training set, the test set, and the sample submission file.
train, test, submit = map(
    pd.read_csv,
    (
        "../1.data/train.csv",
        "../1.data/test.csv",
        "../1.data/sample_submission.csv",
    ),
)

## 식수 인원 파생변수 생성

In [44]:
# Previous week's meal headcount
# Cannot be computed for the test set.
def previous_week(x):
    """Return the date exactly seven days before ``x``.

    Args:
        x: Date string in ``YYYY-MM-DD`` format.

    Returns:
        The date one week earlier, formatted as ``YYYY-MM-DD``.

    Raises:
        ValueError: If ``x`` does not match the ``%Y-%m-%d`` format
            (raised by ``datetime.strptime``).
    """
    one_week = timedelta(days=7)
    return (datetime.strptime(x, "%Y-%m-%d") - one_week).strftime("%Y-%m-%d")

# Add the date one week before each row's date to both data sets.
for frame in (train, test):
    frame["전주일자"] = frame["일자"].apply(previous_week)

In [65]:
# Mean meal headcount per weekday (lunch and dinner).
# Cannot be computed from the test set.
weekday_means = train.groupby("요일")[["중식계", "석식계"]].agg("mean")
df = weekday_means.reset_index()  # per-weekday summary table, one row per weekday

# Fill every training row through a weekday lookup. Iterating over
# range(len(df)) would only touch as many rows as there are weekdays and
# leave the remaining rows at a placeholder value.
train["요일별중식계"] = train["요일"].map(weekday_means["중식계"])
train["요일별석식계"] = train["요일"].map(weekday_means["석식계"])

In [None]:
# Correlation analysis
plt.figure(figsize = (15, 8))
plt.xticks(color = "white", fontsize = 15)
plt.yticks(color = "white", fontsize = 15)
# `train` also holds string columns (dates, weekday labels). Recent pandas
# versions raise when corr() meets non-numeric data, so restrict the frame to
# numeric/bool columns explicitly; this works on old and new pandas alike.
numeric_train = train.select_dtypes(include=["number", "bool"])
sns.heatmap(numeric_train.corr(), annot = True)

In [None]:
# 월 별 식수 인원(평균)
# test set에서는 구할 수 없다.

In [7]:
# Number of staff actually working: headcount minus business trips, leave, and remote work.
for frame in (train, test):
    frame["실근무자수"] = (
        frame["본사정원수"]
        .sub(frame["본사출장자수"])
        .sub(frame["본사휴가자수"])
        .sub(frame["현본사소속재택근무자수"])
    )

In [49]:
# Encode the Korean weekday labels (Mon-Fri) as integers 0-4 in both data sets.
# Labels missing from the mapping become NaN, so this cell is not safe to run twice.
weekday_codes = {'월':0, '화':1, '수':2, '목':3, '금':4}
for frame in (train, test):
    frame['요일'] = frame['요일'].map(weekday_codes)

## 날씨 데이터 전처리

In [74]:
# Load the 2016 weather observations; the file is read with EUC-KR encoding.
weather2016 = pd.read_csv(
    filepath_or_buffer="../1.data/2016_weather.csv",
    encoding="euc-kr",
)

In [77]:
# Display the column names of the 2016 weather table to see which measurements it contains.
weather2016.columns

Index(['지점', '지점명', '일시', '기온(°C)', '기온 QC플래그', '강수량(mm)', '강수량 QC플래그',
       '풍속(m/s)', '풍속 QC플래그', '풍향(16방위)', '풍향 QC플래그', '습도(%)', '습도 QC플래그',
       '증기압(hPa)', '이슬점온도(°C)', '현지기압(hPa)', '현지기압 QC플래그', '해면기압(hPa)',
       '해면기압 QC플래그', '일조(hr)', '일조 QC플래그', '일사(MJ/m2)', '적설(cm)', '3시간신적설(cm)',
       '전운량(10분위)', '중하층운량(10분위)', '운형(운형약어)', '최저운고(100m )', '시정(10m)',
       '지면상태(지면상태코드)', '현상번호(국내식)', '지면온도(°C)', '지면온도 QC플래그', '5cm 지중온도(°C)',
       '10cm 지중온도(°C)', '20cm 지중온도(°C)', '30cm 지중온도(°C)'],
      dtype='object')

In [50]:
# Feature columns shared by the training and test matrices.
feature_columns = [
    '요일',
    '본사정원수',
    '본사출장자수',
    '본사시간외근무명령서승인건수',
    '현본사소속재택근무자수',
    "실근무자수",
]

x_train = train[feature_columns]
x_test = test[feature_columns]

# Targets: lunch headcount (중식계) and dinner headcount (석식계).
y1_train = train['중식계']
y2_train = train['석식계']

In [51]:
# Two separate XGBoost regressors with identical settings, one per target.
model1, model2 = (XGBRegressor(n_jobs=-1, random_state=42) for _ in range(2))

In [52]:
# Fit one regressor per target on the same feature matrix:
# model1 on y1_train and model2 on y2_train.
model1.fit(x_train, y1_train)
# Last expression of the cell, so its return value is what the notebook displays.
model2.fit(x_train, y2_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=-1, num_parallel_tree=1, random_state=42,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [53]:
# Predict both targets for the test features (pred1 from model1, pred2 from model2).
pred1, pred2 = (fitted.predict(x_test) for fitted in (model1, model2))

In [54]:
# Write the predictions into the matching columns of the submission frame.
for column, predictions in (('중식계', pred1), ('석식계', pred2)):
    submit[column] = predictions

In [55]:
# Save the submission as CSV without writing the DataFrame index.
submit.to_csv(path_or_buf='baseline.csv', index=False)