In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook
from datetime import datetime, timedelta
from sklearn.metrics import mean_squared_log_error
import warnings; warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from plotly.offline import iplot
from plotly import tools
import plotly.graph_objects as go
import plotly.express as px
import plotly.offline as py
import plotly.figure_factory as ff
py.init_notebook_mode(connected=True)

## ml
import xgboost as xgb

import os
# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

# Any results you write to the current directory are saved as output.

In [None]:
# Folder that holds the competition files. Change this single constant to run
# the notebook somewhere other than Kaggle.
DATA_DIR = '/kaggle/input/covid19-global-forecasting-week-1'

train_df = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
test_df = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))
submit_df = pd.read_csv(os.path.join(DATA_DIR, 'submission.csv'))

In [None]:
# Display the sample submission frame as the cell output.
submit_df

In [None]:
# Display the training frame as the cell output.
train_df

In [None]:
# Print the first rows of each frame: test, then train, then the sample submission.
for frame in (test_df, train_df, submit_df):
    print(frame.head())

## 目的
入力情報から、感染者数と死亡者数の両方を予測する。
- train.csv : 2020年3月18日までのトレーニングデータ。
- test.csv : 予測する日付。最初の公開リーダーボードのトレーニングデータと1週間の重複があります。
    

入力情報
- Province/State : 州・省などの地域
- Country/Region : 国・地域
- Lat : 緯度
- Long : 経度
- Date : 確認した日付
- ConfirmedCases : その日までの累計感染者数
- Fatalities : その日までの累計死亡者数

In [None]:
## まずはxgboostで精度を見てみる

In [None]:
# Print the distinct values of the two location columns.
for location_col in ('Province/State', 'Country/Region'):
    print(train_df[location_col].unique())

In [None]:
# One flag per column: True when the column has at least one missing value.
has_missing = train_df.isna().any()
print(has_missing)

In [None]:
## XGBoost cannot take raw strings as input, so Country/Region is converted
## to integer codes with a label encoder.
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le.fit(train_df['Country/Region'])
cr = le  # fit() hands back the fitted encoder, so both names refer to it
cr_df = le.transform(train_df['Country/Region'])

In [None]:
# Attach the encoded country label on a new frame; assign() returns a copy,
# so the raw train_df loaded from disk is left unmodified.
train_df_cr = train_df.assign(cr_l=cr_df)

train_df_cr

In [None]:
# Parse the Date column into pandas datetimes.
train_df_cr = train_df_cr.assign(Date=pd.to_datetime(train_df_cr['Date']))

In [None]:
## Break the parsed dates into month / weekday / day features.
## The .dt accessor works on the whole column at once, which avoids a
## Python-level iterrows() loop over every row.
date_parts = train_df_cr['Date'].dt

month_lists = date_parts.month.tolist()
weekday_lists = date_parts.dayofweek.tolist()
day_lists = date_parts.day.tolist()

In [None]:
# Append the three calendar features as new columns.
calendar_features = {
    'month': month_lists,
    'weekday': weekday_lists,
    'day': day_lists,
}
train_df_cr_d = train_df_cr.assign(**calendar_features)

In [None]:
# Display the frame with the added month / weekday / day columns.
train_df_cr_d

In [None]:
## いらないカラムを捨てる
train_df_cr_d_d = train_df_cr_d.copy().drop(['Province/State', 'Country/Region', 'Date'], axis=1)

## 感染者と死亡者は別々で予測する
cc_df = train_df_cr_d_d.drop('Fatalities', axis=1)
fa_df = train_df_cr_d_d.drop('ConfirmedCases', axis=1)

In [None]:
## まずは、感染者の予測から
cc_df_X = cc_df.copy().drop('ConfirmedCases', axis=1)

## データセット分割
cc_x_train, cc_x_val, cc_y_train, cc_y_val = train_test_split(cc_df_X, cc_df['ConfirmedCases'], random_state=0)

In [None]:
# Fit an XGBoost regressor with default hyper-parameters
cc_reg = xgb.XGBRegressor()
cc_reg.fit(cc_x_train, cc_y_train)

# Predictions on both splits, evaluated in a later cell
cc_pred_train, cc_pred_val = (
    cc_reg.predict(features) for features in (cc_x_train, cc_x_val)
)

In [None]:
## マイナス値を0にする
cc_pred_train_c = np.clip(cc_pred_train, 0, None)
cc_pred_val_c = np.clip(cc_pred_val, 0, None)

In [None]:
# Display the clipped training predictions.
cc_pred_train_c

In [None]:
## 評価
print(mean_squared_log_error(cc_y_train, cc_pred_train_c))
print(mean_squared_log_error(cc_y_val, cc_pred_val_c))

In [None]:
## 死亡者の予測
fa_df_X = fa_df.copy().drop('Fatalities', axis=1)

## データセット分割
fa_x_train, fa_x_val, fa_y_train, fa_y_val = train_test_split(fa_df_X, fa_df['Fatalities'], random_state=0)

In [None]:
# Fit an XGBoost regressor with default hyper-parameters
fa_reg = xgb.XGBRegressor()
fa_reg.fit(fa_x_train, fa_y_train)

# Predictions on both splits, evaluated in a later cell
fa_pred_train, fa_pred_val = (
    fa_reg.predict(features) for features in (fa_x_train, fa_x_val)
)

In [None]:
## マイナス値を0にする
fa_pred_train_c = np.clip(fa_pred_train, 0, None)
fa_pred_val_c = np.clip(fa_pred_val, 0, None)

In [None]:
## 評価
print(mean_squared_log_error(fa_y_train, fa_pred_train_c))
print(mean_squared_log_error(fa_y_val, fa_pred_val_c))

In [None]:
## submission を作成する

def preprocess(dataset, encoder=None):
    """Convert a raw competition frame into the numeric feature frame for the models.

    Steps, in order:
      1. rename ``ForecastId`` to ``Id`` (a frame without that column is unaffected);
      2. label-encode ``Country/Region`` into a new ``cr_l`` column;
      3. parse ``Date`` and derive ``month``, ``weekday`` and ``day`` from it;
      4. drop the raw ``Province/State``, ``Country/Region`` and ``Date`` columns.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain ``Province/State``, ``Country/Region`` and ``Date``.
        The caller's frame is not modified.
    encoder : object with a ``transform`` method, optional
        Fitted label encoder applied to ``Country/Region``. When omitted, the
        notebook-level ``cr`` encoder is used.

    Returns
    -------
    pandas.DataFrame
        A new frame with the remaining original columns followed by
        ``cr_l``, ``month``, ``weekday`` and ``day``.
    """
    if encoder is None:
        # Fall back to the encoder fitted in an earlier notebook cell.
        encoder = cr

    # rename() returns a new frame, so nothing below touches the input.
    renamed = dataset.rename(columns={'ForecastId': 'Id'})
    dates = pd.to_datetime(renamed['Date'])

    # The .dt accessor derives the calendar parts for the whole column at once
    # instead of looping over rows with iterrows().
    features = renamed.assign(
        cr_l=encoder.transform(renamed['Country/Region']),
        Date=dates,
        month=dates.dt.month,
        weekday=dates.dt.dayofweek,
        day=dates.dt.day,
    )

    # Remove the raw text/date columns that are not fed to the model.
    return features.drop(columns=['Province/State', 'Country/Region', 'Date'])


In [None]:
# Build the model features for the test set and display them.
test_df_p = test_df.pipe(preprocess)
test_df_p

In [None]:
# 予測してくっつける
cc_pred = cc_reg.predict(test_df_p)
fa_pred = fa_reg.predict(test_df_p)

## マイナス値を0にする&小数点以下を切り捨てする
cc_pred_p = np.clip(cc_pred, 0, None).round().astype(int)
fa_pred_p = np.clip(fa_pred, 0, None).round().astype(int)

In [None]:
# Submission frame: ForecastId plus the two integer prediction columns.
final_df = test_df[['ForecastId']].assign(
    ConfirmedCases=cc_pred_p,
    Fatalities=fa_pred_p,
)
final_df

In [None]:
# Write the submission file to the working directory, without the index column.
final_df.to_csv('submission.csv', index=False)

In [None]:
## Borrowed from the notebook linked below; impressive that this is possible.
## https://www.kaggle.com/pradeepmuniasamy/covid19-inside-story-of-each-countries

# Total confirmed cases per date and country
temp = (
    train_df
    .groupby(['Date', 'Country/Region'])['ConfirmedCases']
    .sum()
    .reset_index()
)
# Reformat the dates as MM/DD/YYYY strings; they label the animation frames
temp['Date'] = pd.to_datetime(temp['Date']).dt.strftime('%m/%d/%Y')
# Marker size is a power transform of the case count
temp['size'] = temp['ConfirmedCases'].pow(0.3) * 3.5

geo_options = dict(
    locations="Country/Region",
    locationmode='country names',
    color="ConfirmedCases",
    size='size',
    hover_name="Country/Region",
    range_color=[1,100],
    projection="natural earth",
    animation_frame="Date",
    title='COVID-19: Cases Over Time',
    color_continuous_scale="greens",
)
fig = px.scatter_geo(temp, **geo_options)
fig.show()

In [None]:
# Display the aggregated frame that feeds the map above.
temp