In [None]:
# データ加工・可視化用ライブラリのインポート
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
from google.colab import drive
drive.mount('/content/drive')

# 学習データの読み込み
dir_path = "/content/drive/MyDrive/ColabDB/AustraliaWeather/"
train_df = pd.read_csv(dir_path + "train.csv")
test_df = pd.read_csv(dir_path + "test.csv")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# データの中身理解
train_df.head()

Unnamed: 0,id,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,train_00000,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,train_00001,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,train_00002,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No
3,train_00003,2008-12-06,Albury,14.6,29.7,0.2,,,WNW,56.0,...,55.0,23.0,1009.2,1005.4,,,20.6,28.9,No,No
4,train_00004,2008-12-07,Albury,14.3,25.0,0.0,,,W,50.0,...,49.0,19.0,1009.6,1008.2,1.0,,18.1,24.6,No,No


In [4]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84359 entries, 0 to 84358
Data columns (total 24 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             84359 non-null  object 
 1   Date           84359 non-null  object 
 2   Location       84359 non-null  object 
 3   MinTemp        84075 non-null  float64
 4   MaxTemp        84178 non-null  float64
 5   Rainfall       84359 non-null  float64
 6   Evaporation    48599 non-null  float64
 7   Sunshine       44347 non-null  float64
 8   WindGustDir    78846 non-null  object 
 9   WindGustSpeed  78883 non-null  float64
 10  WindDir9am     78571 non-null  object 
 11  WindDir3pm     82141 non-null  object 
 12  WindSpeed9am   83713 non-null  float64
 13  WindSpeed3pm   82843 non-null  float64
 14  Humidity9am    83444 non-null  float64
 15  Humidity3pm    82292 non-null  float64
 16  Pressure9am    76110 non-null  float64
 17  Pressure3pm    76098 non-null  float64
 18  Cloud9

In [5]:
train_df.isnull().sum()

Unnamed: 0,0
id,0
Date,0
Location,0
MinTemp,284
MaxTemp,181
Rainfall,0
Evaporation,35760
Sunshine,40012
WindGustDir,5513
WindGustSpeed,5476


In [6]:
## 学習用データと評価用データを縦結合する（評価用は「RainTomorrow」がNULL）
df = pd.concat([train_df, test_df], ignore_index=True)
df.tail(2)

Unnamed: 0,id,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
112550,test_28191,2017-06-15,Uluru,2.6,22.5,0.0,,,S,19.0,...,59.0,24.0,1025.0,1021.4,,,8.8,22.1,No,
112551,test_28192,2017-06-20,Uluru,3.5,21.8,0.0,,,E,31.0,...,59.0,27.0,1024.7,1021.2,,,9.4,20.9,No,


In [7]:
## データ加工（特徴量エンジニアリング）データフレーム＝df4

'''
特徴量として、以下を採用したものをdf4と定義する
Date、MinTemp、MaxTemp、Rainfall、
WindSpeed3pm、Humidity9am、Humidity3pm、Pressure3pm、RainToday、RainTomorrow
'''
df4 = df.drop(
    columns=['Location', 'Evaporation', 'Sunshine', 'WindGustDir', 'WindGustSpeed', 'WindDir9am', 'WindSpeed9am',
             'WindDir3pm', 'Pressure9am', 'Cloud9am' ,'Cloud3pm', 'Temp9am', 'Temp3pm']
    )

In [8]:
df4.head(2)

Unnamed: 0,id,Date,MinTemp,MaxTemp,Rainfall,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure3pm,RainToday,RainTomorrow
0,train_00000,2008-12-01,13.4,22.9,0.6,24.0,71.0,22.0,1007.1,No,No
1,train_00001,2008-12-02,7.4,25.1,0.0,22.0,44.0,25.0,1007.8,No,No


In [9]:
df4.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112552 entries, 0 to 112551
Data columns (total 11 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   id            112552 non-null  object 
 1   Date          112552 non-null  object 
 2   MinTemp       112177 non-null  float64
 3   MaxTemp       112306 non-null  float64
 4   Rainfall      112552 non-null  float64
 5   WindSpeed3pm  110533 non-null  float64
 6   Humidity9am   111335 non-null  float64
 7   Humidity3pm   109770 non-null  float64
 8   Pressure3pm   101541 non-null  float64
 9   RainToday     112552 non-null  object 
 10  RainTomorrow  84359 non-null   object 
dtypes: float64(7), object(4)
memory usage: 9.4+ MB


In [21]:
# Dateの月のみ抽出（MM)
from datetime import datetime
df4_1 = df4.copy()
df4_1['Date'] = pd.to_datetime(df4_1['Date'])
df4_1['Month'] = df4_1['Date'].dt.month

df4_1['Month'] = df4_1['Month'].astype(str)
df4_1.info()

df4_1 = df4_1.drop(
    columns=['Date']
    )

df4_1.head(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112552 entries, 0 to 112551
Data columns (total 12 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   id            112552 non-null  object        
 1   Date          112552 non-null  datetime64[ns]
 2   MinTemp       112177 non-null  float64       
 3   MaxTemp       112306 non-null  float64       
 4   Rainfall      112552 non-null  float64       
 5   WindSpeed3pm  110533 non-null  float64       
 6   Humidity9am   111335 non-null  float64       
 7   Humidity3pm   109770 non-null  float64       
 8   Pressure3pm   101541 non-null  float64       
 9   RainToday     112552 non-null  object        
 10  RainTomorrow  84359 non-null   object        
 11  Month         112552 non-null  object        
dtypes: datetime64[ns](1), float64(7), object(4)
memory usage: 10.3+ MB


Unnamed: 0,id,MinTemp,MaxTemp,Rainfall,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure3pm,RainToday,RainTomorrow,Month
0,train_00000,13.4,22.9,0.6,24.0,71.0,22.0,1007.1,No,No,12
1,train_00001,7.4,25.1,0.0,22.0,44.0,25.0,1007.8,No,No,12


In [22]:
# boolean型のRainTodayとRainTomorrowを「0,1」のint型に変更
df4_1['RainToday'] = df4_1['RainToday'].map({'Yes': 1, 'No': 0})
df4_1['RainTomorrow'] = df4_1['RainTomorrow'].map({'Yes': 1, 'No': 0})

In [23]:
df4_1.isnull().sum()

Unnamed: 0,0
id,0
MinTemp,375
MaxTemp,246
Rainfall,0
WindSpeed3pm,2019
Humidity9am,1217
Humidity3pm,2782
Pressure3pm,11011
RainToday,0
RainTomorrow,28193


In [24]:
# MinTempはMaxTempと相関があったため、MaxTemp-10度として欠損地を保管する
df4_1['MinTemp'] = df4_1['MinTemp'].fillna(df4_1['MaxTemp']-10)

# MaxTempはMinTempと相関があったため、MinTemp+10度として欠損地を保管する
df4_1['MaxTemp'] = df4_1['MaxTemp'].fillna(df4_1['MinTemp']+10)

In [25]:
# 残りのMaxTempとMinTempの欠損地は平均値で保管する
df4_1['MaxTemp'] = df4_1['MaxTemp'].fillna(df4_1['MaxTemp'].mean())
df4_1['MinTemp'] = df4_1['MinTemp'].fillna(df4_1['MinTemp'].mean())

In [26]:
# WindSpeed3pm は中央値で補完
df4_1['WindSpeed3pm'] = df4_1['WindSpeed3pm'].fillna(df4_1['WindSpeed3pm'].median())

# Humidity9pm は中央値で補完
df4_1['Humidity9am'] = df4_1['Humidity9am'].fillna(df4_1['Humidity9am'].median())

# Humidity3pm は中央値で補完
df4_1['Humidity3pm'] = df4_1['Humidity3pm'].fillna(df4_1['Humidity3pm'].median())

# Pressure3pmは中央値で補完（欠損値の数が多いのでやや不安）
df4_1['Pressure3pm'] = df4_1['Pressure3pm'].fillna(df4_1['Pressure3pm'].median())

In [27]:
df4_1.isnull().sum()

Unnamed: 0,0
id,0
MinTemp,0
MaxTemp,0
Rainfall,0
WindSpeed3pm,0
Humidity9am,0
Humidity3pm,0
Pressure3pm,0
RainToday,0
RainTomorrow,28193


In [28]:
# 数値データを正規化する（0〜1）
from sklearn.preprocessing import MinMaxScaler
df4_2 = df4_1.copy()
scaler = MinMaxScaler()

# MinTempの値を正規化する（最大値を１、最小値を０とする）
df4_2['MinTemp_normalized'] = scaler.fit_transform(df4_2[['MinTemp']])

# MaxTempの値を正規化する（最大値を１、最小値を０とする）
df4_2['MaxTemp_normalized'] = scaler.fit_transform(df4_2[['MaxTemp']])

# WindSpeed3pmの値を正規化する（最大値を１、最小値を０とする）
df4_2['WindSpeed3pm_normalized'] = scaler.fit_transform(df4_2[['WindSpeed3pm']])

# Humidity9pmの値を正規化する（最大値を１、最小値を０とする）
df4_2['Humidity9am_normalized'] = scaler.fit_transform(df4_2[['Humidity9am']])

# Humidity3pmの値を正規化する（最大値を１、最小値を０とする）
df4_2['Humidity3pm_normalized'] = scaler.fit_transform(df4_2[['Humidity3pm']])

# Pressure3pmの値を正規化する（最大値を１、最小値を０とする）
df4_2['Pressure3pm_normalized'] = scaler.fit_transform(df4_2[['Pressure3pm']])


In [30]:
# Rainfallの値を正規化する（最大値を１、最小値を０とする）
df4_2['Rainfall_normalized'] = scaler.fit_transform(df4_2[['Rainfall']])

In [31]:
df4_2.head()

Unnamed: 0,id,MinTemp,MaxTemp,Rainfall,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure3pm,RainToday,RainTomorrow,Month,MinTemp_normalized,MaxTemp_normalized,WindSpeed3pm_normalized,Humidity9am_normalized,Humidity3pm_normalized,Pressure3pm_normalized,Rainfall_normalized
0,train_00000,13.4,22.9,0.6,24.0,71.0,22.0,1007.1,0,0.0,12,0.513064,0.523629,0.275862,0.71,0.22,0.48,0.001617
1,train_00001,7.4,25.1,0.0,22.0,44.0,25.0,1007.8,0,0.0,12,0.370546,0.565217,0.252874,0.44,0.25,0.4912,0.0
2,train_00002,17.5,32.3,1.0,20.0,82.0,33.0,1006.0,0,0.0,12,0.610451,0.701323,0.229885,0.82,0.33,0.4624,0.002695
3,train_00003,14.6,29.7,0.2,24.0,55.0,23.0,1005.4,0,0.0,12,0.541568,0.652174,0.275862,0.55,0.23,0.4528,0.000539
4,train_00004,14.3,25.0,0.0,24.0,49.0,19.0,1008.2,0,0.0,12,0.534442,0.563327,0.275862,0.49,0.19,0.4976,0.0


In [32]:
df4_2 = df4_2.drop(
    columns=['MinTemp', 'MaxTemp', 'Rainfall', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm', 'Pressure3pm']
    )

df4_2.head()

Unnamed: 0,id,RainToday,RainTomorrow,Month,MinTemp_normalized,MaxTemp_normalized,WindSpeed3pm_normalized,Humidity9am_normalized,Humidity3pm_normalized,Pressure3pm_normalized,Rainfall_normalized
0,train_00000,0,0.0,12,0.513064,0.523629,0.275862,0.71,0.22,0.48,0.001617
1,train_00001,0,0.0,12,0.370546,0.565217,0.252874,0.44,0.25,0.4912,0.0
2,train_00002,0,0.0,12,0.610451,0.701323,0.229885,0.82,0.33,0.4624,0.002695
3,train_00003,0,0.0,12,0.541568,0.652174,0.275862,0.55,0.23,0.4528,0.000539
4,train_00004,0,0.0,12,0.534442,0.563327,0.275862,0.49,0.19,0.4976,0.0


In [33]:
# object型のMonthをワンホットエンコーディングでint型に変更
ohe_Mo = pd.get_dummies(df4_2['Month'], dtype=int, prefix='Mo')
ohe_Mo.head()

Unnamed: 0,Mo_1,Mo_10,Mo_11,Mo_12,Mo_2,Mo_3,Mo_4,Mo_5,Mo_6,Mo_7,Mo_8,Mo_9
0,0,0,0,1,0,0,0,0,0,0,0,0
1,0,0,0,1,0,0,0,0,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0,0,0
4,0,0,0,1,0,0,0,0,0,0,0,0


In [34]:
# int型にエンコーディングしたMonthを横結合し、object型のMonthを削除
df4_2 = pd.concat([df4_2, ohe_Mo], axis=1)
df4_2 = df4_2.drop(columns=['Month'])
df4_2.head()

Unnamed: 0,id,RainToday,RainTomorrow,MinTemp_normalized,MaxTemp_normalized,WindSpeed3pm_normalized,Humidity9am_normalized,Humidity3pm_normalized,Pressure3pm_normalized,Rainfall_normalized,...,Mo_11,Mo_12,Mo_2,Mo_3,Mo_4,Mo_5,Mo_6,Mo_7,Mo_8,Mo_9
0,train_00000,0,0.0,0.513064,0.523629,0.275862,0.71,0.22,0.48,0.001617,...,0,1,0,0,0,0,0,0,0,0
1,train_00001,0,0.0,0.370546,0.565217,0.252874,0.44,0.25,0.4912,0.0,...,0,1,0,0,0,0,0,0,0,0
2,train_00002,0,0.0,0.610451,0.701323,0.229885,0.82,0.33,0.4624,0.002695,...,0,1,0,0,0,0,0,0,0,0
3,train_00003,0,0.0,0.541568,0.652174,0.275862,0.55,0.23,0.4528,0.000539,...,0,1,0,0,0,0,0,0,0,0
4,train_00004,0,0.0,0.534442,0.563327,0.275862,0.49,0.19,0.4976,0.0,...,0,1,0,0,0,0,0,0,0,0


In [35]:
df4_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112552 entries, 0 to 112551
Data columns (total 22 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   id                       112552 non-null  object 
 1   RainToday                112552 non-null  int64  
 2   RainTomorrow             84359 non-null   float64
 3   MinTemp_normalized       112552 non-null  float64
 4   MaxTemp_normalized       112552 non-null  float64
 5   WindSpeed3pm_normalized  112552 non-null  float64
 6   Humidity9am_normalized   112552 non-null  float64
 7   Humidity3pm_normalized   112552 non-null  float64
 8   Pressure3pm_normalized   112552 non-null  float64
 9   Rainfall_normalized      112552 non-null  float64
 10  Mo_1                     112552 non-null  int64  
 11  Mo_10                    112552 non-null  int64  
 12  Mo_11                    112552 non-null  int64  
 13  Mo_12                    112552 non-null  int64  
 14  Mo_2

In [36]:
# dfを学習用データと評価用データ（RainTomorrowがNULL）に分ける（train_df4、test_df4）
train_df4 = df4_2[~df4_2['RainTomorrow'].isnull()]
test_df4 = df4_2[df4_2['RainTomorrow'].isnull()]

In [37]:
train_df4.tail()

Unnamed: 0,id,RainToday,RainTomorrow,MinTemp_normalized,MaxTemp_normalized,WindSpeed3pm_normalized,Humidity9am_normalized,Humidity3pm_normalized,Pressure3pm_normalized,Rainfall_normalized,...,Mo_11,Mo_12,Mo_2,Mo_3,Mo_4,Mo_5,Mo_6,Mo_7,Mo_8,Mo_9
84354,train_84354,0,0.0,0.346793,0.533081,0.195402,0.53,0.25,0.7344,0.0,...,0,0,0,0,0,0,1,0,0,0
84355,train_84355,0,0.0,0.384798,0.482042,0.298851,0.56,0.32,0.7552,0.0,...,0,0,0,0,0,0,1,0,0,0
84356,train_84356,0,0.0,0.261283,0.533081,0.126437,0.51,0.24,0.6912,0.0,...,0,0,0,0,0,0,1,0,0,0
84357,train_84357,0,0.0,0.280285,0.568998,0.103448,0.56,0.21,0.672,0.0,...,0,0,0,0,0,0,1,0,0,0
84358,train_84358,0,0.0,0.32304,0.599244,0.103448,0.53,0.24,0.6352,0.0,...,0,0,0,0,0,0,1,0,0,0


In [38]:
test_df4.head()

Unnamed: 0,id,RainToday,RainTomorrow,MinTemp_normalized,MaxTemp_normalized,WindSpeed3pm_normalized,Humidity9am_normalized,Humidity3pm_normalized,Pressure3pm_normalized,Rainfall_normalized,...,Mo_11,Mo_12,Mo_2,Mo_3,Mo_4,Mo_5,Mo_6,Mo_7,Mo_8,Mo_9
84359,test_00000,0,,0.413302,0.620038,0.103448,0.45,0.16,0.5712,0.0,...,0,1,0,0,0,0,0,0,0,0
84360,test_00001,0,,0.425178,0.693762,0.321839,0.42,0.09,0.424,0.0,...,0,1,0,0,0,0,0,0,0,0
84361,test_00002,1,,0.494062,0.487713,0.229885,0.65,0.43,0.3952,0.009704,...,0,1,0,0,0,0,0,0,0,0
84362,test_00003,1,,0.460808,0.516068,0.195402,0.47,0.32,0.5216,0.028571,...,0,1,0,0,0,0,0,0,0,0
84363,test_00004,0,,0.558195,0.674858,0.149425,0.55,0.23,0.4976,0.0,...,0,1,0,0,0,0,0,0,0,0


In [39]:
# 学習用データを説明変数（x_train）と目的変数（y_train）に分ける
y_train = train_df4['RainTomorrow']
x_train = train_df4.drop(columns=['RainTomorrow', 'id'])

In [40]:
x_train.head()

Unnamed: 0,RainToday,MinTemp_normalized,MaxTemp_normalized,WindSpeed3pm_normalized,Humidity9am_normalized,Humidity3pm_normalized,Pressure3pm_normalized,Rainfall_normalized,Mo_1,Mo_10,Mo_11,Mo_12,Mo_2,Mo_3,Mo_4,Mo_5,Mo_6,Mo_7,Mo_8,Mo_9
0,0,0.513064,0.523629,0.275862,0.71,0.22,0.48,0.001617,0,0,0,1,0,0,0,0,0,0,0,0
1,0,0.370546,0.565217,0.252874,0.44,0.25,0.4912,0.0,0,0,0,1,0,0,0,0,0,0,0,0
2,0,0.610451,0.701323,0.229885,0.82,0.33,0.4624,0.002695,0,0,0,1,0,0,0,0,0,0,0,0
3,0,0.541568,0.652174,0.275862,0.55,0.23,0.4528,0.000539,0,0,0,1,0,0,0,0,0,0,0,0
4,0,0.534442,0.563327,0.275862,0.49,0.19,0.4976,0.0,0,0,0,1,0,0,0,0,0,0,0,0


In [41]:
from sklearn.model_selection import train_test_split

# 学習用データをモデル作成用（_tr）とモデル評価用（_va）に分ける（パラstratifyは目的変数の値の割合を均等にする）
x_tr, x_va, y_tr, y_va = train_test_split(x_train, y_train, test_size=0.2, stratify=y_train, random_state=42)

# 分析用ライブラリインポート
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

In [45]:
# モデル作成と予測データに対して予測を実施２（ランダムフォレスト）
model2 = RandomForestClassifier(random_state=42)
model2.fit(x_train, y_train)
y_va_pred2 = model2.predict(x_train)

accuracy_va2 = accuracy_score(y_train, y_va_pred2)

print(accuracy_va2)

1.0


In [43]:
# モデル作成と予測データに対して予測を実施３（勾配ブースティング）
model3 = GradientBoostingClassifier()
model3.fit(x_tr, y_tr)
y_tr_pred3 = model3.predict(x_tr)
y_va_pred3 = model3.predict(x_va)
accuracy_tr3 = accuracy_score(y_tr, y_tr_pred3)
accuracy_va3 = accuracy_score(y_va, y_va_pred3)
print(accuracy_tr3)
print(accuracy_va3)

0.8447701038718568
0.8458985301090565


In [46]:
# 評価用データにモデル２を適用し、提出用ファイルの作成を実施
test_df9 = test_df4.copy()
x_test = test_df4.drop(columns=['id', 'RainTomorrow'])
y_test_pred = model2.predict(x_test)

# 評価用データに予測した「y_test_pred」を切り取って、提出用のフィーマットを作成する
test_df9['RainTomorrow'] = y_test_pred
submit_df = test_df9[['id', 'RainTomorrow']].set_index('id')

# 評価用データの「RainTomorrow」をBoolean型に戻す（submit_df）
submit_df['RainTomorrow'] = submit_df['RainTomorrow'].map({1.:'Yes', 0.: 'No'})

# 提出用にCSVファイルに変換して、マイドライブに格納する
submit_df.to_csv(dir_path + 'submit/' + 'submit_df9.csv', header = False)