# 想要預測當日會不會下雨
featuers：
* 當日最低溫度
* 當日最低氣壓
* 濕度
* 風速

假設：
下雨條件只看這這四種，且發生時都在同一時間。

溫度 < 25(攝氏)

壓力 < 1000(hpa)

濕度 > 60(相對濕度)

風速 > 10(km/hr)

數值範圍：

溫度：15~30

氣壓：980~1030

濕度：30~80

風速：0~25

最多有 $10^6$ 種可能，全部條列太多了

所以用隨機產生 1000 個 data 去跑

In [4]:
import pandas as pd
import numpy
from sklearn.tree import DecisionTreeClassifier
from sklearn.externals.six import StringIO   
from sklearn.tree import export_graphviz
import pydotplus 

In [89]:
temp_df = pd.DataFrame(numpy.random.randint(15,30,size=(1000, 1)), columns=['min_temp'])
press_df = pd.DataFrame(numpy.random.randint(980,1030,size=(1000, 1)), columns=['min_press'])
humidity_df = pd.DataFrame(numpy.random.randint(30,90,size=(1000, 1)), columns=['humidity'])
wind_df = pd.DataFrame(numpy.random.randint(0,25,size=(1000, 1)), columns=['wind_speed'])

In [90]:
df = temp_df
df.insert(1, 'min_press', press_df)
df.insert(2, 'humidity', humidity_df)
df.insert(3, 'wind_speed', wind_df)
df.head(10)

Unnamed: 0,min_temp,min_press,humidity,wind_speed
0,28,985,85,24
1,25,1026,36,5
2,29,1028,54,5
3,27,1008,45,1
4,28,1021,74,11
5,16,992,79,17
6,20,1008,73,20
7,26,986,58,6
8,19,1013,77,24
9,22,1026,71,9


In [94]:
cnt = 0
ret = pd.DataFrame(numpy.zeros(1000), columns=['rain'])
for i in range(0, 999):
    if(df['min_temp'][i] < 25 and df['min_press'][i] < 1010 and df['humidity'][i] > 60 and df['wind_speed'][i] > 5):
        ret['rain'][i] = 1
cnt

0

In [97]:
ret.head(10)

Unnamed: 0,rain
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0
5,1.0
6,1.0
7,0.0
8,0.0
9,0.0


In [98]:
df_train = df[:700]
df_train.head()

Unnamed: 0,min_temp,min_press,humidity,wind_speed
0,28,985,85,24
1,25,1026,36,5
2,29,1028,54,5
3,27,1008,45,1
4,28,1021,74,11


In [99]:
df_test = df[700:]
df_test.head()

Unnamed: 0,min_temp,min_press,humidity,wind_speed
700,21,990,50,4
701,24,998,75,10
702,19,1012,54,13
703,21,985,78,10
704,26,1007,69,20


In [111]:
y = ret['rain'].values

y_train = y[:700]
y_test = y[700:]

In [107]:
dtree=DecisionTreeClassifier(max_depth=4)
dtree.fit(df_train,y_train)

dot_data = StringIO()
export_graphviz(dtree, 
                out_file=dot_data,  
                filled=True, 
                feature_names=list(df_train),
                class_names=['sun','rain'],
                special_characters=True)

graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
graph.write_pdf("tree.pdf")

True

In [108]:
dtree.feature_importances_

array([0.24826938, 0.31629488, 0.17871608, 0.25671967])

In [109]:
y_predict = dtree.predict(df_test)

y_predict

array([0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 1., 1., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
       1., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 1., 1., 0., 0.,
       0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 1., 0., 1., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 1., 0., 0., 0.,
       1., 0., 0., 0., 1., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 1., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
       1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 1.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0.,
       1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
       0., 0., 1., 0., 0.

In [112]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_predict)

1.0