In [1]:
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# read data

In [2]:
# Load the daily weather observations; the relative path is resolved against
# the kernel's current working directory.
data = pd.read_csv('./daily_weather.csv')
# Preview the first rows to eyeball column names and value ranges.
data.head()

Unnamed: 0,number,air_pressure_9am,air_temp_9am,avg_wind_direction_9am,avg_wind_speed_9am,max_wind_direction_9am,max_wind_speed_9am,rain_accumulation_9am,rain_duration_9am,relative_humidity_9am,relative_humidity_3pm
0,0,918.06,74.822,271.1,2.080354,295.4,2.863283,0.0,0.0,42.42,36.16
1,1,917.347688,71.403843,101.935179,2.443009,140.471548,3.533324,0.0,0.0,24.328697,19.426597
2,2,923.04,60.638,51.0,17.067852,63.7,22.100967,0.0,20.0,8.9,14.46
3,3,920.502751,70.138895,198.832133,4.337363,211.203341,5.190045,0.0,0.0,12.189102,12.742547
4,4,921.16,44.294,277.8,1.85666,136.5,2.863283,8.9,14730.0,92.41,76.74


In [3]:
# List the column names of the freshly loaded frame.
data.columns

Index(['number', 'air_pressure_9am', 'air_temp_9am', 'avg_wind_direction_9am',
       'avg_wind_speed_9am', 'max_wind_direction_9am', 'max_wind_speed_9am',
       'rain_accumulation_9am', 'rain_duration_9am', 'relative_humidity_9am',
       'relative_humidity_3pm'],
      dtype='object')

In [4]:
# (rows, columns) before any cleaning.
data.shape

(1095, 11)

# clean data

In [10]:
# Shape of the subset of rows that have a missing value in at least one column.
data.loc[data.isna().any(axis='columns')].shape

(31, 11)

In [11]:
# Remove the 'number' column so it is not carried into the feature set
# (it appears to be a row counter: it matches the index in the preview).
# drop() returns a new frame instead of mutating in place, and
# errors='ignore' makes re-running this cell a no-op rather than a KeyError.
data = data.drop(columns=['number'], errors='ignore')

In [12]:
# Confirm 'number' no longer appears among the columns.
data.columns

Index(['air_pressure_9am', 'air_temp_9am', 'avg_wind_direction_9am',
       'avg_wind_speed_9am', 'max_wind_direction_9am', 'max_wind_speed_9am',
       'rain_accumulation_9am', 'rain_duration_9am', 'relative_humidity_9am',
       'relative_humidity_3pm'],
      dtype='object')

In [13]:
# Keep only fully populated rows: any row containing a missing value is dropped.
data = data.dropna(axis=0, how='any')

In [14]:
# (rows, columns) after removing 'number' and the rows with missing values.
data.shape

(1064, 10)

# classification task

In [15]:
# Cut-off on relative_humidity_3pm: rows strictly above it are labelled 1
# ("high humidity"), all others 0. Named so the threshold is tunable in one place.
HIGH_HUMIDITY_THRESHOLD = 24.99

# assign() returns a new frame, so `data` itself is left without the label column.
clean_data = data.assign(
    high_humi_label=(
        data['relative_humidity_3pm'] > HIGH_HUMIDITY_THRESHOLD
    ).astype(int)  # boolean mask -> 0/1 integers
)
# target label
clean_data['high_humi_label']

0       1
1       0
2       0
3       0
4       1
       ..
1090    1
1091    1
1092    1
1093    1
1094    0
Name: high_humi_label, Length: 1064, dtype: int64

In [21]:
# Select with a list of names so the target stays a one-column DataFrame
# (not a Series); copy so later edits to y cannot touch clean_data.
y = clean_data.loc[:, ['high_humi_label']].copy()
y

Unnamed: 0,high_humi_label
0,1
1,0
2,0
3,0
4,1
...,...
1090,1
1091,1
1092,1
1093,1


In [17]:
# Start the feature frame as an independent copy of clean_data so that column
# removals on x do not touch clean_data.
x = clean_data.copy()
# Columns present before the target-related ones are removed.
x.columns

Index(['air_pressure_9am', 'air_temp_9am', 'avg_wind_direction_9am',
       'avg_wind_speed_9am', 'max_wind_direction_9am', 'max_wind_speed_9am',
       'rain_accumulation_9am', 'rain_duration_9am', 'relative_humidity_9am',
       'relative_humidity_3pm', 'high_humi_label'],
      dtype='object')

In [18]:
# Remove the non-feature columns in one non-mutating call:
#   - high_humi_label: the target itself
#   - relative_humidity_3pm: the column the target is derived from (would leak the label)
#   - relative_humidity_9am: also excluded  # NOTE(review): confirm this exclusion is intentional
# errors='ignore' keeps the cell re-runnable; the original `del` statements
# raised KeyError on a second execution.
x = x.drop(
    columns=['relative_humidity_3pm', 'relative_humidity_9am', 'high_humi_label'],
    errors='ignore',
)

In [19]:
# Remaining columns are the model's input features.
x.columns

Index(['air_pressure_9am', 'air_temp_9am', 'avg_wind_direction_9am',
       'avg_wind_speed_9am', 'max_wind_direction_9am', 'max_wind_speed_9am',
       'rain_accumulation_9am', 'rain_duration_9am'],
      dtype='object')

In [22]:
# y is a one-column frame holding only the target label.
y.columns

Index(['high_humi_label'], dtype='object')

# split test and train

In [23]:
# Hold out 33% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.33,
    random_state=324,
)

In [24]:
# Check the container type the split returned for the training features.
type(x_train)

pandas.core.frame.DataFrame

In [25]:
# Summary statistics of the training labels; because the label is 0/1, the
# mean is the fraction of positive (high-humidity) rows, i.e. the class balance.
y_train.describe()

Unnamed: 0,high_humi_label
count,712.0
mean,0.494382
std,0.50032
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


# training process

In [26]:
# Decision tree capped at 10 leaf nodes to keep the model small;
# random_state is fixed so repeated fits on the same data are reproducible.
humidity_classifier = DecisionTreeClassifier(max_leaf_nodes=10, random_state=0)

In [27]:
# Inspect the estimator's class.
type(humidity_classifier)

sklearn.tree.tree.DecisionTreeClassifier

In [29]:
# Fit the tree on the training split (x_train features, y_train labels).
humidity_classifier.fit(x_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=10,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=0, splitter='best')

# predict on test set

In [30]:
# Predicted labels for the held-out test features.
pred = humidity_classifier.predict(x_test)

In [32]:
# Sanity check: expect one prediction per test row.
pred.shape

(352,)

# measure accuracy

In [34]:
# True test labels as a Series; the index values are the original row labels.
y_test['high_humi_label']

456    0
845    0
693    1
259    1
723    1
      ..
46     1
116    0
799    0
350    1
279    0
Name: high_humi_label, Length: 352, dtype: int64

In [35]:
# Container type of the predictions, checked before comparing element-wise.
type(pred)

numpy.ndarray

In [37]:
# .values exposes the labels' underlying array so they can be compared with pred.
type(y_test['high_humi_label'].values)

numpy.ndarray

In [42]:
# Manual accuracy: the fraction of test rows whose prediction equals the label.
# Written as an explicit equality because the previous `1 - pred^labels` form
# parses as `(1 - pred) ^ labels` (binary minus binds tighter than `^`) and is
# only a valid match indicator while both arrays are strictly 0/1.
(pred == y_test['high_humi_label'].values).mean()

0.8153409090909091

In [44]:
# Cross-check the manual figure with scikit-learn's accuracy metric
# (arguments are, in order, the true labels and the predictions).
accuracy_score(y_test, pred)

0.8153409090909091