**Importing all necessary libraries**


In [None]:
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Read the weather measurements from the CSV file into a DataFrame,
# then preview its first five rows.
data = pd.read_csv(filepath_or_buffer="/content/daily_weather.csv")
data.head(n=5)

Unnamed: 0,number,air_pressure_9am,air_temp_9am,avg_wind_direction_9am,avg_wind_speed_9am,max_wind_direction_9am,max_wind_speed_9am,rain_accumulation_9am,rain_duration_9am,relative_humidity_9am,relative_humidity_3pm
0,0,918.06,74.822,271.1,2.080354,295.4,2.863283,0.0,0.0,42.42,36.16
1,1,917.347688,71.403843,101.935179,2.443009,140.471548,3.533324,0.0,0.0,24.328697,19.426597
2,2,923.04,60.638,51.0,17.067852,63.7,22.100967,0.0,20.0,8.9,14.46
3,3,920.502751,70.138895,198.832133,4.337363,211.203341,5.190045,0.0,0.0,12.189102,12.742547
4,4,921.16,44.294,277.8,1.85666,136.5,2.863283,8.9,14730.0,92.41,76.74


In [None]:
# List the column labels of the loaded frame.
data.columns

Index(['number', 'air_pressure_9am', 'air_temp_9am', 'avg_wind_direction_9am',
       'avg_wind_speed_9am', 'max_wind_direction_9am', 'max_wind_speed_9am',
       'rain_accumulation_9am', 'rain_duration_9am', 'relative_humidity_9am',
       'relative_humidity_3pm'],
      dtype='object')

**Data Cleaning**

In [None]:
# True when at least one cell anywhere in the frame is missing.
data.isna().any().any()

True

In [None]:
# Number of missing values in each column.
data.isna().sum()

number                    0
air_pressure_9am          3
air_temp_9am              5
avg_wind_direction_9am    4
avg_wind_speed_9am        3
max_wind_direction_9am    3
max_wind_speed_9am        4
rain_accumulation_9am     6
rain_duration_9am         3
relative_humidity_9am     0
relative_humidity_3pm     0
dtype: int64

In [None]:
# Display only the rows in which at least one column is missing.
data.loc[data.isna().any(axis="columns")]

Unnamed: 0,number,air_pressure_9am,air_temp_9am,avg_wind_direction_9am,avg_wind_speed_9am,max_wind_direction_9am,max_wind_speed_9am,rain_accumulation_9am,rain_duration_9am,relative_humidity_9am,relative_humidity_3pm
16,16,917.89,,169.2,2.192201,196.8,2.930391,0.0,0.0,48.99,51.19
111,111,915.29,58.82,182.6,15.613841,189.0,,0.0,0.0,21.5,29.69
177,177,915.9,,183.3,4.719943,189.9,5.346287,0.0,0.0,29.26,46.5
262,262,923.596607,58.380598,47.737753,10.636273,67.145843,13.671423,0.0,,17.990876,16.461685
277,277,920.48,62.6,194.4,2.751436,,3.869906,0.0,0.0,52.58,54.03
334,334,916.23,75.74,149.1,2.751436,187.5,4.183078,,1480.0,31.88,32.9
358,358,917.44,58.514,55.1,10.021491,,12.705819,0.0,0.0,13.88,25.93
361,361,920.444946,65.801845,49.823346,21.520177,61.886944,25.549112,,40.364018,12.278715,7.618649
381,381,918.48,66.542,90.9,3.467257,89.4,4.406772,,0.0,20.64,14.35
409,409,,67.853833,65.880616,4.328594,78.570923,5.216734,0.0,0.0,18.487385,20.356594


In [None]:
# The 'number' column only numbers the rows, which pandas already does through
# its own index, so it is removed.
# drop(..., errors='ignore') keeps this cell safe to re-run: the previous
# `del data['number']` raises KeyError once the column is already gone.
data = data.drop(columns=['number'], errors='ignore')
data.columns

Index(['air_pressure_9am', 'air_temp_9am', 'avg_wind_direction_9am',
       'avg_wind_speed_9am', 'max_wind_direction_9am', 'max_wind_speed_9am',
       'rain_accumulation_9am', 'rain_duration_9am', 'relative_humidity_9am',
       'relative_humidity_3pm'],
      dtype='object')

In [None]:
# Remove every row that holds a null value, recording the row count
# before and after so the number of removed rows can be reported.
before_rows = len(data)
data = data.dropna(axis="index", how="any")
after_rows = len(data)

In [None]:
# Report how many rows the null filter removed.
dropped_rows = before_rows - after_rows
print("The number of dropped rows are {}".format(dropped_rows))

The number of dropped rows are 31



**Convert to a Classification task**

**Binarize relative_humidity_3pm to 0 or 1**

We add a new column, `high_humidity_label`, holding 0 or 1. This turns the task into a binary classification problem: using 24.99 as the threshold, a `relative_humidity_3pm` value strictly above it is labelled high (1), and a value at or below it is labelled low (0).

In [None]:
# Build a separate frame (the original `data` is left untouched) carrying the
# binary target: 1 when the 3pm relative humidity is strictly above 24.99,
# otherwise 0.
clean_data = data.assign(
    high_humidity_label=(data['relative_humidity_3pm'] > 24.99) * 1
)
print(clean_data['high_humidity_label'])

0       1
1       0
2       0
3       0
4       1
       ..
1090    1
1091    1
1092    1
1093    1
1094    0
Name: high_humidity_label, Length: 1064, dtype: int64


In [None]:
# Target kept as a single-column DataFrame, copied so that it is independent
# of clean_data.
y = clean_data.loc[:, ['high_humidity_label']].copy()
y

Unnamed: 0,high_humidity_label
0,1
1,0
2,0
3,0
4,1
...,...
1090,1
1091,1
1092,1
1093,1


In [None]:
# First five raw 3pm humidity readings, for comparison with the binary labels.
clean_data['relative_humidity_3pm'].iloc[:5]

0    36.160000
1    19.426597
2    14.460000
3    12.742547
4    76.740000
Name: relative_humidity_3pm, dtype: float64

In [None]:
# First five binary labels, matching the readings previewed just before.
y.head(n=5)

Unnamed: 0,high_humidity_label
0,1
1,0
2,0
3,0
4,1


Use 9am Sensor signals to predict Humidity at 3PM

In [None]:
# Candidate predictors are the columns whose name mentions the 9am reading.
time = '9am'
features = [column for column in clean_data.columns if time in column]

# Relative humidity at 9am is deliberately left out of the predictors.
features.remove('relative_humidity_9am')

features

['air_pressure_9am',
 'air_temp_9am',
 'avg_wind_direction_9am',
 'avg_wind_speed_9am',
 'max_wind_direction_9am',
 'max_wind_speed_9am',
 'rain_accumulation_9am',
 'rain_duration_9am']

In [None]:
# Make the data of these features as X
X = clean_data[features].copy()
X

Unnamed: 0,air_pressure_9am,air_temp_9am,avg_wind_direction_9am,avg_wind_speed_9am,max_wind_direction_9am,max_wind_speed_9am,rain_accumulation_9am,rain_duration_9am
0,918.060000,74.822000,271.100000,2.080354,295.400000,2.863283,0.0,0.0
1,917.347688,71.403843,101.935179,2.443009,140.471548,3.533324,0.0,0.0
2,923.040000,60.638000,51.000000,17.067852,63.700000,22.100967,0.0,20.0
3,920.502751,70.138895,198.832133,4.337363,211.203341,5.190045,0.0,0.0
4,921.160000,44.294000,277.800000,1.856660,136.500000,2.863283,8.9,14730.0
...,...,...,...,...,...,...,...,...
1090,918.900000,63.104000,192.900000,3.869906,207.300000,5.212070,0.0,0.0
1091,918.710000,49.568000,241.600000,1.811921,227.400000,2.371156,0.0,0.0
1092,916.600000,71.096000,189.300000,3.064608,200.800000,3.892276,0.0,0.0
1093,912.600000,58.406000,172.700000,3.825167,189.100000,4.764682,0.0,0.0


Perform the train/test split

In [None]:
# Hold out 33% of the rows for testing; the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=324,
)

In [None]:
# Summary statistics of the training labels; because the labels are 0/1,
# the mean is the fraction of training rows labelled 1.
y_train.describe()

Unnamed: 0,high_humidity_label
count,712.0
mean,0.494382
std,0.50032
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


In [None]:
# Summary statistics of each training feature.
X_train.describe()

Unnamed: 0,air_pressure_9am,air_temp_9am,avg_wind_direction_9am,avg_wind_speed_9am,max_wind_direction_9am,max_wind_speed_9am,rain_accumulation_9am,rain_duration_9am
count,712.0,712.0,712.0,712.0,712.0,712.0,712.0,712.0
mean,918.913897,65.194366,142.12333,5.568288,147.789373,7.089629,0.202399,282.884615
std,3.147923,11.210412,68.773699,4.467828,67.186609,5.486311,1.628988,1584.404987
min,907.99,36.752,15.5,0.782929,31.8,1.185578,0.0,0.0
25%,916.727792,57.3755,65.282862,2.304048,75.978656,3.168182,0.0,0.0
50%,919.0,65.967556,166.225018,3.958486,176.8,5.077854,0.0,0.0
75%,921.134993,73.9085,190.5,7.504934,201.4,9.104346,0.0,0.0
max,929.32,91.112,343.4,21.541732,299.2,26.351153,24.02,17704.0


Fit the model on the training set

In [None]:
# Decision tree capped at 10 leaf nodes, seeded so repeated fits are identical.
humidity_classifier = DecisionTreeClassifier(random_state=0, max_leaf_nodes=10)
humidity_classifier.fit(X=X_train, y=y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=10,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=0, splitter='best')

In [None]:
# Confirm the class of the fitted estimator object.
type(humidity_classifier)

sklearn.tree._classes.DecisionTreeClassifier

Test the model on the testing set

In [None]:
# Predict a label for every held-out row, then check the container type
# that predict() returns.
predictions = humidity_classifier.predict(X=X_test)
type(predictions)

numpy.ndarray

In [None]:
# Peek at the first ten predicted labels.
predictions[:10]

array([0, 0, 1, 1, 1, 1, 0, 0, 0, 1])

In [None]:
# First ten true labels of the test set, to compare against the predictions.
y_test[['high_humidity_label']].head(10)

Unnamed: 0,high_humidity_label
456,0
845,0
693,1
259,1
723,1
224,1
300,1
442,0
585,1
1057,1


Measure the accuracy of the model on the test set

In [None]:
# Fraction of test rows whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=predictions)

0.8153409090909091

Measuring the mean squared error (for 0/1 labels this equals the misclassification rate, i.e. 1 − accuracy)

In [None]:
# mean_squared_error is imported with the other sklearn helpers in the
# notebook's import cell rather than here.
# With 0/1 labels each squared error is 1 for a wrong prediction and 0 for a
# correct one, so this value is the misclassification rate (1 - accuracy).
mean_squared_error(y_test, y_pred=predictions)

0.1846590909090909