In [189]:
import pandas as pd
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [190]:
# Load the daily weather observations from the CSV next to this notebook.
data = pd.read_csv(filepath_or_buffer="./daily_weather.csv")

In [191]:
# Drop the "number" column because it only represents the index number of a row.
# drop() returns a new frame instead of mutating in place, and errors="ignore"
# keeps this cell safe to re-run (a second `del data['number']` raised KeyError).
data = data.drop(columns=["number"], errors="ignore")

In [192]:
# Clean the data: discard every row that contains at least one null value.
data = data.dropna(axis=0, how="any")

In [193]:
# Work on an independent (deep) copy of the cleaned frame from here on.
clean_data = data.copy(deep=True)

In [194]:
# Derive the binary target column "high_humidity_level" (the dependent feature):
#   1 when "relative_humidity_3pm" for the row is strictly greater than 28,
#   0 otherwise (i.e. 28 or lower).
# Multiplying the boolean mask by 1 converts True/False into 1/0.
clean_data["high_humidity_level"] = clean_data["relative_humidity_3pm"].gt(28) * 1

In [195]:
# y holds the dependent feature (kept as a one-column DataFrame).
y = clean_data.loc[:, ["high_humidity_level"]].copy()
y.head(n=5)

Unnamed: 0,high_humidity_level
0,1
1,0
2,0
3,0
4,1


In [196]:
# Names of the 9am measurement columns used as model inputs.
morning_features = [
    'air_pressure_9am',
    'air_temp_9am',
    'avg_wind_direction_9am',
    'avg_wind_speed_9am',
    'max_wind_direction_9am',
    'max_wind_speed_9am',
    'rain_accumulation_9am',
    'rain_duration_9am',
    'relative_humidity_9am',
]

In [197]:
# x holds the independent features: a copy of only the columns listed in
# morning_features (so neither relative_humidity_3pm nor the derived target
# is included).
x = clean_data.loc[:, morning_features].copy()

In [198]:
# Sanity check: display the column labels of the feature frame x.
x.columns

Index(['air_pressure_9am', 'air_temp_9am', 'avg_wind_direction_9am',
       'avg_wind_speed_9am', 'max_wind_direction_9am', 'max_wind_speed_9am',
       'rain_accumulation_9am', 'rain_duration_9am', 'relative_humidity_9am'],
      dtype='object')

In [199]:
# Sanity check: display the column labels of the target frame y.
y.columns

Index(['high_humidity_level'], dtype='object')

In [200]:
# Split the features and target into training and test sets:
# 33% of the rows are held out for testing, with a fixed seed so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.33,
    random_state=324,
)

In [201]:
# Initialise the decision tree and fit it on the training data.
# max_leaf_nodes=7 caps the size of the tree to prevent overfitting of the
# model; random_state is fixed so the fitted tree is reproducible.
humidity_classifier = DecisionTreeClassifier(
    random_state=0,
    max_leaf_nodes=7,
)
humidity_classifier.fit(X=X_train, y=y_train)

# humidity_classifier now holds the trained model

In [202]:
# Predict the humidity class for every row of the held-out test features.
y_predict = humidity_classifier.predict(X=X_test)

In [203]:
# Accuracy on the test set, expressed as a percentage.
100 * accuracy_score(y_true=y_test, y_pred=y_predict)

90.3409090909091

In [204]:
# Confusion matrix on the test set (rows: true class, columns: predicted class).
confusion_matrix(y_true=y_test, y_pred=y_predict)

array([[160,  18],
       [ 16, 158]], dtype=int64)