<a href="https://colab.research.google.com/github/DataSayant1st/MachineLearning_and_AI/blob/main/Decision_Trees_Weather.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Weather Forecast with Decision Tree Regression


In [None]:
# Libraries

import pandas as pd
import numpy as np


In [None]:
df = pd.read_csv('weatherHistory.csv')
df.head()

In [None]:
# droping unnecessary columns
df = df.drop(columns =['Formatted Date','Daily Summary','Summary'])

In [None]:
df

In [None]:
df.dtypes

In [None]:
# One-hot encoding for precipitation type

one_hot_encoding = pd.get_dummies(df['Precip Type'])
one_hot_encoding.describe()

In [None]:
# concatenation
df = pd.concat([df, one_hot_encoding], axis = 1)
df = df.drop(columns = ['Precip Type', 'Apparent Temperature (C)','Cloud Cover',])

In [None]:
df.head()

In [None]:
df.dtypes

In [None]:
# Creating variables

X = df.drop(columns = ['Temperature (C)'])
y = pd.DataFrame(df['Temperature (C)'])

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.33, random_state = 87)

# Trying Linear regression with Elastic Net Regularisation

In [None]:
# Apparently it kinda tries to fit into a linear model; fitting the points to the line. It's kinda like linear regression with regularisation

In [None]:
# In summary, regularisation helps to prevent overfitting by penalising complex models WHILE normalisation is a pre-processing scaling step to make sure that the needed data points get the opportunity to contribute


In [None]:
from sklearn.linear_model import ElasticNet
from sklearn.metrics import r2_score, explained_variance_score

In [None]:
elastic_net = ElasticNet(random_state = 63, max_iter = 10000)
elastic_net.fit(X_train, y_train)
# random state is kinda like throwing dice, to make sure that our split is as random as possible

In [None]:
# making predictions
y_hat = elastic_net.predict(X_test)

In [None]:
R2 = r2_score(y_true = y_test, y_pred = y_hat)
exp_var = explained_variance_score(y_true = y_test, y_pred =y_hat)

print(R2, exp_var)

In [None]:
X_train.describe()

# Making a decision Tree Regressor

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, explained_variance_score
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt

In [None]:
decision_tree = DecisionTreeRegressor(max_depth = 10, criterion = "squared_error", random_state = 11)
decision_tree.fit(X_train, y_train)


In [None]:
y_hat = decision_tree.predict(X_test)

R2 = r2_score(y_true = y_test, y_pred = y_hat)
exp_var = explained_variance_score(y_true = y_hat, y_pred = y_hat)

In [None]:
print(R2, exp_var)


In [None]:
# we experimented with simple linear regression model WITHOUT regularisation and it performed better

In [None]:
plt.figure(figsize = (10,10))
p = plot_tree(decision_tree,
             feature_names= X_test.columns.tolist(),
             filled = True,
             proportion = True,
             rounded = True)

In [None]:
len(X_train.keys())

# it will give u the number of features / columns in our training dataset

In [None]:
plt.figure()
plt.plot(y_test,y_hat, '*r')
plt.xlabel('y_true')
plt.ylabel('y_predicted')

x = np.linspace(-20,40)
plt.plot(x,x,'k')


# Partial dependency plot

In [None]:
from sklearn.inspection import PartialDependenceDisplay

In [None]:
features = [0,1,2,3,4,5,6]
# seven items because we have seven features in our dataset

fig, ax = plt.subplots(figsize = (5,5))
PartialDependenceDisplay.from_estimator(decision_tree, X_test, features, ax = ax)

# seeing how different features influence the output of our model

# from the below graphs, humidity had the biggest impact to our model

In [None]:
# we can do the same above step for Linear regression. Linear Reghression model showed that all features contributed a lot not just humidity

In [None]:
features = [(0, 4)]

PartialDependenceDisplay.from_estimator(decision_tree, X_test, features)

# Decision Tree Classifier

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split


In [None]:
df = pd.read_csv('weatherHistory.csv')
df = df.drop(columns =['Formatted Date','Daily Summary','Summary', 'Apparent Temperature (C)', 'Cloud Cover'])
df.head

In [None]:
df.dtypes

In [None]:
df["Precip Type"].isnull().sum()

In [None]:
df["Precip Type"].fillna("none", inplace=True)

In [None]:
pd.get_dummies(df["Precip Type"])

Unnamed: 0,none,rain,snow
0,False,True,False
1,False,True,False
2,False,True,False
3,False,True,False
4,False,True,False
...,...,...,...
96448,False,True,False
96449,False,True,False
96450,False,True,False
96451,False,True,False


In [None]:
df.dtypes

Precip Type                object
Temperature (C)           float64
Humidity                  float64
Wind Speed (km/h)         float64
Wind Bearing (degrees)    float64
Visibility (km)           float64
Pressure (millibars)      float64
dtype: object

In [None]:
X = df.drop(columns = ["Precip Type"])
y = df["Precip Type"].astype('category').cat.codes

class_names = df["Precip Type"].astype('category').cat.categories

print(X.shape)
print(y.shape)
print(class_names)

(96453, 6)
(96453,)
Index(['none', 'rain', 'snow'], dtype='object')


In [None]:
X.head

<bound method NDFrame.head of        Temperature (C)  Humidity  Wind Speed (km/h)  Wind Bearing (degrees)  \
0             9.472222      0.89            14.1197                   251.0   
1             9.355556      0.86            14.2646                   259.0   
2             9.377778      0.89             3.9284                   204.0   
3             8.288889      0.83            14.1036                   269.0   
4             8.755556      0.83            11.0446                   259.0   
...                ...       ...                ...                     ...   
96448        26.016667      0.43            10.9963                    31.0   
96449        24.583333      0.48            10.0947                    20.0   
96450        22.038889      0.56             8.9838                    30.0   
96451        21.522222      0.60            10.5294                    20.0   
96452        20.438889      0.61             5.8765                    39.0   

       Visibility (km

In [None]:
y.describe()

count    96453.000000
mean         1.105699
std          0.324420
min          0.000000
25%          1.000000
50%          1.000000
75%          1.000000
max          2.000000
dtype: float64

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.33, random_state = 82)

# Model Training

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
tree_classifier = DecisionTreeClassifier(max_depth = 9, criterion = 'gini')
tree_classifier.fit(X_train,y_train)
# max_depth parameter controls the maximum depth of the tree; controls the number of nodes

# Model evaluation

In [None]:
from sklearn.metrics import accuracy_score, recall_score, f1_score
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt

In [None]:
acc = accuracy_score(y_true = y_test, y_pred = y_hat)
rec = recall_score(y_true = y_test, y_pred = y_hat, average = None)
f1 = f1_score(y_true=y_test, y_pred = y_hat, average = None)

print("ACC: ", acc, "Recall: ", rec, " F1:", f1)

ValueError: Classification metrics can't handle a mix of multiclass and continuous targets

In [None]:
plt.figure(figsize = (8,8))

plot_tree(tree_classifier,
         class_names = class_names,
         feature_names = X_test.keys(),
         filled = True,
         proportion = True,
         rounded = True
         )
plt.show()


# this tree is assymetrical compared to the LinReg tree

InvalidParameterError: The 'feature_names' parameter of plot_tree must be an instance of 'list' or None. Got Index(['Temperature (C)', 'Humidity', 'Wind Speed (km/h)',
       'Wind Bearing (degrees)', 'Visibility (km)', 'Pressure (millibars)'],
      dtype='object') instead.

<Figure size 800x800 with 0 Axes>