# **Imporintg Libraries**

In [None]:
!pip install -q pgmpy
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scipy
import warnings
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.exceptions import ConvergenceWarning
from pgmpy.models import BayesianNetwork
from pgmpy.estimators import BayesianEstimator
from pgmpy.estimators import MaximumLikelihoodEstimator
from pgmpy.inference import VariableElimination


warnings.filterwarnings("ignore")

# **Data Preprocessing**

**Importing Dataset**

In [None]:
dataset = pd.read_csv('weather.csv')
dataset.head(5)

Unnamed: 0.1,Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,0,1,13.4,22.9,0.6,5.609501,7.718384,13,44.0,13,...,71.0,22.0,1007.7,1007.1,8.0,4.491408,16.9,21.8,0,0
1,1,1,7.4,25.1,0.0,5.609501,7.718384,14,44.0,6,...,44.0,25.0,1010.6,1007.8,4.407404,4.491408,17.2,24.3,0,0
2,2,1,12.9,25.7,0.0,5.609501,7.718384,15,46.0,13,...,38.0,30.0,1007.6,1008.7,4.407404,2.0,21.0,23.2,0,0
3,3,1,9.2,28.0,0.0,5.609501,7.718384,4,24.0,9,...,45.0,16.0,1017.6,1012.8,4.407404,4.491408,18.1,26.5,0,0
4,4,1,17.5,32.3,1.0,5.609501,7.718384,13,41.0,1,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,0,0


**Dropping unnecessary Columns and removing null values**

In [None]:
dataset = dataset.drop(columns = ['Unnamed: 0'])
dataset = dataset.replace({'False': 0, 'True': 1})
dataset.dropna(inplace = True)
dataset.head(5)

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,1,13.4,22.9,0.6,5.609501,7.718384,13,44.0,13,14,...,71.0,22.0,1007.7,1007.1,8.0,4.491408,16.9,21.8,0,0
1,1,7.4,25.1,0.0,5.609501,7.718384,14,44.0,6,15,...,44.0,25.0,1010.6,1007.8,4.407404,4.491408,17.2,24.3,0,0
2,1,12.9,25.7,0.0,5.609501,7.718384,15,46.0,13,15,...,38.0,30.0,1007.6,1008.7,4.407404,2.0,21.0,23.2,0,0
3,1,9.2,28.0,0.0,5.609501,7.718384,4,24.0,9,0,...,45.0,16.0,1017.6,1012.8,4.407404,4.491408,18.1,26.5,0,0
4,1,17.5,32.3,1.0,5.609501,7.718384,13,41.0,1,7,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,0,0


# **Train set and Test set**

1. Splitting the dataset into training and testing sets.

2. Scaling the dataset for more efficient training.

In [None]:
X = dataset.drop(columns = ['RainTomorrow', 'Location'])
y = dataset['RainTomorrow']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 8)

scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# **Linear Models**

Linear models assume a linear relationship between input features and the target variable. Examples include Linear Regression and Logistic Regression. They are interpretable, computationally efficient, and work well when relationships are approximately linear.

**Logistic Regression vs Naive Bayes**

Logistic Regression is a linear model that predicts probabilities based on input features, suitable for linear relationships. Naive Bayes is a probabilistic model assuming feature independence, often used for text classification. Logistic Regression offers interpretability, while Naive Bayes provides computational efficiency, particularly for high-dimensional data like text.

## **Logistic Regression**

Before tuning Hyperparameters for Logistic Regression.

In [None]:
lr_before = LogisticRegression(max_iter = 1500, solver = 'liblinear')
lr_before.fit(X_train, y_train)
y_pred_before = lr_before.predict(X_test)
accuracy_before = accuracy_score(y_test, y_pred_before)
print("Accuracy before hyperparameter tuning:", round(accuracy_before * 100, 2), "%")

Accuracy before hyperparameter tuning: 84.46 %


After tuning Hyperparameters.

In [None]:
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}

lr_after = GridSearchCV(LogisticRegression(max_iter = 2000, solver = 'liblinear'), param_grid, cv=5)
lr_after.fit(X_train, y_train)
y_pred_after = lr_after.predict(X_test)
accuracy_after = accuracy_score(y_test, y_pred_after)
print("Accuracy after hyperparameter tuning:", round(accuracy_after * 100, 2), "%")
print("Best hyperparameters:", lr_after.best_params_)

Accuracy after hyperparameter tuning: 84.46 %
Best hyperparameters: {'C': 0.1}


## **Naive Bayes Classifier**

Before Smoothing the data.

In [None]:
gnb_before = GaussianNB()
gnb_before.fit(X_train, y_train)
y_pred_before = gnb_before.predict(X_test)
accuracy_before = accuracy_score(y_test, y_pred_before)
print("Accuracy before smoothing:", round(accuracy_before * 100, 2), "%")

Accuracy before smoothing: 79.06 %


After Smoothing the data.

In [None]:
param_grid = {'var_smoothing': np.logspace(0,-9, num=100)}

gnb_after = GridSearchCV(GaussianNB(), param_grid, cv=5)
gnb_after.fit(X_train, y_train)
y_pred_after = gnb_after.predict(X_test)
accuracy_after = accuracy_score(y_test, y_pred_after)
print("Accuracy after smoothing:", round(accuracy_after * 100, 2), "%")
print("Best hyperparameters:", gnb_after.best_params_)

Accuracy after smoothing: 83.5 %
Best hyperparameters: {'var_smoothing': 0.3511191734215131}


# **Non Linear Models**

Non-linear models capture complex relationships between variables, unlike linear models. Examples include Decision Trees, Neural Networks, and Support Vector Machines with non-linear kernels. They are suitable for data with intricate patterns and interactions.

**Decision Trees vs Neural Networks**

Decision Trees are interpretable, non-linear models suitable for tabular data. Neural Networks are complex, non-linear models capable of learning intricate patterns but less interpretable. Decision Trees excel with smaller datasets and interpretable rules, while Neural Networks perform well with large datasets and complex relationships but require more computational resources.

## **Decision Tree Classifier**

Before tuning Hyperparameters.

In [None]:
warnings.filterwarnings("ignore")
dt_before = DecisionTreeClassifier(random_state=42)
dt_before.fit(X_train, y_train)
y_pred_before = dt_before.predict(X_test)
accuracy_before = accuracy_score(y_test, y_pred_before)
print("Accuracy before hyperparameter tuning:", round(accuracy_before * 100, 2), "%")

Accuracy before hyperparameter tuning: 78.37 %


After tuning Hyperparameters.

In [None]:
param_grid = {
    'max_depth': [None, 5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2']
}

dt_after = GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid, cv=5)
dt_after.fit(X_train, y_train)
y_pred_after = dt_after.predict(X_test)
accuracy_after = accuracy_score(y_test, y_pred_after)
print("Accuracy after hyperparameter tuning:", round(accuracy_after * 100, 2), "%")
print("Best hyperparameters:", dt_after.best_params_)

Accuracy after hyperparameter tuning: 83.42 %
Best hyperparameters: {'max_depth': 10, 'max_features': 'auto', 'min_samples_leaf': 2, 'min_samples_split': 5}


## **Neural Networks**

Before tuning Hyperparameters.

In [None]:
mlp_before = MLPClassifier(random_state=42)
mlp_before.fit(X_train_scaled, y_train)
y_pred_before = mlp_before.predict(X_test_scaled)
accuracy_before = accuracy_score(y_test, y_pred_before)
print("Accuracy before hyperparameter tuning:", accuracy_before)


Accuracy before hyperparameter tuning: 0.8554384376616658


After tuning Hyperparameters.

In [None]:
# Define hyperparameters grid
param_grid =  {
    'hidden_layer_sizes': [(50,),(100,)]
}

# Neural Network after tuning hyperparameters
mlp_after = GridSearchCV(MLPClassifier(random_state=42), param_grid,n_jobs=-1, cv=5)
mlp_after.fit(X_train_scaled, y_train)
y_pred_after = mlp_after.predict(X_test_scaled)
accuracy_after = accuracy_score(y_test, y_pred_after)
print("Accuracy after hyperparameter tuning:", accuracy_after)
print("Best hyperparameters:", mlp_after.best_params_)

Accuracy after hyperparameter tuning: 0.857087428867046
Best hyperparameters: {'hidden_layer_sizes': (50,)}


# **Hybrid Models**

Hybrid models combine elements of different types of models, such as combining neural networks with decision trees or blending linear and non-linear models. They aim to leverage the strengths of each component to improve overall performance and interpretability.

**Support Vector Machines vs Bayesian Networks**

Support Vector Machines (SVMs) find optimal hyperplanes for classification, suitable for high-dimensional data. Bayesian Networks model probabilistic dependencies between variables using graph structures, offering insights into causality. SVMs are powerful for classification, while Bayesian Networks excel in probabilistic reasoning and causal inference.

## **Support Vector Machines**

Before tuning Hyperparameter.

In [None]:
svm_before = SVC(random_state=42)
svm_before.fit(X_train_scaled, y_train)
y_pred_before = svm_before.predict(X_test_scaled)
accuracy_before = accuracy_score(y_test, y_pred_before)
print("Accuracy before hyperparameter tuning:", round(accuracy_before * 100, 2), "%")

After tuning Hyperparameter.

In [None]:
param_grid = {'C': [0.1, 1, 10, 100],
              'gamma': [1, 0.1, 0.01, 0.001]}

svm_after = GridSearchCV(SVC(), param_grid, cv=5)
svm_after.fit(X_train_scaled1, y_train)
y_pred_after = svm_after.predict(X_test_scaled1)
accuracy_after = accuracy_score(y_test, y_pred_after)
print("Accuracy after hyperparameter tuning:", round(accuracy_after * 100, 2), "%")
print("Best hyperparameters:", svm_after.best_params_)

## **Bayesian Networks**

Before tuning Hyperparameters.

In [None]:
X = dataset[['RainToday', 'MaxTemp', 'Rainfall','RainTomorrow']]
print("Attributes passed to the model",X.columns.tolist())

X_train, X_test = train_test_split(X, test_size=0.2, random_state=42)

# Create the structure of the simplified Bayesian Network
model = BayesianNetwork([
    ('RainToday', 'RainTomorrow'),
    ('MaxTemp', 'RainTomorrow'),
    ('Rainfall', 'RainTomorrow')
])

# Estimate the parameters of the model using BayesianEstimator
model.fit(X_train, estimator=BayesianEstimator, n_jobs=-1)

Rain_infer = VariableElimination(model)

print('\n 1. Probability of RainTomorrow given RainToday= 1')
q1=Rain_infer.query(variables=['RainTomorrow'],evidence={'RainToday':1})
print(q1)

Attributes passed to the model ['RainToday', 'MaxTemp', 'Rainfall', 'RainTomorrow']

 1. Probability of RainTomorrow given RainToday= 1
+-----------------+---------------------+
| RainTomorrow    |   phi(RainTomorrow) |
| RainTomorrow(0) |              0.5092 |
+-----------------+---------------------+
| RainTomorrow(1) |              0.4908 |
+-----------------+---------------------+


# **Inference**

1. **Linear models:**

      It can be observed that Logistic Regression gives more accuracy than Naive

      Accuracy for Logistic Regression: **84.46 %**

      Accuracy for Naive Bayes Classifier: **83.5 %**

2. **Non Linear Models:**

      It can be observed that Neural network gives higher accuracy than Decision Tree Classifier

      Accuracy for Decesion Tree Classifier: **83.42 %**

      Accuracy for Nerual Networks: **85.7%**

3. **Hybrid Models:**

      In case of Hybrid Models it can be observed that Support Vector Machines gives Higher Accuracy compared to Bayes Network

      Accuracy for Support Vector Machine: **85.66%**

      Accuracy for Bayes Network: **51.24**





