In [None]:
#import and install essential modules
#also to create DL models we need GPU that is inbuilt in GoogleColab

In [3]:
import numpy as np
import tensorflow as tf #The Engine

from tensorflow import keras #The driving Wheel or Creates neural networks using TensorFlow as it is built on TensorFlow
from sklearn.model_selection import train_test_split #to train models and testing
from tensorflow.keras import layers #the neural layers
import pandas as pd


In [4]:
#dummy data
df = pd.DataFrame({
    "soil_moisture": [0.10, 0.15, 0.20, 0.25, 0.40, 0.60, 0.35, 0.18,
                      0.45, 0.05, 0.80, 0.27, 0.55, 0.70, 0.12, 0.30],
    "temperature_c": [34, 30, 26, 22, 28, 30, 19, 22,
                      35, 24, 33, 33, 21, 25, 20, 29],
    "sunlight_hours": [9, 8, 7, 4, 8, 10, 3, 10,
                       12, 5, 9, 11, 2, 6, 1, 9],
    "needs_water": [1, 1, 1, 0, 0, 0, 0, 1,
                    0, 1, 0, 1, 0, 0, 1, 1]
})

In [5]:
df.head() #deep learning needs huge data to train but this is just for implementation tut

Unnamed: 0,soil_moisture,temperature_c,sunlight_hours,needs_water
0,0.1,34,9,1
1,0.15,30,8,1
2,0.2,26,7,1
3,0.25,22,4,0
4,0.4,28,8,0


In [6]:
df

Unnamed: 0,soil_moisture,temperature_c,sunlight_hours,needs_water
0,0.1,34,9,1
1,0.15,30,8,1
2,0.2,26,7,1
3,0.25,22,4,0
4,0.4,28,8,0
5,0.6,30,10,0
6,0.35,19,3,0
7,0.18,22,10,1
8,0.45,35,12,0
9,0.05,24,5,1


In [7]:
#also one thing to consider we need to scale features before train_test_split in real projects
#noramlization 0-1 (for deep learning using min-max scaler)
#standatization -> mean 0 and values b/w -3 and 3
#min-max scaler 0 and 1

In [8]:
x = df[['soil_moisture' ,'temperature_c' ,'sunlight_hours']]
y=df['needs_water']

In [9]:
x_min = x.min()
x_max = x.max()
x_scaled = (x - x_min) / (x_max - x_min - 1e-8)

In [10]:
x_scaled #this is because deep neural network works better on this kin of values

Unnamed: 0,soil_moisture,temperature_c,sunlight_hours
0,0.066667,0.9375,0.727273
1,0.133333,0.6875,0.636364
2,0.2,0.4375,0.545455
3,0.266667,0.1875,0.272727
4,0.466667,0.5625,0.636364
5,0.733333,0.6875,0.818182
6,0.4,0.0,0.181818
7,0.173333,0.1875,0.818182
8,0.533333,1.0,1.0
9,0.0,0.3125,0.363636


In [11]:
x_train, x_test, y_train, y_test = train_test_split(x_scaled , y, test_size=0.25, random_state=42) #training and testing data


In [12]:
#now we will cearte model using Keras which uses TensorFlow

In [13]:
model = keras.Sequential([
    #create layers for inputs
    layers.Input(shape=(x_train.shape[1],  )),
    #hidden layers
    layers.Dense(8 , activation='relu'), #How much hidden neurons in form of exponent of 2 , and all the hidden neurons will be connected to each of the input layers and also there can be multiple hidden layers
    #activation function for Hidden layer ->best one is ReLU so far

    #output layers (also uses Dense)
    layers.Dense(1 , activation='sigmoid') #one output neuron and 'sidmoid' for binary classification
]) #also all the parameters in the form of list

In [14]:
#we have just created a model and no data is provide till now

In [15]:
#now till tell the model which optimizer to be used 'loss' calculation parameter and evaluation calculation metric.

In [16]:
model.compile(
    optimizer='sgd', #we haven't learned about this so will use simple gradient decent(sgt)
    loss='binary_crossentropy', #we haven't learned about this so will use binary_crossentroy
    metrics=['accuracy'] #accuracy_score
)


In [17]:
#now we will see how the forward and backward propogation works

In [18]:
#1 Epoch

In [19]:
history = model.fit(
    x_train.values  , #neural network works in form of array so needs to pass data in form of array not as a complete dataset
    y_train.values ,
    validation_data=[x_test.values , y_test.values] ,
    epochs = 100, #100 forward + 100 Backward or training 100 times
    batch_size = 4 , #4 batch means take 4 data points at once for training at a time this reduces load on your machine, thing as taking 10000 data-rows f0r 100 epochs it will train on 10000*100 (training on each row) times so by taking 4 data in a batch it only will have to train (10000/4)*100 (training on 4 rows at a time)
    #so batch_size to reduce the weight on the computer
    verbose = 1 #see whats happening during training training (you can check with different values)
)


Epoch 1/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 118ms/step - accuracy: 0.4583 - loss: 0.6987 - val_accuracy: 0.7500 - val_loss: 0.6508
Epoch 2/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step - accuracy: 0.3646 - loss: 0.7067 - val_accuracy: 0.7500 - val_loss: 0.6516
Epoch 3/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step - accuracy: 0.5521 - loss: 0.6780 - val_accuracy: 0.7500 - val_loss: 0.6525
Epoch 4/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step - accuracy: 0.4583 - loss: 0.6870 - val_accuracy: 0.7500 - val_loss: 0.6533
Epoch 5/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step - accuracy: 0.4688 - loss: 0.6915 - val_accuracy: 0.7500 - val_loss: 0.6540
Epoch 6/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - accuracy: 0.5000 - loss: 0.6955 - val_accuracy: 0.7500 - val_loss: 0.6547
Epoch 7/100
[1m3/3[0m [32m━━━━━━━━━━

In [20]:
#this ends here Now we will Move Forward

In [21]:
#now see what type of Loss Function For different types of scenarios:

In [22]:
#Types of Loss function:
  #Prediction/Regression based
    #MSE (Mean squared error) using for classification problem With batch_size = 1 for training of model And for batch_size > 1 use cost function (also MSE are used where prediction errors can be major)
    #MAE (Mean Absolute error) using for classification problem ((also MSE are used where we have to deal with prediction errors that are not major where errors can be major use MSE))
    #Huber loss (combination of both the MSE and MAE) when dealing with small errored prediction use MAE and when dealing With major errored predictions use MSE :. but you need to tune hyperparameters here explicitely like small error range values and also for big error.
    #MSLE (Mena squared logarithmic error) #use when we are predicting growths ex. Stock price growth , financial growth predictions etc.
      #for all the tasks using prediction based loss functions We'll use linear activation functions

  #classification based loss functions
    #binary classification -> binary_crossentropy with sigmoid activation function
    #categorial classification -> categorial_entropy with softmax activation function (here we need to apply One-hot-encoding first and categories will be formed as seperate features )(no. of output neurons => no. of categories)
    #sparsed category classification -> sparse_categorial_entropy with softmax activation function (no. of output neurons => no. of categories)


In [23]:
#Types of Optimizers: (to update weights and optimize outputs...)
  # /.Ex.. Consider the model will be trained in 100 epochs And in dataset there are 100000 rows:
    #Batch-Gradient-Descent : Send all rows at once to train model:
      # means model will update its weights 100 times in 100 epochs/
    #Stochastic-Gradient-Descent : Send one-one rows at once to train model: (1 epoch ---> 100000 updated)
      # means model will update its weights 100*100000 times in 100 epochs/
    #The Practical App. (Mini-Batch-Descent) : Send multiple rows according to batch_size (32, 64, 128....) in a batch to train model: (Lets say batch size of 100)
      #so batch sixe of 100 for 100000 rows (:. model will be trained in 100000/100 ==> 1000 times)
      # means model will update its weights 100000/100====>1000 times in 100 epochs/
    #Monotonic-Gradient-Descent : StochasticGD with Added Momentum(The gradient curve from local minima starts slow and noisy , then gradually takes momentum and reaches global minima bu smoothing the curve)
      #calculating gradient descent But now with momentum
    #AdaGrad : a more practical approach where we imporve optimization of our updates of weights using adaptive learning rate.
      #In it the learning on the gradient descent starts with higher learning rate and as it moves towards the global minima the learning rate decreases (That is fewer gaps b/w cost points on the gradient descent plane.)
      #the only thing we don't have in Adagrad is the momentum. like MonotonicGD.
      #this is the version of SGD which has noise but with adaptive Learning Rate (Starts with Big Noisy-LearningRate and gradually Decreases the LearningRate as well as Noise)
    #AdamOptimizer : now this is the final optimizer here we have which simply adds Momentum To the AdaGrad.
      #Adagrad + Momentum(Velocity)
      #So AdamOptimizer is the latest optimizer Which is currently most widely used....
      #So it starts with Big LearningRate + BigMomentum + NoisyUpdates Then reaches towards global Minima on the gradient descent place with less LearningRate + LessMomentum + Less NoisyUpdates

#these are all gradient descent optimizer types that just uses different approaches......

In [26]:
model.compile(
    optimizer='sgd',
    loss='binary_crossentropy',
    metrics=['accuracy'] #accuracy_score
)



In [29]:
history_batch = model.fit(
    x_train.values  ,
    y_train.values ,
    validation_data=[x_test.values , y_test.values] ,
    epochs = 100,
    batch_size = len(x_train) ,
    verbose = 1
)
#Batch-Gradient-Descent : slow when data is Huge and requires lots of RAM/VRAM


history_Stochastic = model.fit(
    x_train.values  ,
    y_train.values ,
    validation_data=[x_test.values , y_test.values] ,
    epochs = 100,
    batch_size = 1 ,
    verbose = 1
)
#Stochastic-Gradient-Descent : Updates are very noisy , slower to reach exact minimum compared to mini_batch

history_mini_batch = model.fit(
    x_train.values  ,
    y_train.values ,
    validation_data=[x_test.values , y_test.values] ,
    epochs = 100,
    batch_size = 4,
    verbose = 1
)
#Mini-Batch-Gradient-Descent : balance between BAtchGD And StochasticGD (less noisy than StochasticGD)



Epoch 1/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 214ms/step - accuracy: 1.0000 - loss: 0.1837 - val_accuracy: 0.7500 - val_loss: 0.2429
Epoch 2/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 172ms/step - accuracy: 1.0000 - loss: 0.1837 - val_accuracy: 0.7500 - val_loss: 0.2429
Epoch 3/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 321ms/step - accuracy: 1.0000 - loss: 0.1836 - val_accuracy: 0.7500 - val_loss: 0.2428
Epoch 4/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 161ms/step - accuracy: 1.0000 - loss: 0.1836 - val_accuracy: 0.7500 - val_loss: 0.2428
Epoch 5/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 158ms/step - accuracy: 1.0000 - loss: 0.1836 - val_accuracy: 0.7500 - val_loss: 0.2427
Epoch 6/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 152ms/step - accuracy: 1.0000 - loss: 0.1835 - val_accuracy: 0.7500 - val_loss: 0.2427
Epoch 7/100
[1m1/1[0m [32m━━━━━

In [30]:
#for the Implementation of Monotonic Gradient Descent
#we need
from tensorflow.keras import optimizers

In [31]:
opt = optimizers.SGD(learning_rate=0.01 , momentum=0.9)

In [33]:
from tensorflow.keras.optimizers import SGD

# Define optimizer with Momentum
opt = SGD(learning_rate=0.01, momentum=0.9)

# Compile model
model.compile(
    optimizer=opt,
    loss='binary_crossentropy',
    metrics=['accuracy']
)

# Train model
history_MGD = model.fit(
    x_train.values,
    y_train.values,
    validation_data=(x_test.values, y_test.values),
    epochs=100,
    batch_size=1,
    verbose=1
)
#MonotonicGD


Epoch 1/100
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 24ms/step - accuracy: 1.0000 - loss: 0.0781 - val_accuracy: 0.7500 - val_loss: 0.2189
Epoch 2/100
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 1.0000 - loss: 0.1254 - val_accuracy: 1.0000 - val_loss: 0.1770
Epoch 3/100
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.9872 - loss: 0.1451 - val_accuracy: 1.0000 - val_loss: 0.1627
Epoch 4/100
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 1.0000 - loss: 0.1470 - val_accuracy: 0.7500 - val_loss: 0.2759
Epoch 5/100
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.9725 - loss: 0.1030 - val_accuracy: 0.7500 - val_loss: 0.2324
Epoch 6/100
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.9872 - loss: 0.0918 - val_accuracy: 1.0000 - val_loss: 0.1186
Epoch 7/100
[1m12/12[0m [32m━