# Neural Networks Hyperparameter Tuning with Pytorch

Hyperparameters are a set of parameters whose value controls the learning process of the model. The performance of models can be greatly improved by tuning their hyperparameters. Tuning hyperparameters means you are trying to find out the set of optimal parameters, giving you better performance than the default hyperparameters of the model. 

### 1. Importing relevant libraries

In [1]:
# !pip install matplotlib==3.8.2
# !pip install numpy==1.26.2
# !pip install pandas==2.1.4
# !pip install scikit_learn==1.4.2
# !pip install seaborn==0.13.2
# !pip install torch==2.2.2
# !pip install torchvision==0.17.2

In [2]:
from torch import nn
import pandas as pd
import matplotlib as plt

In [3]:
df = pd.read_csv("../Input/data.csv")

In [4]:
df.head(2)

Unnamed: 0,year,customer_id,phone_no,gender,age,no_of_days_subscribed,multi_screen,mail_subscribed,weekly_mins_watched,minimum_daily_mins,maximum_daily_mins,weekly_max_night_mins,videos_watched,maximum_days_inactive,customer_support_calls,churn
0,2015,100198,409-8743,Female,36,62,no,no,148.35,12.2,16.81,82,1,4.0,1,0.0
1,2015,100643,340-5930,Female,39,149,no,no,294.45,7.7,33.37,87,3,3.0,2,0.0


In [5]:
df.shape

(2000, 16)

#### 2. Data Cleaning

In [6]:
#Dropping the columns which doesnot make any sense in prediction
data = df.drop(["customer_id", "phone_no", "year"], axis=1)

In [7]:
data.tail(2)

Unnamed: 0,gender,age,no_of_days_subscribed,multi_screen,mail_subscribed,weekly_mins_watched,minimum_daily_mins,maximum_daily_mins,weekly_max_night_mins,videos_watched,maximum_days_inactive,customer_support_calls,churn
1998,Male,40,94,no,no,178.05,10.4,20.18,100,6,,3,0.0
1999,Male,37,73,no,no,326.7,10.3,37.03,89,6,3.0,1,1.0


In [8]:
data.shape

(2000, 13)

In [9]:
#checking null values
data.isna().sum()

gender                    24
age                        0
no_of_days_subscribed      0
multi_screen               0
mail_subscribed            0
weekly_mins_watched        0
minimum_daily_mins         0
maximum_daily_mins         0
weekly_max_night_mins      0
videos_watched             0
maximum_days_inactive     28
customer_support_calls     0
churn                     35
dtype: int64

In [10]:
#dropping null values
data = data.dropna(axis=0)

In [11]:
#shape of data after dropping null values
data.shape

(1918, 13)

### 3.Data Preprocessing

In [12]:
print(data["gender"].unique())
print(data["multi_screen"].unique())
print(data["mail_subscribed"].unique())

['Female' 'Male']
['no' 'yes']
['no' 'yes']


In [13]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [14]:
#label encoding categorical features
data["gender"] = le.fit_transform(data["gender"])
data["multi_screen"] = le.fit_transform(data["multi_screen"])
data["mail_subscribed"] = le.fit_transform(data["mail_subscribed"])

In [15]:
data.head(2)

Unnamed: 0,gender,age,no_of_days_subscribed,multi_screen,mail_subscribed,weekly_mins_watched,minimum_daily_mins,maximum_daily_mins,weekly_max_night_mins,videos_watched,maximum_days_inactive,customer_support_calls,churn
0,0,36,62,0,0,148.35,12.2,16.81,82,1,4.0,1,0.0
1,0,39,149,0,0,294.45,7.7,33.37,87,3,3.0,2,0.0


In [16]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

In [17]:
#dropping categorical columns and keeping numerical columns only
data_num = data.drop(["gender", "multi_screen", "mail_subscribed"], axis=1)

In [18]:
#scaling numericals columns
cols = data_num.columns
data_num = scaler.fit_transform(data_num)

In [19]:
data_num[0]

array([0.28125   , 0.25206612, 0.28192702, 0.61      , 0.28185781,
       0.30075188, 0.05263158, 0.66666667, 0.11111111, 0.        ])

In [20]:
#list of numerical columns
cols = list(cols)
cols

['age',
 'no_of_days_subscribed',
 'weekly_mins_watched',
 'minimum_daily_mins',
 'maximum_daily_mins',
 'weekly_max_night_mins',
 'videos_watched',
 'maximum_days_inactive',
 'customer_support_calls',
 'churn']

In [21]:
data[cols] = data_num
data.head(2)

Unnamed: 0,gender,age,no_of_days_subscribed,multi_screen,mail_subscribed,weekly_mins_watched,minimum_daily_mins,maximum_daily_mins,weekly_max_night_mins,videos_watched,maximum_days_inactive,customer_support_calls,churn
0,0,0.28125,0.252066,0,0,0.281927,0.61,0.281858,0.300752,0.052632,0.666667,0.111111,0.0
1,0,0.328125,0.61157,0,0,0.559578,0.385,0.559524,0.338346,0.157895,0.5,0.222222,0.0


In [22]:
data['churn'].value_counts()

churn
0.0    1665
1.0     253
Name: count, dtype: int64

*There is a clear class imbalance here,algorithms may struggle to properly learn and make accurate predictions for the minority class, leading to biased results and reduced performance*
- Synthetic Minority Oversampling Technique(SMOTE) method is used to address imbalance datasets

In [23]:
#Handling Class Imbalance
import imblearn #imbalanced learn
from imblearn.over_sampling import SMOTE

smote = SMOTE()

In [24]:
# fit predictor and target variable
x_smote, y_smote = smote.fit_resample(data.iloc[:,0:-1], data['churn'])

*fit_resample method fits the SMOTE model and applies it to resample the dataset & finally we get equilized data*

In [25]:
print('Original dataset shape', len(data))
print('Resampled dataset shape', len(y_smote))

Original dataset shape 1918
Resampled dataset shape 3330


In [26]:
#Checking after class balancing
y_smote.groupby(y_smote).size()

churn
0.0    1665
1.0    1665
Name: churn, dtype: int64

In [27]:
# split a dataset into train and test sets
from sklearn.model_selection import train_test_split

In [28]:
X_train, X_test, y_train, y_test = train_test_split(x_smote, y_smote, test_size=0.2, random_state=42,
                                                    stratify=y_smote)

In [29]:
print((y_test==0).sum())
print((y_test==1).sum())

333
333


In [30]:
print((y_train==0).sum())
print((y_train==1).sum())

1332
1332


### 4. Building Sequential Neural Network in Pytorch

In [31]:
X_train.shape

(2664, 12)

In [32]:
# Hyperparameters for our network
input_size = X_train.shape[1]
hidden_sizes = [128, 64]
output_size = 2


In [33]:
import torch
from torch import nn
# Build a feed-forward network
model = nn.Sequential(nn.Linear(input_size, hidden_sizes[0]), #12x128
                      nn.ReLU(),
                      nn.Linear(hidden_sizes[0], hidden_sizes[1]), #128x64
                      nn.ReLU(),
                      nn.Linear(hidden_sizes[1], output_size), #64x2
                      nn.Softmax(dim=1))
print(model)

Sequential(
  (0): Linear(in_features=12, out_features=128, bias=True)
  (1): ReLU()
  (2): Linear(in_features=128, out_features=64, bias=True)
  (3): ReLU()
  (4): Linear(in_features=64, out_features=2, bias=True)
  (5): Softmax(dim=1)
)


In [34]:
 #Negative Log Likelihood Loss commonly used for classification tasks
criterion = nn.NLLLoss()

In [35]:
from torch import optim
optimizer = optim.SGD(model.parameters(), lr=0.01)

In [36]:
X_train

Unnamed: 0,gender,age,no_of_days_subscribed,multi_screen,mail_subscribed,weekly_mins_watched,minimum_daily_mins,maximum_daily_mins,weekly_max_night_mins,videos_watched,maximum_days_inactive,customer_support_calls
1108,0,0.234375,0.305785,0,0,0.474059,0.305000,0.474011,0.285714,0.263158,0.333333,0.111111
1465,0,0.296875,0.264463,0,0,0.608324,0.135000,0.608317,0.609023,0.210526,0.166667,0.111111
296,0,0.093750,0.227273,0,1,0.346921,0.635000,0.346915,0.421053,0.157895,0.666667,0.111111
1100,0,0.390625,0.644628,0,1,0.549031,0.340000,0.548960,0.729323,0.263158,0.333333,0.333333
1489,1,0.218750,0.466942,1,1,0.497434,0.615000,0.497485,0.368421,0.526316,0.666667,0.222222
...,...,...,...,...,...,...,...,...,...,...,...,...
966,0,0.312500,0.268595,0,0,0.584664,0.460000,0.584675,0.398496,0.315789,0.500000,0.222222
2114,1,0.195527,0.535407,1,1,0.600423,0.735149,0.600439,0.463334,0.201995,0.779301,0.258244
2595,1,0.377156,0.409229,0,1,0.255055,0.543448,0.254975,0.291420,0.255896,0.500000,0.666667
1043,1,0.203125,0.636364,0,0,0.748005,0.415000,0.747988,0.323308,0.315789,0.500000,0.222222


In [37]:
y_train

1108    0.0
1465    0.0
296     0.0
1100    0.0
1489    0.0
       ... 
966     0.0
2114    1.0
2595    1.0
1043    0.0
2496    1.0
Name: churn, Length: 2664, dtype: float64

*converting data into tensor to pass it to pytorch sequential layer*

In [38]:
import torch.utils.data as Data
from torch import Tensor
import numpy as np

X_train = Tensor(X_train.values)
y_train = Tensor(np.array(y_train))

In [39]:
X_train

tensor([[0.0000, 0.2344, 0.3058,  ..., 0.2632, 0.3333, 0.1111],
        [0.0000, 0.2969, 0.2645,  ..., 0.2105, 0.1667, 0.1111],
        [0.0000, 0.0938, 0.2273,  ..., 0.1579, 0.6667, 0.1111],
        ...,
        [1.0000, 0.3772, 0.4092,  ..., 0.2559, 0.5000, 0.6667],
        [1.0000, 0.2031, 0.6364,  ..., 0.3158, 0.5000, 0.2222],
        [1.0000, 0.2923, 0.3645,  ..., 0.2976, 0.6106, 0.0748]])

In [40]:
y_train

tensor([0., 0., 0.,  ..., 1., 0., 1.])

In [41]:
BATCH_SIZE = 64 #number of samples fed into the neural network during each training iteration. 
#TensorDataset is a PyTorch class that allows you to create a dataset from tensors
torch_dataset = Data.TensorDataset(X_train, y_train) 

#loading data for the model
loader = Data.DataLoader(
    dataset=torch_dataset, 
    batch_size=BATCH_SIZE, 
    shuffle=True, num_workers=2,)
#Shuffles the data within the dataset before creating batches in each epoch. 
#Shuffling helps the model learn from different data combinations and improve generalization.
#number of worker threads to use for data loading. More is the number, more is the speed when working with large datasets or performing data augmentation.

In [42]:
import torch
from torch.autograd import Variable

epochs = 100 #No. of iterations over the intire dataset
for e in range(epochs):
    running_loss = 0
    for step, (batch_x, batch_y) in enumerate(loader):
        #Batch of training features & training labels
        b_x = batch_x  # No need for Variable in recent PyTorch versions
        b_y = batch_y.type(torch.LongTensor)  # Convert batch_y to LongTensor if needed
        
        # Training pass
        # Sets the gradients of the model's parameters to zero before each training iteration. This ensures gradients accumulate properly during backpropagation.
        optimizer.zero_grad()
        #Passes the batches of data via neural network to get predictions
        output = model(b_x)
        #Calculates the loss
        loss = criterion(output, b_y)
        #Performs backpropagation to calculate the gradients of the loss function with respect to the model's parameters.
        loss.backward()
        #Updates the parameters
        optimizer.step()
        # Accumulates the training loss for each batch within an epoch.
        running_loss += loss.item()
        
    else:
        #Average loss each epoch
        print(f"Training loss: {running_loss/len(X_train)}")

Training loss: -0.007886192506856031
Training loss: -0.007895764701806748
Training loss: -0.007903398436916483
Training loss: -0.007913328696031112
Training loss: -0.007921786839002604
Training loss: -0.007929170252503576
Training loss: -0.007936375165307845
Training loss: -0.007947716067682157
Training loss: -0.007954453685888657
Training loss: -0.007963289459188422
Training loss: -0.007971852939497601
Training loss: -0.00798415205813385
Training loss: -0.007990344452696878
Training loss: -0.008002885267720208
Training loss: -0.00801195593567582
Training loss: -0.008023128830813788
Training loss: -0.008034409636312776
Training loss: -0.008045327186852962
Training loss: -0.00805522372817492
Training loss: -0.008072458845269572
Training loss: -0.008082065608229366
Training loss: -0.008094349758582073
Training loss: -0.008110308734414814
Training loss: -0.008127732498867734
Training loss: -0.00814504401238115
Training loss: -0.008158082852850447
Training loss: -0.008176744275085919
Train

In [43]:
X_test_tensor = Tensor(X_test.values)
y_test = Tensor(np.array(y_test))
z = model(X_test_tensor)

In [44]:
from sklearn.metrics import accuracy_score

yhat = list(z.argmax(1))
y_test = list(y_test)

print("Accuracy Score of Test Data ",accuracy_score(y_test,yhat) * 100)



Accuracy Score of Test Data  75.22522522522522


- ***Now lets try to optimise it by Hyperparameter Tuning***

### 5. Hyperparameter Tuning of  Neural Network in Pytorch

#### a) Dropout
- Dropout is a regularization technique used in neural networks to prevent overfitting & improve generalization. During each training iteration, dropout randomly selects neurons and temporarily removes them (sets their outputs to zero).

In [45]:
# Hyperparameters for our network
input_size = X_train.shape[1]
hidden_sizes = [128, 64]
output_size = 2

In [46]:
model_dropout = nn.Sequential(nn.Linear(input_size, hidden_sizes[0]), #12x128
                      nn.Dropout(0.2),# During training, 20% of the neurons will be randomly set to 0.
                      nn.ReLU(),
                      nn.Linear(hidden_sizes[0], hidden_sizes[1]),#128x64
                      nn.Dropout(0.1),
                      nn.ReLU(),
                      nn.Linear(hidden_sizes[1], output_size), #64x2
                      nn.Softmax(dim=1))
print(model_dropout)

Sequential(
  (0): Linear(in_features=12, out_features=128, bias=True)
  (1): Dropout(p=0.2, inplace=False)
  (2): ReLU()
  (3): Linear(in_features=128, out_features=64, bias=True)
  (4): Dropout(p=0.1, inplace=False)
  (5): ReLU()
  (6): Linear(in_features=64, out_features=2, bias=True)
  (7): Softmax(dim=1)
)


In [47]:
 # Define the loss
criterion = nn.NLLLoss()
# Optimizers require the parameters to optimize and a learning rate
optimizer = optim.Adam(model_dropout.parameters(), lr=0.01)

In [48]:
epochs = 100 #No. of iterations over the entire dataset
for e in range(epochs):
    running_loss = 0
    for step, (batch_x, batch_y) in enumerate(loader):
        #Batch of training features & training labels
        b_x = batch_x  # No need for Variable in recent PyTorch versions
        b_y = batch_y.type(torch.LongTensor)  # Convert batch_y to LongTensor if needed
        
        # Training pass
        # Sets the gradients of the model's parameters to zero before each training iteration. This ensures gradients accumulate properly during backpropagation.
        optimizer.zero_grad()
        #Passes the batches of data via neural network to get predictions
        output = model_dropout(b_x)
        #Calculates the loss
        loss = criterion(output, b_y)
        #Performs backpropagation to calculate the gradients of the loss function with respect to the model's parameters.
        loss.backward()
        #Updates the parameters
        optimizer.step()
        # Accumulates the training loss for each batch within an epoch.
        running_loss += loss.item()
        
    else:
        #Average loss each epoch
        print(f"Training loss: {running_loss/len(X_train)}")

Training loss: -0.010128693213244458
Training loss: -0.011317282355762459
Training loss: -0.01159452499301584
Training loss: -0.011921010575852953
Training loss: -0.012008330373613684
Training loss: -0.012069616545070041
Training loss: -0.01221499292252658
Training loss: -0.012578861625702889
Training loss: -0.012572362974241332
Training loss: -0.01224568289321464
Training loss: -0.012570607322114366
Training loss: -0.012724863024087282
Training loss: -0.012547322669186749
Training loss: -0.012550937923583182
Training loss: -0.012772660199049357
Training loss: -0.01283361794085832
Training loss: -0.012770193519892994
Training loss: -0.012754591713259529
Training loss: -0.012906977431373196
Training loss: -0.012912501354475279
Training loss: -0.012743572416427257
Training loss: -0.012625824917365122
Training loss: -0.012434875173074705
Training loss: -0.012747671018849622
Training loss: -0.012821195496095193
Training loss: -0.012874188254008422
Training loss: -0.01304128692225293
Traini

In [49]:
from sklearn.metrics import accuracy_score

X_test_tensor = Tensor(X_test.values)
y_test = Tensor(np.array(y_test))
z = model_dropout(X_test_tensor)

yhat = list(z.argmax(1))
y_test = list(y_test)

print("Accuracy Score of Test Data ",accuracy_score(y_test,yhat) * 100)


Accuracy Score of Test Data  79.87987987987988


#### b) Regularization
- Regularization introduces a penalty(weight_decay) to the loss function during training to prevent overfitting.

In [50]:
# Hyperparameters for our network
input_size = X_train.shape[1]
hidden_sizes = [128, 64]
output_size = 2

In [51]:
model_reg = nn.Sequential(nn.Linear(input_size, hidden_sizes[0]), #12x128
                      nn.Dropout(0.2),# During training, 20% of the neurons will be randomly set to 0.
                      nn.ReLU(),
                      nn.Linear(hidden_sizes[0], hidden_sizes[1]),#128x64
                      nn.Dropout(0.1),
                      nn.ReLU(),
                      nn.Linear(hidden_sizes[1], output_size), #64x2
                      nn.Softmax(dim=1))
print(model_reg)

Sequential(
  (0): Linear(in_features=12, out_features=128, bias=True)
  (1): Dropout(p=0.2, inplace=False)
  (2): ReLU()
  (3): Linear(in_features=128, out_features=64, bias=True)
  (4): Dropout(p=0.1, inplace=False)
  (5): ReLU()
  (6): Linear(in_features=64, out_features=2, bias=True)
  (7): Softmax(dim=1)
)


In [52]:
 # Define the loss
criterion = nn.NLLLoss()
# Optimizers require the parameters to optimize and a learning rate
# Regularization
from torch import optim
# Optimizers require the parameters to optimize and a learning rate
# weight_decay adds a small penalty to the loss function to prevent the model from overfitting by discouraging large weights. 
optimizer = optim.Adam(model_reg.parameters(), lr=0.01, weight_decay=1e-5)


In [53]:
epochs = 100 #No. of iterations over the entire dataset
for e in range(epochs):
    running_loss = 0
    for step, (batch_x, batch_y) in enumerate(loader):
        #Batch of training features & training labels
        b_x = batch_x  # No need for Variable in recent PyTorch versions
        b_y = batch_y.type(torch.LongTensor)  # Convert batch_y to LongTensor if needed
        
        # Training pass
        # Sets the gradients of the model's parameters to zero before each training iteration. This ensures gradients accumulate properly during backpropagation.
        optimizer.zero_grad()
        #Passes the batches of data via neural network to get predictions
        output = model_reg(b_x)
        #Calculates the loss
        loss = criterion(output, b_y)
        #Performs backpropagation to calculate the gradients of the loss function with respect to the model's parameters.
        loss.backward()
        #Updates the parameters
        optimizer.step()
        # Accumulates the training loss for each batch within an epoch.
        running_loss += loss.item()
        
    else:
        #Average loss each epoch
        print(f"Training loss: {running_loss/len(X_train)}")

Training loss: -0.01036391161136083
Training loss: -0.011525721767464199
Training loss: -0.012193219327890838
Training loss: -0.012405028259074007
Training loss: -0.012133402822611926
Training loss: -0.012589739272007355
Training loss: -0.01269965911949719
Training loss: -0.012369676954574414
Training loss: -0.01248779246309498
Training loss: -0.012702682086297342
Training loss: -0.012598239444755577
Training loss: -0.012630803337147285
Training loss: -0.012803937244164693
Training loss: -0.012772959161986102
Training loss: -0.01284261235782692
Training loss: -0.012834543088176945
Training loss: -0.01280310532978705
Training loss: -0.012869143830584334
Training loss: -0.012852929174542069
Training loss: -0.012733954813208308
Training loss: -0.012714446150325798
Training loss: -0.012784814601903921
Training loss: -0.012760059208841296
Training loss: -0.012674218250645531
Training loss: -0.012891642704561309
Training loss: -0.012803253670175513
Training loss: -0.012856120484190303
Traini

In [54]:
from sklearn.metrics import accuracy_score

X_test_tensor = Tensor(X_test.values)
y_test = Tensor(np.array(y_test))
z = model_reg(X_test_tensor)

yhat = list(z.argmax(1))
y_test = list(y_test)

print("Accuracy Score of Test Data ",accuracy_score(y_test,yhat) * 100)


Accuracy Score of Test Data  81.08108108108108


### c) Early Stopping
- It involves monitoring the model's performance during training and it stops the training process when the performance starts to degrade.

In [55]:
# Hyperparameters for our network
input_size = X_train.shape[1]
hidden_sizes = [128, 64]
output_size = 2

In [56]:
model_early_stp = nn.Sequential(nn.Linear(input_size, hidden_sizes[0]), #12x128
                      nn.Dropout(0.2),# During training, 20% of the neurons will be randomly set to 0.
                      nn.ReLU(),
                      nn.Linear(hidden_sizes[0], hidden_sizes[1]),#128x64
                      nn.Dropout(0.1),
                      nn.ReLU(),
                      nn.Linear(hidden_sizes[1], output_size), #64x2
                      nn.Softmax(dim=1))
print(model_early_stp)

Sequential(
  (0): Linear(in_features=12, out_features=128, bias=True)
  (1): Dropout(p=0.2, inplace=False)
  (2): ReLU()
  (3): Linear(in_features=128, out_features=64, bias=True)
  (4): Dropout(p=0.1, inplace=False)
  (5): ReLU()
  (6): Linear(in_features=64, out_features=2, bias=True)
  (7): Softmax(dim=1)
)


In [57]:
 # Define the loss
criterion = nn.NLLLoss()
# Optimizers require the parameters to optimize and a learning rate
# Regularization
from torch import optim
# Optimizers require the parameters to optimize and a learning rate
optimizer = optim.Adam(model_early_stp.parameters(), lr=0.01)

In [58]:
epochs = 100
epochs_no_improve = 0
early_stop = False
min_loss = np.Inf
iter = 0

In [59]:
import torch
from torch.autograd import Variable
    
    
for e in range(epochs):
    running_loss = 0
    if early_stop:
        print("Stopped")
        break
    else:
        for step, (batch_x, batch_y) in enumerate(loader):#ensures that the model is trained on each batch of data in sequence.
            #number of iterations = len(X_train) // batch_size
            #Batch of training features & training labels
            b_x = batch_x  # 64 batch size
            b_y = batch_y.type(torch.LongTensor)  # 64 batch size
            
            # Training pass
            # Sets the gradients of the model's parameters to zero before each training iteration. This ensures gradients accumulate properly during backpropagation.
            optimizer.zero_grad()
            #Passes the batches of data via neural network to get predictions
            output = model_dropout(b_x)
            #Calculates the loss
            loss = criterion(output, b_y)
            #Performs backpropagation to calculate the gradients of the loss function with respect to the model's parameters.
            loss.backward()
            #Updates the parameters
            optimizer.step()
            # Accumulates the training loss for each batch within an epoch.
            running_loss += loss.item()
    
    
            if abs(running_loss) < abs(min_loss):
                #epochs_no_improve is initialized to 0 at the start of training. 
                #This variable keeps track of how many consecutive epochs have passed without improvement in the loss.
                epochs_no_improve = 0
                min_loss = running_loss
            else:
                epochs_no_improve +=1
                iter += 1
    
            #e > 5 sets threshold for when to start monitoring
            if e > 5 and epochs_no_improve ==  epochs:
                print('Early stopping!')
                #Checks if the epochs_no_improve counter  is equal to the total number of training epochs (epochs). 
                #If this is true, it suggests no improvement for all epochs.
                early_stop = True
                break
            else:
                continue
        else:
             print(f"Training loss: {running_loss/len(X_train)}")

Training loss: -0.013012354505491687
Training loss: -0.012883150854984203
Training loss: -0.012884700128266046
Training loss: -0.01285994117771899
Training loss: -0.012886743086415369
Training loss: -0.01291240465354633
Training loss: -0.012960425345926313
Training loss: -0.01292462987047774
Training loss: -0.012828754031801367
Training loss: -0.012892814459385457
Training loss: -0.012816585309512622
Training loss: -0.012878406065720337
Training loss: -0.012843759366878876
Training loss: -0.012878607947368163
Training loss: -0.01292590508321384
Training loss: -0.012791756059493389
Training loss: -0.012829590063016335
Training loss: -0.012978931827409132
Training loss: -0.01283266182776328
Early stopping!
Stopped


In [60]:
from sklearn.metrics import accuracy_score

X_test_tensor = Tensor(X_test.values)
y_test = Tensor(np.array(y_test))
z = model_early_stp(X_test_tensor)

yhat = list(z.argmax(1))
y_test = list(y_test)

print("Accuracy Score of Test Data ",accuracy_score(y_test,yhat) * 100)


Accuracy Score of Test Data  50.0


### Checkpoint (Loading and saving model)

In [61]:
# Hyperparameters for our network
input_size = X_train.shape[1]
hidden_sizes = [128, 64]
output_size = 2

In [62]:
model_chk = nn.Sequential(nn.Linear(input_size, hidden_sizes[0]), #12x128
                      nn.Dropout(0.2),# During training, 20% of the neurons will be randomly set to 0.
                      nn.ReLU(),
                      nn.Linear(hidden_sizes[0], hidden_sizes[1]),#128x64
                      nn.Dropout(0.1),
                      nn.ReLU(),
                      nn.Linear(hidden_sizes[1], output_size), #64x2
                      nn.Softmax(dim=1))
print(model_chk)

Sequential(
  (0): Linear(in_features=12, out_features=128, bias=True)
  (1): Dropout(p=0.2, inplace=False)
  (2): ReLU()
  (3): Linear(in_features=128, out_features=64, bias=True)
  (4): Dropout(p=0.1, inplace=False)
  (5): ReLU()
  (6): Linear(in_features=64, out_features=2, bias=True)
  (7): Softmax(dim=1)
)


In [63]:
 # Define the loss
criterion = nn.NLLLoss()
# Optimizers require the parameters to optimize and a learning rate
# Regularization
from torch import optim
# Optimizers require the parameters to optimize and a learning rate
# add l2 regularization to optimzer by just adding in a weight_decay 
optimizer = optim.Adam(model_chk.parameters(), lr=0.01, weight_decay=1e-5)

In [64]:
epochs = 100
path = "../model"

In [65]:
import torch
from torch.autograd import Variable


for e in range(epochs):
    running_loss = 0
    for step, (batch_x, batch_y) in enumerate(loader):

        b_x = batch_x  # 64 batch size
        b_y = batch_y.type(torch.LongTensor)  # 64 batch size
        
        # Training pass
        optimizer.zero_grad()
        
        output = model_chk(b_x)
        loss = criterion(output, b_y)
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
        #saves the model
        torch.save({
            'epoch': e,
            'model_state_dict': model_chk.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': running_loss,
            }, path+"model_"+str(e)+".pt")
    else:
        print(f"Training loss: {running_loss/len(X_train)}")

Training loss: -0.010177241498464579
Training loss: -0.011243303952453373
Training loss: -0.011435661968347189
Training loss: -0.01191196304273319
Training loss: -0.012231950242598136
Training loss: -0.0122596951084094
Training loss: -0.012414836624005178
Training loss: -0.01247640316550796
Training loss: -0.012614822378745666
Training loss: -0.01254699296421475
Training loss: -0.012681254820601718
Training loss: -0.01262057702373098
Training loss: -0.012788266301513076
Training loss: -0.012804617842396459
Training loss: -0.012511991680384398
Training loss: -0.01277832995663892
Training loss: -0.012733489230230407
Training loss: -0.012753739885918729
Training loss: -0.012797027669690392
Training loss: -0.012704536743887194
Training loss: -0.012825142334889364
Training loss: -0.012743981795626002
Training loss: -0.012567154123439445
Training loss: -0.012860982603317982
Training loss: -0.012912734671755953
Training loss: -0.012496505010951389
Training loss: -0.012843794829852588
Training

In [66]:
model_load = model_chk
optimizer = optim.Adam(model_reg.parameters(), lr=0.01, weight_decay=1e-5)

checkpoint = torch.load(path+"model_2.pt")
model_load.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
epoch = checkpoint['epoch']
loss = checkpoint['loss']

model_load.eval()


Sequential(
  (0): Linear(in_features=12, out_features=128, bias=True)
  (1): Dropout(p=0.2, inplace=False)
  (2): ReLU()
  (3): Linear(in_features=128, out_features=64, bias=True)
  (4): Dropout(p=0.1, inplace=False)
  (5): ReLU()
  (6): Linear(in_features=64, out_features=2, bias=True)
  (7): Softmax(dim=1)
)

In [67]:
#Accuracy of the test data
from sklearn.metrics import accuracy_score

X_test_tensor = Tensor(X_test.values)
y_test = Tensor(np.array(y_test))
z = model_load(X_test_tensor)

yhat = list(z.argmax(1))
y_test = list(y_test)

print("Accuracy Score of Test Data ",accuracy_score(y_test,yhat) * 100)


Accuracy Score of Test Data  70.57057057057057
