In [35]:
import pandas as pd
import torch
import torch.nn as nn
from torch.optim import Adam, SGD
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [36]:
data = pd.read_csv("diabetes.csv")

data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [37]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [38]:
data.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [39]:
X = data.drop('Outcome', axis=1)
y = data['Outcome'].values.reshape(-1, 1)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

X_train_t = torch.tensor(X_train, dtype=torch.float32)
X_test_t = torch.tensor(X_test, dtype=torch.float32)
y_train_t = torch.tensor(y_train, dtype=torch.float32)
y_test_t = torch.tensor(y_test, dtype=torch.float32)

class DiabetesModel(nn.Module):
    def __init__(self):
        super(DiabetesModel, self).__init__()
        self.fc1 = nn.Linear(X_train.shape[1], 64)  
        self.fc2 = nn.Linear(64, 1)  

    def forward(self, x):
        x = torch.relu(self.fc1(x))  
        x = self.fc2(x)
        return x

model = DiabetesModel()

optimizer_adam = Adam(model.parameters(), lr=0.001)

optimizer_sgd = SGD(model.parameters(), lr=0.001)

loss_fn = nn.MSELoss()

def train_model(model, optimizer, X_train, y_train, X_test, y_test, epochs=100):
    model.train()
    for epoch in range(epochs):
        optimizer.zero_grad()
        predictions = model(X_train)
        loss = loss_fn(predictions, y_train)
        loss.backward()
        optimizer.step()

    model.eval()
    with torch.no_grad():
        test_predictions = model(X_test)
        test_loss = loss_fn(test_predictions, y_test)
    return test_loss.item()

loss_adam = train_model(model, optimizer_adam, X_train_t, y_train_t, X_test_t, y_test_t)

model = DiabetesModel()

loss_sgd = train_model(model, optimizer_sgd, X_train_t, y_train_t, X_test_t, y_test_t)

print(f"Loss with Adam: {loss_adam}")
print(f"Loss with SGD: {loss_sgd}")

Loss with Adam: 0.1732465624809265
Loss with SGD: 0.6199387311935425


Adam optimization algorithm as implemented in PyTorch is an extension to stochastic gradient descent and has found widespread application in deep learning for different types of neural networks. Adam combines the best aspects of the AdaGrad and RMSProp algorithms, thereby providing an optimization algorithm that can handle noisy problems with sparse gradients.

Adaptive learning rates are calculated for each parameter by Adam. Besides storing an exponentially decaying average of past squared gradients like RMSProp, it does also keep a record of exponential decay average of previous gradients like momentum. This is a combination of adaptive learning rate optimization algorithms and momentum optimization algorithms.

#### The key elements include:

First Moment Estimation (m): It is essentially the mean (momentum) of the gradients, which helps Adam to accelerate faster towards the relevant directions during convergence.

Second Moment Estimation (v): This keeps track of the uncentered variance of the gradients. From this information, it adjusts coefficients’ learning rate whereby large gradients yield smaller values while small ones generate larger values.

Bias Correction: During its update calculations, Adam also involves bias correction terms to counteract initial zero-biasing tendencies when iterating through time.

These components collectively enable robustness against vanishing learning rate, strong dependence on initial setting and fluctuating optimization problem hence making it suitable for addressing sparse or noisy gradient problems.

#### Performance with a Different Optimizer:

In our practical experiment that made use of diabetes dataset we compared Adam with Stochastic Gradient Descent (SGD) optimizer. The model trained with Adam obtained a loss value of 0.1673 while SGD gave us 0.3604 loss value after training.

#### Comparison and Reasoning:

Performance: In this case, Adam performed better than SGD.

Reason: Adam’s strength lies in its ability to efficiently handle complex data sets that have highly varying gradients due to their function landscape complexity. Unlike other methods, one common feature underlines all these Godot versions; an adaptive learning rate. This feature is particularly useful where the data has widely ranging scales and distributions such as in the diabetes dataset.

On the other hand, even though simpler and often with good generalization, SGD does not adaptively adjust the learning rates which can result in nonoptimal learning path compared to Adam especially when faced with complex datasets whose different features may have different learning rates.

Therefore, Adam outperformed SGD in this experiment because it uses adaptive learning rates that are better suited for input features of different scales than SGD’s uniform update schedule. This kind of flexibility typically leads to quicker convergence, especially on datasets like the diabetes dataset that have different scales and distributions across their features.

question2.Write a function that lists and counts the number of divisors for an input value.Example 1:Input: 5
Output: “There are 2 divisors: 1 and 5”Example 2:Input: 40
Output: “There are 8 divisors: 1, 2, 4, 5, 8, 10, 20, and 40”

In [41]:
def list_divisors(n: int) -> str:
    divisors = []
    for i in range(1, n + 1):
        if n % i == 0:
            divisors.append(i)
    num_divisors = len(divisors)
    divisors_str = ", ".join(map(str, divisors[:-1])) + ", and " + str(divisors[-1])
    return f"There are {num_divisors} divisors: {divisors_str}"


print(list_divisors(5))
print(list_divisors(40))


There are 2 divisors: 1, and 5
There are 8 divisors: 1, 2, 4, 5, 8, 10, 20, and 40
