# PyTorch with a Pandas DataFrame input (tabular iris data)



A DataLoader is a PyTorch class that handles batching and shuffling, which makes training and inference easy. Whenever a Pandas DataFrame is used as input, it must first be wrapped in a Dataset object, i.e. a class that implements __init__, __len__, and __getitem__; that Dataset is then handed to a DataLoader. Since the __getitem__ below only returns the row at the given index as a Pandas object, the data must still be converted to a torch.tensor(). This can be done either in the class itself or when creating the object (see below).



'''

class PandasPytorchDataLoader(Dataset):
    """Minimal Dataset wrapper around a Pandas object.

    Despite its name this subclasses ``Dataset`` (not ``DataLoader``): it only
    supplies the length and indexed access that a ``DataLoader`` consumes.
    """

    def __init__(self, data):
        """Store ``data`` as-is; no copy or conversion is performed."""
        super().__init__()
        self.data = data

    def __len__(self):
        """Return the number of rows, i.e. ``len(self.data)``."""
        row_count = len(self.data)
        return row_count

    def __getitem__(self, idx):
        """Return ``self.data.iloc[idx]`` unchanged (still a Pandas object, not a tensor)."""
        selected = self.data.iloc[idx]
        return selected
'''

Then convert your scaled features and label-encoded classes to tensors and feed them to TensorDataset:

# Preview of the pipeline; the tensor names match the ones created in the code cells below.
model = IrisClassifier(in_features=train_X.shape[1], out_features=64)

train_dataset = TensorDataset(train_X_tensor, train_y_tensor)

train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)


In [1]:
# Import Packages 
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.optim as optim
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from torch import nn
from torch.nn import Linear, Embedding, CrossEntropyLoss
from torch.nn.functional import cross_entropy
from torch.utils.data import Dataset, DataLoader, TensorDataset

In [3]:
# Seed NumPy's global RNG so the run is repeatable
# (presumably this is what makes the later df.sample shuffle reproducible — confirm)
np.random.seed(99)
# Load the iris dataset through seaborn and preview the first rows
df = sns.load_dataset('iris')
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [None]:
# Shuffle dataset: sample 100% of the rows (frac=1) and renumber the index (ignore_index=True)
df = df.sample(frac=1, ignore_index=True)
df.head()

In [None]:

# Determine unique labels
# Hand-written species -> class id mapping.
# NOTE(review): no later visible cell reads this dict; the labels actually used come from
# LabelEncoder — confirm the two agree or drop this mapping.
species_class_labels = {'virginica': 2, 'versicolor': 1, 'setosa': 0}
# Displayed as the cell output: the distinct values of the 'species' column
df['species'].unique()

In [None]:
# Standardize features: fit StandardScaler on every column except the last (the species label)
# NOTE(review): the scaler is fit on the full dataset before the train/test split further down,
# so test-set statistics leak into the scaling — consider fitting on the training rows only.
scaler = StandardScaler()
features = scaler.fit_transform(df.iloc[:,:-1])
features.shape

In [None]:
# Label Encode the species category: turn the last column's species strings into integer class ids
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(df.iloc[:,-1])
labels.shape
labels
# Only this final tuple is shown as the cell output: type, dtype and shape of both arrays
type(features), features.dtype, features.shape, type(labels), labels.dtype, labels.shape

In [None]:
# Compute the sizes for an 80/20 train/test split (no separate cross-validation set is created)
df_size = len(df)
training_size = int(df_size * .8)
# The test set takes whatever rows are left over
testing_size = int(df_size - training_size)
training_size, testing_size

In [None]:
# Split the data into training and testing arrays.
# The first `training_size` rows go to training and the remainder to testing, so the
# array lengths equal the `training_size` / `testing_size` values computed earlier
# (the previous `training_size+1` slice put one extra row in the training set).
train_X, train_y = features[:training_size], labels[:training_size]
test_X, test_y = features[training_size:], labels[training_size:]

train_test_data = [train_X, train_y, test_X, test_y]

# Sanity check: print the type and shape of each split
for d in train_test_data:
    print(type(d), d.shape)

In [None]:
# Convert the NumPy splits to tensors: float32 for the features, int64 for the class labels

# Features
train_X_tensor = torch.tensor(train_X, dtype=torch.float32)
test_X_tensor = torch.tensor(test_X, dtype=torch.float32)

# Labels
train_y_tensor = torch.tensor(train_y, dtype=torch.int64)
test_y_tensor = torch.tensor(test_y, dtype=torch.int64)

# Cell output: shape, type, dtype and length of the test labels, then of the test features
(
    test_y_tensor.shape, type(test_y_tensor), test_y_tensor.dtype, len(test_y_tensor),
    test_X_tensor.shape, type(test_X_tensor), test_X_tensor.dtype, len(test_X_tensor),
)

In [None]:
# Create Pytorch Model

class IrisClassifier(nn.Module):
    """Small feed-forward classifier: Linear -> ReLU -> Linear.

    Args:
        in_features: Number of input features per sample.
        out_features: Width of the hidden layer (output size of the first linear layer).
        num_classes: Number of logits produced per sample. Defaults to 3, the value
            that was previously hard-coded.
    """

    def __init__(self, in_features: int, out_features: int, num_classes: int = 3):
        super().__init__()
        self.linear_layer_1 = Linear(in_features=in_features, out_features=out_features)
        # Without a non-linearity, two stacked linear layers are equivalent to a single
        # linear map, so the hidden layer would add parameters but no expressive power.
        # ReLU has no parameters, so the state_dict keys are unchanged.
        self.activation = nn.ReLU()
        self.linear_layer_2 = Linear(in_features=out_features, out_features=num_classes)

    def forward(self, features):
        """Return the raw class logits for ``features``.

        No softmax is applied here; the logits are returned as produced by the
        second linear layer.
        """
        hidden = self.activation(self.linear_layer_1(features))
        return self.linear_layer_2(hidden)

In [None]:
# Instantiate the model: input width is the number of feature columns, hidden width is 64
model = IrisClassifier(in_features=train_X.shape[1], out_features=64)
model
# Only this last expression is displayed: the bound `parameters` method itself (it is not called)
model.parameters

In [None]:
# Establish loss, optimizer, epochs, and batch_size

# Number of full passes over the training data
epochs = 10

# Number of samples per mini-batch (passed to the DataLoader)
batch_size = 32

# Loss applied to the model's logits and the integer class labels
loss_cel = CrossEntropyLoss()

# NOTE(review): lr=1e-5 is very small for only 10 epochs of Adam; the loss may barely
# move — confirm this value is intended.
optimizer = optim.Adam(params=model.parameters(), lr=1e-5)


In [None]:
# Wrap the training tensors in a TensorDataset and a DataLoader so the training loop
# can iterate over shuffled mini-batches of (features, labels).
train_dataset = TensorDataset(train_X_tensor, train_y_tensor)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# The test tensors are not wrapped in a DataLoader; the test cell passes them to the model directly.

In [None]:

# Train the model
for epoch in range(epochs):
    # Sum of the per-batch losses within this epoch
    running_loss = 0.0
    for batch_features, labels_features in train_dataloader:

        # Reset the gradients of all parameters so gradients from the previous
        # mini-batch do not accumulate into this training step
        optimizer.zero_grad()

        # Forward pass: one row of logits per sample in the batch
        logits = model(batch_features)

        # Compute the loss over the WHOLE mini-batch. Indexing with [0] here would
        # train on only the first sample of every batch and ignore the rest.
        loss = loss_cel(logits, target=labels_features)

        # Backward pass and parameter update
        loss.backward()

        optimizer.step()

        running_loss += loss.item()
    # `Loss` is the loss of the last batch in the epoch; `Running Loss` is the epoch sum
    print(f'Epoch: {epoch}, Loss: {loss.item()}, Running Loss: {running_loss}')

In [None]:
# Testing the model

# Inference only: disable gradient tracking while scoring the test set
with torch.no_grad():
    test_logits = model(test_X_tensor)

    # The index of the largest logit along dim 1 is the predicted class id
    _, predicted = test_logits.max(dim=1)

    accuracy = accuracy_score(y_pred=predicted, y_true=test_y_tensor.numpy())

    print(f'Accuracy: {accuracy}')
predicted

In [None]:

# Save model: write only the learned parameters (the state_dict) to disk

torch.save(obj=model.state_dict(), f='iris_pytorch_classifier.pth')

In [None]:
# Load a locally saved model
# The checkpoint was written from model.state_dict(), so torch.load returns a mapping of
# parameter names to tensors rather than a model. Build a fresh IrisClassifier with the
# same layer sizes and load the weights into it.
# NOTE(review): torch.load unpickles the file — only load checkpoints from a trusted source.
loaded_state_dict = torch.load('iris_pytorch_classifier.pth')
loaded_model = IrisClassifier(in_features=train_X.shape[1], out_features=64)
loaded_model.load_state_dict(loaded_state_dict)
# Switch to evaluation mode since the restored model is meant for inference
loaded_model.eval()

# Counter used by the manual accuracy check in the following cell
accuracy_count = 0

In [None]:
# Manually recompute the accuracy by comparing each prediction with its true label.
# The counter is reset here so that re-running this cell does not keep adding to a
# previous count (which would push the printed ratio above 1).
accuracy_count = 0
for y_pred,  y_test in zip(predicted, test_y_tensor):
    print(f"Predicted Label: {y_pred},    True Label: {y_test}")
    if y_pred == y_test:
        accuracy_count += 1
# Fraction of test samples whose predicted label equals the true label
print(accuracy_count/len(predicted))
