In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from torch.utils.data import DataLoader
import pandas as pd
import os
from sklearn.metrics import accuracy_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print (device)

cpu


In [3]:
# Downloading and preprocessing the data
transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

batch_size = 4

train_dataset = datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=2)

test_dataset = datasets.CIFAR10(root='./data', train=False, download=True, transform=transform)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=2)

classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck')

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data\cifar-10-python.tar.gz


100.0%


Extracting ./data\cifar-10-python.tar.gz to ./data
Files already downloaded and verified


In [4]:
class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = torch.flatten(x, 1) # flatten all dimensions except batch
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x


net = Net()
net.to(device)

Net(
  (conv1): Conv2d(3, 6, kernel_size=(5, 5), stride=(1, 1))
  (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))
  (fc1): Linear(in_features=400, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (fc3): Linear(in_features=84, out_features=10, bias=True)
)

In [5]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

In [6]:
for epoch in range(3): 
    running_loss = 0.0
    for i, data in enumerate(train_loader, 0):
        inputs, labels = data[0].to(device), data[1].to(device)
        optimizer.zero_grad()
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        if i % 2000 == 1999:    # print every 2000 mini-batches
            print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}')
            running_loss = 0.0
print('Finished Training')

[1,  2000] loss: 2.228
[1,  4000] loss: 1.919
[1,  6000] loss: 1.698
[1,  8000] loss: 1.614
[1, 10000] loss: 1.536
[1, 12000] loss: 1.470
[2,  2000] loss: 1.405
[2,  4000] loss: 1.386
[2,  6000] loss: 1.353
[2,  8000] loss: 1.313
[2, 10000] loss: 1.307
[2, 12000] loss: 1.274
[3,  2000] loss: 1.227
[3,  4000] loss: 1.218
[3,  6000] loss: 1.214
[3,  8000] loss: 1.186
[3, 10000] loss: 1.160
[3, 12000] loss: 1.152
Finished Training


In [7]:
PATH = './cifar_net.pth'
torch.save(net.state_dict(), PATH)

In [8]:
net = Net()
net.load_state_dict(torch.load('cifar_net.pth'))
net.to(device)

  net.load_state_dict(torch.load('cifar_net.pth'))


Net(
  (conv1): Conv2d(3, 6, kernel_size=(5, 5), stride=(1, 1))
  (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))
  (fc1): Linear(in_features=400, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (fc3): Linear(in_features=84, out_features=10, bias=True)
)

In [9]:
correct = 0
total = 0
with torch.no_grad():
    for data in test_loader:
        images, labels = data[0].to(device), data[1].to(device)
        outputs = net(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
print(f'Accuracy of the network on the 10000 test images: {100 * correct // total} %')

Accuracy of the network on the 10000 test images: 59 %


In [10]:
correct_pred = {classname: 0 for classname in classes}
total_pred = {classname: 0 for classname in classes}

with torch.no_grad():
    for data in test_loader:
        images, labels = data[0].to(device), data[1].to(device)
        outputs = net(images)
        _, predictions = torch.max(outputs, 1)
        for label, prediction in zip(labels, predictions):
            if label == prediction:
                correct_pred[classes[label]] += 1
            total_pred[classes[label]] += 1

for classname, correct_count in correct_pred.items():
    accuracy = 100 * float(correct_count) / total_pred[classname]
    print(f'Accuracy for class: {classname:5s} is {accuracy:.1f} %')

Accuracy for class: plane is 53.1 %
Accuracy for class: car   is 76.7 %
Accuracy for class: bird  is 44.7 %
Accuracy for class: cat   is 44.2 %
Accuracy for class: deer  is 39.0 %
Accuracy for class: dog   is 47.4 %
Accuracy for class: frog  is 77.1 %
Accuracy for class: horse is 70.7 %
Accuracy for class: ship  is 75.8 %
Accuracy for class: truck is 64.8 %


In [11]:
# Feature extraction algorithm
def extract_features(loader, model, dataset):
    features = []
    filenames = []

    model.eval()
    with torch.no_grad():
        for images, labels in loader:
            images = images.to(device)
            outputs = model(images)
            features.append(outputs)
            filenames.extend(list(dataset.data[:len(filenames)]))
    features = torch.cat(features, dim=0)

    return features, filenames

In [12]:
train_features, train_filenames = extract_features(train_loader, net, train_dataset)
test_features, test_filenames = extract_features(test_loader, net, test_dataset)

In [13]:
# Creating DataFrame for train set
train_df = pd.DataFrame(train_features.cpu().numpy())

# Renam columns '0', '1', '2', ... to 'feature_Y' for train set
train_df.columns = [f'feature_{i}' for i in range(train_df.shape[1])]

# Adding a 'Case_Number' column with instance names for train set
train_df.insert(0, 'Case_Number', [f'Case_{i}' for i in range(train_df.shape[0])])

# Adding a 'target' column with CIFAR-10 target classes for train set
train_df['target'] = train_dataset.targets

# Get unique values from the 'target' column
unique_targets = sorted(train_df['target'].unique())

# Create new columns for each unique 'target' value
for target in unique_targets:
    train_df[target] = (train_df['target'] == target).astype(int)

# Cut out the column with the 'target' header
target_column = train_df.pop('target')

# Insert it at the last position
train_df['target'] = target_column

# Saving train set features to a CSV file
train_df.to_csv('./train_features.csv', index=False)

# Creating DataFrame for test set
test_df = pd.DataFrame(test_features.cpu().numpy())

# Rename columns '0', '1', '2', ... to 'feature_Y' for test set
test_df.columns = [f'feature_{i}' for i in range(test_df.shape[1])]

# Adding a 'Case_Number' column with instance names for test set
test_df.insert(0, 'Case_Number', [f'Case_{i}' for i in range(test_df.shape[0])])

# Adding a 'target' column with CIFAR-10 target classes for test set
test_df['target'] = test_dataset.targets

# Get unique values from the 'target' column
unique_targets = sorted(test_df['target'].unique())

# Create new columns for each unique 'target' value
for target in unique_targets:
    test_df[target] = (test_df['target'] == target).astype(int)

# Cut out the column with the 'target' header
target_column = test_df.pop('target')

# Insert it at the last position
test_df['target'] = target_column

# Saving test set features to a CSV file
test_df.to_csv('./test_features.csv', index=False)

print("Features are saved in train_features.csv and test_features.csv")

Features are saved in train_features.csv and test_features.csv


In [14]:
# Specify the path to the CSV files
csv_train_file_path = "train_features.csv"
csv_test_file_path = "test_features.csv"

# Check if file exists
if not os.path.exists(csv_train_file_path):
    print(f"File {csv_train_file_path} does not exist.")
else:
    # Upload CSV file
    df = pd.read_csv(csv_train_file_path)

    # Get file size in bytes
    file_size_bytes = os.path.getsize(csv_train_file_path)

    # Convert size to megabytes
    file_size_mb = file_size_bytes / (1024 * 1024)

    # Get information about the number of rows and columns
    num_rows, num_columns = df.shape

    # Print information
    print(f"Train file size: {file_size_mb:.2f} MB")
    print(f"Number of lines in train file: {num_rows}")
    print(f"Number of columns in train file: {num_columns}")

# Check if file exists
if not os.path.exists(csv_test_file_path):
    print(f"File {csv_test_file_path} does not exist.")
else:
    # Upload CSV file
    df = pd.read_csv(csv_test_file_path)

    # Get file size in bytes
    file_size_bytes = os.path.getsize(csv_test_file_path)

    # Convert size to megabytes
    file_size_mb = file_size_bytes / (1024 * 1024)

    # Get information about the number of rows and columns
    num_rows, num_columns = df.shape

    # Print information
    print(f"\nTest file size: {file_size_mb:.2f} MB")
    print(f"Number of lines in test file: {num_rows}")
    print(f"Number of columns in test file: {num_columns}")
    print(f"\nFeatures from 1 to {num_columns-11}")

Train file size: 6.64 MB
Number of lines in train file: 50000
Number of columns in train file: 22

Test file size: 1.32 MB
Number of lines in test file: 10000
Number of columns in test file: 22

Features from 1 to 11


In [15]:
#LDA
# Load training data from a CSV file
csv_train_file_path = "test_features.csv"
train_data = pd.read_csv('train_features.csv')

# Load test data from a separate CSV file
test_data = pd.read_csv('test_features.csv')

# Prepare training data
X_train = train_data.iloc[:, 1:11]  # Select features
y_train = train_data['target']  # Target variable

# Prepare test data
X_test = test_data.iloc[:, 1:11]  # Select features
y_test = test_data['target']  # Target variable

# Train the LDA model on the training data
lda = LinearDiscriminantAnalysis()
lda.fit(X_train, y_train)

# Predict classes for the training data
y_train_pred = lda.predict(X_train)

# Predict classes for the test data
y_pred = lda.predict(X_test)

# Evaluate model accuracy on the training data
train_accuracy = accuracy_score(y_train, y_train_pred)
train_error_rate = 1 - train_accuracy

# Evaluate model accuracy on the test data
accuracy = accuracy_score(y_test, y_pred)
error_rate = 1 - accuracy

print(f'Accuracy on the training data: {train_accuracy * 100:.2f}%')
print(f'Error rate on the training data: {train_error_rate * 100:.2f}%')
print(f'Accuracy on the test data: {accuracy * 100:.2f}%')
print(f'Error rate on the test data: {error_rate * 100:.2f}%')

# Count the number of elements in each predicted class for the training data
predicted_class_counts_train = pd.Series(y_train_pred).value_counts().sort_index()
print("\nPredicted class counts in training data:")
print(predicted_class_counts_train)

# Count the number of elements in each predicted class for the test data
predicted_class_counts_test = pd.Series(y_pred).value_counts().sort_index()
print("\nPredicted class counts in test data:")
print(predicted_class_counts_test)

Accuracy on the training data: 10.60%
Error rate on the training data: 89.40%
Accuracy on the test data: 12.70%
Error rate on the test data: 87.30%

Predicted class counts in training data:
0    4649
1    5670
2    2641
3    6305
4    2067
5    5543
6    7961
7    3305
8    5930
9    5929
Name: count, dtype: int64

Predicted class counts in test data:
0     938
1    1180
2     520
3    1278
4     423
5    1104
6    1607
7     646
8    1148
9    1156
Name: count, dtype: int64


In [16]:
#Logistic Regression
# Load training data from a CSV file
train_data = pd.read_csv('train_features.csv')

# Load test data from a separate CSV file
test_data = pd.read_csv('test_features.csv')

# Prepare training data
X_train = train_data.iloc[:, 1:11]  # Select features
y_train = train_data['target']  # Target variable

# Prepare test data
X_test = test_data.iloc[:, 1:11]  # Select features
y_test = test_data['target']  # Target variable

# Train the Logistic Regression model on the training data
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

# Predict classes for the training data
y_train_pred = lr.predict(X_train)

# Predict classes for the test data
y_pred = lr.predict(X_test)

# Evaluate model accuracy on the training data
train_accuracy = accuracy_score(y_train, y_train_pred)
train_error_rate = 1 - train_accuracy

# Evaluate model accuracy on the test data
accuracy = accuracy_score(y_test, y_pred)
error_rate = 1 - accuracy

print(f'Accuracy on the training data: {train_accuracy * 100:.2f}%')
print(f'Error rate on the training data: {train_error_rate * 100:.2f}%')
print(f'Accuracy on the test data: {accuracy * 100:.2f}%')
print(f'Error rate on the test data: {error_rate * 100:.2f}%')

# Count the number of elements in each predicted class for the training data
predicted_class_counts_train = pd.Series(y_train_pred).value_counts().sort_index()
print("\nPredicted class counts in training data:")
print(predicted_class_counts_train)

# Count the number of elements in each predicted class for the test data
predicted_class_counts_test = pd.Series(y_pred).value_counts().sort_index()
print("\nPredicted class counts in test data:")
print(predicted_class_counts_test)

Accuracy on the training data: 10.62%
Error rate on the training data: 89.38%
Accuracy on the test data: 12.50%
Error rate on the test data: 87.50%

Predicted class counts in training data:
0    4540
1    5872
2    2651
3    6395
4    1991
5    5272
6    8113
7    3350
8    5898
9    5918
Name: count, dtype: int64

Predicted class counts in test data:
0     908
1    1220
2     521
3    1288
4     412
5    1041
6    1645
7     656
8    1150
9    1159
Name: count, dtype: int64
