# Data Pre-Processing for ML Classification

***
# Jupyter Notebook where:
1. CSV containing surface potentials will be imported using pandas
1. pyTorch's Dataloader will be used to iterate through the data in bratches so it can easily be passed into a model 

***
# Import Libraries

In [1]:
import csv
import os
import pandas as pd
import torch
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split

# Read Data from CSV into Dataframe

In [2]:
target_file="v1_vals.csv"

In [3]:
df_v1=pd.read_csv(target_file,header=None)

In [4]:
display(df_v1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,183,184,185,186,187,188,189,190,191,192
0,0.137718,0.067111,0.043241,0.032551,0.027374,0.025208,0.025204,0.027371,0.032547,0.043220,...,0.032547,0.027371,0.025203,0.025208,0.027375,0.032549,0.043237,0.067096,0.137833,0
1,0.137718,0.067111,0.043241,0.032551,0.027374,0.025208,0.025204,0.027371,0.032547,0.043220,...,0.032547,0.027371,0.025203,0.025208,0.027375,0.032549,0.043237,0.067096,0.137833,0
2,0.137718,0.067111,0.043241,0.032551,0.027374,0.025208,0.025204,0.027371,0.032547,0.043220,...,0.032547,0.027371,0.025203,0.025208,0.027375,0.032549,0.043237,0.067096,0.137833,0
3,0.137718,0.067111,0.043241,0.032551,0.027374,0.025208,0.025204,0.027371,0.032547,0.043220,...,0.032547,0.027371,0.025203,0.025208,0.027375,0.032549,0.043237,0.067096,0.137833,0
4,0.137718,0.067111,0.043241,0.032551,0.027374,0.025208,0.025204,0.027371,0.032547,0.043220,...,0.032547,0.027371,0.025203,0.025208,0.027375,0.032549,0.043237,0.067096,0.137833,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
595,0.119718,0.055783,0.035180,0.025439,0.021597,0.021048,0.021705,0.023936,0.028872,0.039054,...,0.030226,0.024914,0.023633,0.025023,0.027795,0.033262,0.044308,0.068835,0.141044,2
596,0.139020,0.068304,0.044483,0.034053,0.029279,0.023013,0.022025,0.028498,0.033671,0.039048,...,0.033987,0.029001,0.027118,0.022673,0.023821,0.033173,0.042378,0.056662,0.123070,2
597,0.140985,0.070218,0.046737,0.027808,0.019664,0.025148,0.026766,0.027829,0.031599,0.039557,...,0.033592,0.021519,0.017030,0.023376,0.026443,0.029934,0.038110,0.057163,0.120319,2
598,0.133108,0.063483,0.040348,0.022209,0.017667,0.023264,0.022729,0.023893,0.027869,0.036653,...,0.029585,0.018381,0.017523,0.024839,0.026738,0.031514,0.042070,0.066136,0.138276,2


In [5]:
print("Number of DataPoints=",len(df_v1))

Number of DataPoints= 600


In [6]:
for tumor_count in set(df_v1.iloc[:,-1]):
    print("Datapoints for tumor count ",tumor_count,"=",len(df_v1[df_v1.iloc[:,-1]==tumor_count]))

Datapoints for tumor count  0 = 200
Datapoints for tumor count  1 = 200
Datapoints for tumor count  2 = 200


# Seperate data into Training and Test Data

In [7]:
# all data except last column
X=df_v1.iloc[:,:-1]

# only last column
y=df_v1.iloc[:,-1]

In [8]:
X_train,X_test,y_train,y_test=train_test_split(X,y, test_size=0.2,random_state=42)

In [9]:
print("Training set - X:", X_train.shape, "y:", y_train.shape)
print("Test set - X:", X_test.shape, "y:", y_test.shape)

Training set - X: (480, 192) y: (480,)
Test set - X: (120, 192) y: (120,)


# Create PyTorch Tensor Instances

In [10]:
# Convert DataFrame to PyTorch tensors
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)

y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32)

In [11]:
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# Create DataLoader Instances

In [12]:
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# Viewing the Data in the DataLoader instances

In [13]:
for batch_idx, (inputs, targets) in enumerate(train_loader):
    print(f"Batch {batch_idx}:")
    print("Inputs (features):", inputs)
    print("Targets (labels):", targets)

Batch 0:
Inputs (features): tensor([[0.1377, 0.0671, 0.0432,  ..., 0.0432, 0.0671, 0.1378],
        [0.1377, 0.0671, 0.0432,  ..., 0.0432, 0.0671, 0.1378],
        [0.1377, 0.0671, 0.0432,  ..., 0.0432, 0.0671, 0.1378],
        ...,
        [0.1329, 0.0549, 0.0348,  ..., 0.0444, 0.0684, 0.1394],
        [0.1377, 0.0671, 0.0432,  ..., 0.0432, 0.0671, 0.1378],
        [0.1377, 0.0671, 0.0432,  ..., 0.0432, 0.0671, 0.1378]])
Targets (labels): tensor([0., 0., 0., 1., 1., 1., 1., 2., 2., 0., 2., 2., 0., 0., 1., 2., 1., 0.,
        2., 0., 1., 2., 1., 1., 1., 0., 2., 2., 1., 2., 2., 2., 2., 0., 0., 2.,
        0., 2., 1., 1., 1., 2., 2., 0., 2., 0., 0., 2., 2., 2., 2., 0., 0., 2.,
        1., 2., 0., 0., 2., 2., 1., 2., 0., 0.])
Batch 1:
Inputs (features): tensor([[0.1187, 0.0426, 0.0275,  ..., 0.0454, 0.0701, 0.1422],
        [0.1407, 0.0675, 0.0418,  ..., 0.0393, 0.0652, 0.1397],
        [0.1247, 0.0470, 0.0389,  ..., 0.0442, 0.0681, 0.1390],
        ...,
        [0.1377, 0.0671, 0.0432,  