# Data Pre-Processing for ML Classification

***
# Jupyter Notebook where:
1. CSV containing surface potentials will be imported using pandas
1. pyTorch's Dataloader will be used to iterate through the data in bratches so it can easily be passed into a model 

***
# Import Libraries

In [21]:
import csv
import os
import pandas as pd
import torch
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split

# Read Data from CSV into Dataframe

In [2]:
target_file="v1_vals.csv"

In [5]:
df_v1=pd.read_csv(target_file,header=None)

In [6]:
display(df_v1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,183,184,185,186,187,188,189,190,191,192
0,0.077473,0.191499,-0.015915,0.135394,0.173085,0.140029,-0.009876,-0.059194,-0.00252,0.077446,...,0.181089,0.021793,0.00649,0.099234,0.101714,0.001482,0.172762,-0.000312,0.014357,0
1,0.354857,-0.096564,0.014989,0.050301,0.228133,0.148119,0.121965,-0.001973,-0.054352,0.062697,...,0.144651,0.107456,0.014562,0.020383,-0.063842,0.136121,0.038906,0.244839,0.105337,0
2,0.113068,0.249683,0.130911,-0.021694,0.113439,-0.037429,0.068267,0.140063,0.027044,0.07412,...,0.131067,0.002131,0.10816,-0.050268,-0.032919,0.027794,-0.055128,0.143659,0.328582,1
3,0.220674,-0.056722,0.046858,0.143011,-0.064925,0.057976,0.126473,0.218015,-0.053549,-0.040245,...,-0.071479,0.177665,0.032123,0.239387,-0.180386,0.113581,0.047421,0.119779,0.353285,1
4,0.188679,0.137335,0.243993,-0.025802,0.140222,0.220769,-0.119227,0.213295,0.173657,-0.061786,...,-0.012182,-0.02404,0.291933,-0.11505,-0.034078,0.250822,0.0002,0.094196,0.097086,2
5,0.078482,-0.01859,0.030242,-0.107222,0.105309,0.079168,0.140757,-0.114055,0.193873,0.00349,...,0.135761,0.024943,-0.025748,0.228745,-0.176834,-0.072627,0.134558,0.120854,0.125967,2


In [27]:
print("Number of DataPoints=",len(df_v1))

Number of DataPoints= 6


In [31]:
for tumor_count in set(df_v1.iloc[:,-1]):
    print("Datapoints for tumor count ",tumor_count,"=",len(df_v1[df_v1.iloc[:,-1]==tumor_count)))

SyntaxError: closing parenthesis ')' does not match opening parenthesis '[' (3891020038.py, line 2)

# Seperate data into Training and Test Data

In [10]:
# all data except last column
X=df_v1.iloc[:,:-1]

# only last column
y=df_v1.iloc[:,-1]

In [11]:
X_train,X_test,y_train,y_test=train_test_split(X,y, test_size=0.2,random_state=42)

In [12]:
print("Training set - X:", X_train.shape, "y:", y_train.shape)
print("Test set - X:", X_test.shape, "y:", y_test.shape)

Training set - X: (4, 192) y: (4,)
Test set - X: (2, 192) y: (2,)


# Create PyTorch Tensor Instances

In [22]:
# Convert DataFrame to PyTorch tensors
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)

y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32)

In [23]:
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# Create DataLoader Instances

In [24]:
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# Viewing the Data in the DataLoader instances

In [26]:
for batch_idx, (inputs, targets) in enumerate(train_loader):
    print(f"Batch {batch_idx}:")
    print("Inputs (features):", inputs)
    print("Targets (labels):", targets)

Batch 0:
Inputs (features): tensor([[ 7.8482e-02, -1.8590e-02,  3.0242e-02, -1.0722e-01,  1.0531e-01,
          7.9168e-02,  1.4076e-01, -1.1406e-01,  1.9387e-01,  3.4904e-03,
          7.8099e-02,  4.7535e-02, -4.3512e-02, -3.0839e-02, -2.4019e-02,
          2.0530e-02,  8.2869e-02,  5.5664e-02,  2.0579e-01,  2.8563e-02,
          1.2001e-01,  1.0202e-01,  2.0822e-01, -5.5786e-02,  4.0781e-02,
          3.2124e-01, -3.2219e-02,  1.0479e-01, -1.0096e-01,  1.9893e-01,
          9.5592e-02,  1.3095e-01,  1.1165e-01, -3.1063e-02,  6.5370e-02,
          1.0884e-01,  2.9180e-01,  3.1322e-01, -6.3239e-02, -6.4932e-02,
         -4.1108e-02,  1.1252e-01, -4.9856e-02, -5.4142e-02,  1.2189e-01,
         -9.1132e-02,  4.8590e-02,  1.7883e-01,  1.9691e-01,  1.4130e-01,
          2.7658e-01,  1.5591e-01,  2.0119e-01,  1.3960e-02,  7.8056e-02,
          1.8977e-01, -4.2753e-02, -1.1835e-01,  1.9448e-01, -1.2736e-01,
          7.3606e-02, -1.0600e-01,  4.7805e-02,  1.3553e-01,  1.0705e-01,
         -