# This exercise was performed on ARCC Beartooth with the following environment/software packages: 

# Load Preinstalled Modules
module load gcc/12.2.0 miniconda3 git/2.37.0

# Create environment then activate with:
conda env create -f whatev_ml_env.yml
conda activate whatev_ml_env

# Make sure Jupyter can see the kernel in your conda env so you can select it from the SouthPass interface dropdown:
python -m ipykernel install --user --name=whatev_conda_torch

In [2]:
#importing all necessary libraries
import torch as torch
import os,sys,platform
import torchvision as tv
import math
import copy
import numpy as np
import pandas as pd
import tqdm #A progress bar
import matplotlib.pyplot as plt

from torch.utils.data import DataLoader, Dataset
from torchvision.transforms import ToTensor
from torch.autograd import Variable
from torch.nn import functional as F

from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.model_selection import train_test_split, ValidationCurveDisplay, GridSearchCV
from sklearn.metrics import r2_score as r2
from sklearn.metrics import accuracy_score as accuracy
from sklearn.metrics import confusion_matrix as cm
from sklearn.metrics import ConfusionMatrixDisplay as showMatrix
from sklearn import svm
from sklearn.svm import SVC

In [None]:
#checking devices available

# One torch.cuda.device handle per GPU that torch reports
gpu_count = torch.cuda.device_count()
available_gpus = [torch.cuda.device(gpu_index) for gpu_index in range(gpu_count)]

# Pretty-print the names (prints 'Tesla V100-SXM2-16GB', for example)
for gpu in available_gpus:
    gpu_props = torch.cuda.get_device_properties(gpu)
    print(gpu_props.name)

# CUDA version as reported by this torch install
print(torch.version.cuda)

# Prefer the GPU when one is usable, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print("Using device: ", device)

In [None]:
#confirm and checking platform and versions
# Interpreter / OS details first
for report_line in (
    f"Python Platform:{platform.platform()}",
    f"Python {sys.version}",
):
    print(report_line)

# Library versions (plain strings: there is nothing to interpolate in the labels)
print("PyTorch Version: ", torch.__version__)
print("Pandas Version: ", pd.__version__)

In [None]:
#Import data and separate out
# The CSV is read with ';' as the field separator
data = pd.read_csv('winequality-red.csv',sep=';')

#view data, get info about it, clean if necessary
# info() prints the column summary; tail(10) is the last expression of the cell,
# so in Jupyter it renders the final ten rows
data.info()
data.tail(10)


In [None]:
#data looks good, export and separate out labels
# pop() removes the "quality" column from `data` and returns it, so `data` keeps only the feature columns
labels = data.pop("quality")
display(labels) #a pandas Series; expected to hold int64 values - confirm against data.info()

In [None]:
#Make a method that converts pandas dataframe to a pytorch tensor
def to_tensor(df):
    """Return the values of a pandas object as a float32 torch tensor."""
    float_values = df.to_numpy().astype(np.float32)
    return torch.tensor(float_values)

In [None]:
#AKA'ing our data into X features and y labels
X, y = data, labels

#train_test_split shuffles the rows and sends 75% to train, the remainder to test
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X,
    y,
    train_size=.75,
    shuffle=True,
)

#Standardize the features with StandardScaler (zero mean / unit variance per column,
#NOT a 0-1 range). The scaler is fitted on the training split only and then
#applied to both splits.
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

#Convert to 2D PyTorch tensors: features as float32, labels reshaped to a single column
X_train = torch.tensor(X_train, dtype=torch.float32)
X_test = torch.tensor(X_test, dtype=torch.float32)
y_train = to_tensor(y_train).reshape(-1, 1)
y_test = to_tensor(y_test).reshape(-1, 1)

#Histogram of the quality labels
plot = labels.plot(kind='hist', title="Data Histogram", figsize=(5,3))

In [None]:
#First iteration: model1 is PyTorch's ready-made linear layer (torch.nn.Linear) used as a
#linear regression model. 11 features go in, 1 label comes out.
model1 = torch.nn.Linear(11, 1)


In [None]:
#Making my own linear regression model2 from nn.Module from pytorch
class myLinReg(torch.nn.Module):
    """Linear regression expressed as a single torch.nn.Linear layer."""

    def __init__(self, inputSz, outputSz):
        """Build the one layer that maps inputSz features to outputSz outputs."""
        super().__init__()
        self.linear = torch.nn.Linear(in_features=inputSz, out_features=outputSz)

    def forward(self, x):
        """Return the layer's prediction for the batch x."""
        return self.linear(x)

In [None]:
#Define some common training parameters I want to use for my Linear Regression model: model2
n_epochs2 = 300
batch_size = 25
learnRate = 0.005
#start index of every mini-batch; slicing past the end simply yields a shorter last batch
batch_start = torch.arange(0, len(X_train), batch_size)

#hold my best model to use later
best_mse2 = np.inf
best_weights2 = None
history2 = []

#initialize my model and place it on the selected device before building the optimizer.
#NOTE: `device` is a torch.device, and a torch.device never compares equal to a plain
#string, so the previous `if device=="cuda":` checks were always False and the GPU was
#never used. `.to(device)` does the right thing on both CPU and GPU.
myLRmodel = myLinReg(11, 1).to(device)

#define criteria for the Loss Function with MSE (Mean Squared Error) to evaluate loss.
lossfn2 = torch.nn.MSELoss()

#define my optimizer function using optimizer from Stochastic Gradient Descent
optimizer2 = torch.optim.SGD(myLRmodel.parameters(), learnRate)

#Move the data to the model's device once, outside the epoch loop (loop-invariant).
#Plain tensors are used directly: torch.autograd.Variable is a deprecated no-op wrapper.
#Distinct names are used so the pandas `labels` Series is not overwritten by a tensor.
train_inputs = X_train.to(device)
train_targets = y_train.to(device)
eval_inputs = X_test.to(device)
eval_targets = y_test.to(device)

#loop through training
for epoch in range(n_epochs2):
    myLRmodel.train()

    with tqdm.tqdm(batch_start, unit="batch", mininterval=0, disable=True) as bar:
        bar.set_description(f"Epoch {epoch}")
        for start in bar:
            #takes a batch
            X_batch = train_inputs[start:start+batch_size]
            y_batch = train_targets[start:start+batch_size]

            #forward pass
            train_pred = myLRmodel(X_batch)
            loss = lossfn2(train_pred, y_batch)

            #backwards pass clearing optimizer gradient buffers so they don't accumulate
            optimizer2.zero_grad()
            loss.backward()

            #update weights
            optimizer2.step()

            #print my progress (only visible when the bar is enabled)
            bar.set_postfix(mse=float(loss))

    #evaluate MSE at end of each epoch against our test data;
    #no_grad avoids building an autograd graph that is never used
    myLRmodel.eval()
    with torch.no_grad():
        test_pred = myLRmodel(eval_inputs)
        mse2 = float(lossfn2(test_pred, eval_targets))

    #save historic mse vals for comparison
    history2.append(mse2)
    if mse2 < best_mse2:
        best_mse2 = mse2
        #deepcopy is required: state_dict().copy() is a shallow copy whose tensors
        #are the live parameters, so the "best" snapshot would silently keep changing
        #as training continues.
        best_weights2 = copy.deepcopy(myLRmodel.state_dict())

#Hand the model back on the CPU: the evaluation cells that follow feed it CPU tensors
#and convert its outputs to numpy.
myLRmodel.cpu()
            


In [None]:
#Predict on X_test with model1, the stock torch.nn.Linear layer.
#X_test was standardized with StandardScaler (zero mean / unit variance, not a 0-1 range).
#NOTE(review): no training step for model1 appears in the visible code, so unless it is
#trained elsewhere these predictions come from its initial weights - confirm.
yPred1=model1(X_test)

#print("Predictions for X using PyTorch LR Model: ", yPred1)

In [None]:
#bring back my best stored model: load the state_dict that the training loop saved
#in best_weights2 (stored whenever the per-epoch test MSE improved) back into myLRmodel
myLRmodel.load_state_dict(best_weights2)

In [None]:
#clear any previous plotting
plt.clf()

#Report the lowest per-epoch test MSE that was recorded, to two decimals
print("MSE: ", f"{best_mse2:.2f}")

In [None]:
#Plot the test MSE recorded at the end of every epoch (x axis is the epoch index)
plt.plot(history2)
plt.show()

In [None]:
#Predict on the training split and compare against the true labels.
#The inputs follow the model to whatever device it currently lives on; the old
#`if device=="cuda":` test compared a torch.device to a string and was always False.
model_device = next(myLRmodel.parameters()).device
with torch.no_grad():  #inference only, no autograd graph needed
    train_pred = myLRmodel(X_train.to(model_device))

#.cpu() before .numpy() so the conversion also works when the tensors are on a GPU
traintrue = y_train.squeeze().detach().cpu().numpy()
trainpred = train_pred.squeeze().detach().cpu().numpy()

print(len(trainpred))
print(len(traintrue))

#R^2 of the predictions; with multioutput='raw_values' the result is an array
train_error = r2(traintrue, trainpred, multioutput='raw_values')
print("train metrics: " , train_error)

#A fresh figure is created directly; calling plt.clf() first would only create
#(and display) an extra empty figure.
plt.figure(figsize=(20,4))
plt.plot(traintrue, label="actual")
plt.plot(trainpred, label="predicted")
plt.ylabel("output")
plt.ylim(0,10)
plt.legend()
plt.show()


In [None]:
#do with test data now
#The inputs follow the model to whatever device it currently lives on; the old
#`if device=="cuda":` test compared a torch.device to a string and was always False.
model_device = next(myLRmodel.parameters()).device
with torch.no_grad():  #inference only, no autograd graph needed
    test_pred = myLRmodel(X_test.to(model_device))

#.cpu() before .numpy() so the conversion also works when the tensors are on a GPU
testtrue = y_test.squeeze().detach().cpu().numpy()
testpred = test_pred.squeeze().detach().cpu().numpy()

#R^2 of the predictions on the held-out split
test_error = r2(testtrue, testpred, multioutput='variance_weighted')
print("test metrics: ", test_error)

plt.figure(figsize=(10,4))
plt.plot(testtrue, label="actual")
plt.plot(testpred, label="predicted")
plt.ylabel("output")
plt.ylim(0,10)
plt.legend()
plt.show()

#Report the seed torch's default RNG was initialised with.
#torch.seed() was used here before, but that call RE-SEEDS the generator with a new
#non-deterministic value and returns that new value, so it never reported the seed
#behind the run that just finished.
print(torch.initial_seed())

# This model fits poorly, but with plain linear regression done this way that is probably to be expected.

# Let's do a VERY simple classification in scikit-learn:

In [None]:
#Reget our data since we messed with it so much up above.
#Read the CSV again and pull the "quality" column out as the labels
data = pd.read_csv('winequality-red.csv', sep=';')
labels = data.pop("quality")

#train_test_split shuffles the rows and sends 75% to train, the remainder to test;
#random_state=0 makes this particular split repeatable
split_options = dict(train_size=.75, shuffle=True, random_state=0)
X_train_raw, X_test_raw, y_train, y_test = train_test_split(data, labels, **split_options)


In [None]:
#Linear-kernel SVM classifier fitted on the raw (unscaled) training features.
#gamma is not passed: it only applies to the 'rbf', 'poly' and 'sigmoid' kernels.
classify = svm.SVC(kernel="linear", C=5)
classify.fit(X_train_raw, y_train)
predictions = classify.predict(X_test_raw)

print(predictions)
flat_y = y_test.to_numpy()
print(flat_y)

#Training accuracy must be scored on the same features the classifier was fitted on.
#X_train is the standardized tensor from the earlier, differently shuffled split, so
#its rows do not line up with this y_train.
print(classify.score(X_train_raw, y_train))

#Held-out accuracy
wineClassAccuracy = accuracy(y_test, predictions)
print(wineClassAccuracy)

#Confusion matrix over quality levels 1-8; the same list is handed to the display so
#the axes are labelled with quality values instead of positional indices 0-7.
quality_levels = [1,2,3,4,5,6,7,8]
wineClassConfMatrix = cm(y_test, predictions, labels=quality_levels)
disp = showMatrix(wineClassConfMatrix, display_labels=quality_levels)
disp.plot()
plt.show()

In [None]:
#Predicted vs. actual quality for every test sample.
#linestyle="" removes the connecting line, so an explicit marker is required or
#nothing is drawn at all; the legend makes the two series' labels visible.
plt.figure(figsize=(20,4))
plt.plot(predictions, label = "predicted", linestyle="", marker="o")
plt.plot(flat_y, label = "actual", linestyle="", marker="x")
plt.legend()
plt.show()

In [None]:
#keep a chart of possible hyperparameters for SVM
#One sub-grid per kernel family, so only parameters the kernel actually uses are varied:
#  - 'degree' only affects the 'poly' kernel
#  - 'gamma' only affects 'poly', 'rbf' and 'sigmoid'
#A single flat grid multiplies in all the irrelevant combinations (400 candidates);
#this list covers the same distinct models with 144.
_C_VALUES = [.01, .1, 1, 10]
_GAMMA_VALUES = [0.01,0.1,1,10,100]
param=[
    {'kernel': ['linear'], 'C': _C_VALUES},
    {'kernel': ['poly'], 'C': _C_VALUES, 'degree': [1,3,5,7,9], 'gamma': _GAMMA_VALUES},
    {'kernel': ['rbf', 'sigmoid'], 'C': _C_VALUES, 'gamma': _GAMMA_VALUES},
]

#make a quick and dirty helper to help us make a choice in terms of parameters
#for our SVM with Scikitlearn tools
def get_best_params(params):
    """Grid-search SVC hyperparameters with 3-fold cross-validation and print the winner.

    params: a parameter grid accepted by GridSearchCV (a dict, or a list of dicts).

    Fits on the module-level X_train_raw / y_train and scores the refitted best
    estimator on X_test_raw / y_test. Note that the value printed as
    "Best Training Score" is GridSearchCV's best_score_, i.e. the mean
    cross-validated score of the best candidate.

    Returns the fitted GridSearchCV object so the results can be inspected further.
    """
    #verbose=1 announces the number of fits up front, so a long search is not silent.
    #The result lines below can only be printed once every candidate has been fitted.
    search = GridSearchCV(SVC(), params, cv=3, verbose=1)
    search.fit(X_train_raw, y_train)
    print("Best Parameters are: " , search.best_params_)
    print("Best Training Score is: " , search.best_score_)
    print("Best Testing Score is: ", search.score(X_test_raw, y_test))
    return search

In [None]:
#Run the function
get_best_params(param)


#Nothing is printed until the search has finished: GridSearchCV fits every parameter
#combination in the grid for each cross-validation fold first, and the print calls in
#get_best_params only run after that, which can take a long time.