In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader
import numpy as np
import os
import matplotlib.pyplot as plt
from models import *
from configs import cfg
import pandas as pd
from nltk.translate import bleu_score
import pickle
def load_data(fname, data_dir='BeerAdvocatePA4/'):
    """Read one of the dataset CSV files into a pandas DataFrame.

    Args:
        fname: Name of the CSV file, relative to ``data_dir``.
        data_dir: Directory that holds the dataset files. Defaults to the
            ``BeerAdvocatePA4/`` folder relative to the current working
            directory; pass a different location instead of editing this
            function when the data lives elsewhere.

    Returns:
        The DataFrame returned by ``pd.read_csv`` for the resolved path.
        Any error raised by ``pd.read_csv`` (e.g. for a missing file) is
        propagated unchanged.
    """
    # os.path.join keeps the path valid whether or not data_dir ends with a
    # separator, unlike plain string concatenation.
    return pd.read_csv(os.path.join(data_dir, fname))

def char2oh(padded, translate, beers):
    """One-hot encode a padded batch of translated reviews.

    Each row of ``padded`` is laid out as
    ``[beer style index, overall rating, char index, char index, ...]``.

    Args:
        padded: 2-D array with one row per review, in the layout above.
        translate: Character lookup table; only its length is used here, as
            the size of the character one-hot vector.
        beers: Beer-style lookup table; only its length is used here, as the
            size of the beer-style one-hot vector.

    Returns:
        A pair ``(features, char_onehot)``:

        * ``features`` has shape ``(rows, seq_len, 1 + len(beers) +
          len(translate))``; every time step holds the overall rating, the
          beer-style one-hot and that step's character one-hot.
        * ``char_onehot`` has shape ``(rows, seq_len, len(translate))`` and
          holds only the character one-hots.

        ``seq_len`` is ``padded.shape[1] - 2`` because the first two columns
        are metadata rather than characters.
    """
    n_rows = padded.shape[0]
    seq_len = padded.shape[1] - 2

    # Split the row layout into its three parts.
    style_idx = padded[:, 0].astype(int)
    overall = padded[:, 1]
    char_idx = padded[:, 2:].astype(int)

    # One-hot of the beer style: one row per review.
    style_onehot = np.zeros((n_rows, len(beers)))
    style_onehot[np.arange(n_rows), style_idx] = 1

    # One-hot of every character; the (row, step) index grids pair each
    # character index with its position in the 3-D output.
    char_onehot = np.zeros((n_rows, seq_len, len(translate)))
    row_grid, step_grid = np.indices((n_rows, seq_len))
    char_onehot[row_grid, step_grid, char_idx] = 1

    # Per-review metadata vector: overall rating followed by the style one-hot.
    meta = np.column_stack((overall, style_onehot))

    # Repeat the metadata vector once per time step: tiling along the last
    # axis and reshaping yields seq_len consecutive copies for each review.
    meta_per_step = np.tile(meta, seq_len).reshape(n_rows, seq_len, -1)

    features = np.concatenate((meta_per_step, char_onehot), axis=-1)
    return features, char_onehot
    

def process_train_data(data):
    """Turn a DataFrame of reviews into one-hot model inputs and labels.

    Rows whose ``'review/text'`` is not a string (e.g. missing values) are
    skipped. Every remaining review is wrapped as ``chr(0) + text + chr(1)``,
    translated character by character through the ``ASCII2oneHot.pkl``
    table, padded to the longest review in ``data`` with the translated
    ``chr(1)`` value, and one-hot encoded together with its beer style and
    overall rating.

    Args:
        data: DataFrame with the columns ``'beer/style'``,
            ``'review/overall'`` and ``'review/text'``.

    Returns:
        A pair ``(inputs, labels)`` of NumPy arrays. ``inputs`` holds, for
        every time step except the last, the overall rating, the beer-style
        one-hot and the character one-hot. ``labels`` holds the character
        one-hots for every time step except the first, i.e. the next
        character for each input step. Rows are ordered by descending
        review length (a side effect of ``pad_data``), not in the order of
        ``data``.

    Raises:
        ValueError: If ``data`` contains no row with a string review.
        KeyError: If a beer style or character is missing from the lookup
            tables.
    """
    # NOTE: pickle.load executes arbitrary code embedded in the file; both
    # lookup tables must be trusted project artifacts.
    # Table translating ord(character) -> one-hot index.
    with open("ASCII2oneHot.pkl", "rb") as f:
        translate = pickle.load(f)

    # Table translating beer style name -> index.
    with open("BeerDict.pkl", "rb") as f:
        beers = pickle.load(f)

    # Iterate the three needed columns in lockstep; this avoids building a
    # Series per row as iterrows() does, which matters because this function
    # runs once per training batch.
    rows = zip(data['beer/style'], data['review/overall'], data['review/text'])

    # Each encoded review: [style index, overall, translated characters...]
    encoded = [
        torch.Tensor(
            [beers[style], overall]
            + [translate[ord(ch)] for ch in chr(0) + text + chr(1)]
        )
        for style, overall, text in rows
        if isinstance(text, str)
    ]

    # Fail with a clear message instead of an opaque error from padding.
    if not encoded:
        raise ValueError(
            "process_train_data: no rows with a string 'review/text' to encode")

    # Pad the shorter reviews with the translated chr(1) (<EOS>) value.
    padded = pad_data(encoded, translate[1])
    del encoded  # free the per-review tensors before the large one-hot arrays

    # One-hot the characters and the beer style; the overall rating stays a
    # plain number.
    reviews, labels = char2oh(np.array(padded), translate, beers)
    del padded

    # Labels are the next characters: drop the last step from the inputs and
    # the first step from the labels so they line up.
    return reviews[:, 0:-1, :], labels[:, 1:, :]
    
    
def train_valid_split(data, valid_frac=0.2):
    """Randomly split a DataFrame into training and validation frames.

    Args:
        data: DataFrame holding the full training set.
        valid_frac: Fraction of the rows assigned to the validation frame,
            between 0 and 1 inclusive. Defaults to 0.2 (a 20/80 split).

    Returns:
        A pair ``(train_data, validation_data)``. The validation frame gets
        ``int(valid_frac * len(data))`` randomly chosen rows and the
        training frame gets the rest; the two frames share no rows.

    Raises:
        ValueError: If ``valid_frac`` is outside ``[0, 1]``.
    """
    if not 0.0 <= valid_frac <= 1.0:
        raise ValueError("valid_frac must be between 0 and 1, got %r" % (valid_frac,))

    # Shuffle the row positions with NumPy's global RNG, so np.random.seed
    # controls the split.
    ind = np.arange(len(data))
    np.random.shuffle(ind)

    # The first `split` shuffled positions go to validation, the rest to
    # training.
    split = int(valid_frac * len(data))
    validation_data = data.iloc[ind[:split]]
    train_data = data.iloc[ind[split:]]

    return train_data, validation_data
    
def process_test_data(data):
    """Placeholder for test-set preprocessing; not implemented yet.

    TODO: take a pandas DataFrame and return a NumPy array (or a torch
    Tensor/Variable) holding all input features. Test data contains no
    review text, so no character one-hot encoding is needed.

    Args:
        data: DataFrame of test rows (currently unused).

    Raises:
        NotImplementedError: Always, until the function is written.
    """
    raise NotImplementedError

    
def pad_data(orig_data, pad):
    """Pad a list of 1-D tensors to a common length.

    Args:
        orig_data: Sequence of tensors of possibly different lengths, one
            per review, holding translated character indices.
        pad: Value used to fill the shorter tensors (the <EOS> index).

    Returns:
        A single tensor with one row per input tensor (batch first), every
        row as long as the longest input. Rows are ordered by descending
        original length; inputs of equal length keep their relative order.
    """
    # Longest sequence first; list.sort is stable, so ties keep input order.
    longest_first = list(orig_data)
    longest_first.sort(key=len, reverse=True)

    return torch.nn.utils.rnn.pad_sequence(
        longest_first,
        batch_first=True,
        padding_value=pad,
    )
    

def getBatchIter(data, batchSize):
    """Partition the row positions of ``data`` into batches.

    Args:
        data: Any sized container (typically a DataFrame); only ``len(data)``
            is used.
        batchSize: Maximum number of rows per batch; must be positive.

    Returns:
        A list of integer index arrays that together cover
        ``0 .. len(data) - 1`` in order. Batches are of near-equal size and
        never larger than ``batchSize``. An empty ``data`` yields an empty
        list.

    Raises:
        ValueError: If ``batchSize`` is not positive.
    """
    if batchSize <= 0:
        raise ValueError("batchSize must be positive, got %r" % (batchSize,))

    total = len(data)
    if total == 0:
        # Nothing to batch; avoids handing callers a single empty batch.
        return []

    # Ceiling division: rounding down would request zero sections when
    # total < batchSize (np.array_split raises) and would produce batches
    # larger than batchSize whenever total is not an exact multiple.
    numBatches = int(-(-total // batchSize))

    # Split the positions into numBatches near-equal consecutive chunks.
    return np.array_split(np.arange(total), numBatches)
    
    
def train(model, X_train, X_valid, cfg):
    """Run the (still incomplete) training loop over the training frame.

    Batches are processed on the fly: each batch of rows is pulled from
    ``X_train`` and converted to one-hot arrays with ``process_train_data``
    right before the forward pass.

    NOTE: only the forward pass is implemented. The loss computation, the
    optimizer step and the validation pass are still TODO, so calling this
    function does not update the model's weights yet.

    Args:
        model: The network to train; called with a float tensor of shape
            ``batch x sequence length x input dimension``.
        X_train: DataFrame of training rows.
        X_valid: DataFrame of validation rows (batched but not yet used).
        cfg: Mapping with the keys ``'epochs'``, ``'learning_rate'`` and
            ``'L2_penalty'``.

    Returns:
        None.
    """
    epochs = cfg['epochs']
    l_rate = cfg['learning_rate']
    penalty = cfg['L2_penalty']

    # Loss and optimizer; created here for the training step that is still
    # to be written (see the TODO in the batch loop).
    Criterion = torch.nn.CrossEntropyLoss()
    Optimizer = optim.Adam(model.parameters(), lr=l_rate, weight_decay=penalty)

    # Size of each batch
    batchSize = 64

    # Lists of index batches into the two frames
    trainIter = getBatchIter(X_train, batchSize)
    validIter = getBatchIter(X_valid, batchSize)  # reserved for validation

    # cfg['epochs'] is an epoch count; iterating over an int directly raises
    # TypeError. An already-iterable value is still accepted as-is.
    epoch_iter = range(epochs) if isinstance(epochs, int) else epochs

    for e in epoch_iter:
        # Training loop
        for batch_count, batchInd in enumerate(trainIter):
            # Rows of this batch
            batchFrame = X_train.iloc[batchInd]

            # One-hot inputs and next-character labels for the batch
            batch, labels = process_train_data(batchFrame)

            # Forward pass. process_train_data returns NumPy arrays, while
            # the model operates on tensors, so convert first.
            # batch has shape batch size x sequence length x input dimension.
            output = model(torch.Tensor(batch))

            # TODO: compute the loss of `output` against `labels` with
            # Criterion, backpropagate, and step Optimizer.

            # Progress report every 50 batches (skipping batch 0)
            if batch_count % 50 == 0 and batch_count > 0:
                print("On batch %d" % batch_count)

        # TODO: evaluate on the validation batches in validIter.
        print("Completed epoch %d" % e)
        
    
    
def generate(model, X_test, cfg):
    """Placeholder for review generation; not implemented yet.

    TODO: given n rows of test data, produce a list of n strings, where
    each string is the generated review for the corresponding input row.

    Args:
        model: The trained network (currently unused).
        X_test: Test data rows (currently unused).
        cfg: Configuration mapping (currently unused).

    Raises:
        NotImplementedError: Always, until the function is written.
    """
    raise NotImplementedError
    
    
def save_to_file(outputs, fname):
    """Placeholder for writing generated reviews to disk; not implemented yet.

    TODO: given the list of generated review strings and an output file
    name, save all the reviews to that file in .txt format.

    Args:
        outputs: List of generated reviews (currently unused).
        fname: Name of the output file (currently unused).

    Raises:
        NotImplementedError: Always, until the function is written.
    """
    raise NotImplementedError
    

# ---------------------------------------------------------------------------
# Interactive scratch cells from the notebook export.
# NOTE(review): the "In [n]:" lines are notebook cell markers, not Python, and
# n is the execution counter. The cells were run in the order 2, 8, 9, 10,
# 11, 12, which differs from the order they appear in below (cell 9 uses `a`,
# which is assigned in cell 8).
# ---------------------------------------------------------------------------
In [2]:
# Build the model from the project config.
# presumably baselineLSTM comes from the `models` wildcard import - confirm
m = baselineLSTM(cfg)

In [9]:
# Forward pass on the sample inputs `a` from cell 8; the result is unpacked
# into an output tensor and an (h, c) pair.
out,(h,c) = m(torch.Tensor(a))

In [8]:
# Load the training CSV and encode its first five rows:
# `a` = one-hot inputs, `b` = next-character one-hot labels.
X_train = load_data('Beeradvocate_Train.csv')
s = X_train[0:5]
a,b = process_train_data(s)

In [10]:
# Negative log-likelihood loss object used for this experiment.
# NOTE(review): train() above creates CrossEntropyLoss instead; which one is
# right depends on whether the model already applies log-softmax - confirm.
L = torch.nn.NLLLoss()

In [11]:
# Convert the one-hot labels to an integer (long) tensor.
b = torch.Tensor(b).type(torch.LongTensor)

In [12]:
# Flatten the model output to 2-D with 111 columns.
# presumably 5 reviews x 1146 time steps x 111 output classes - confirm
x = out.view(5*1146,111)
# Index of the hot entry at every time step, flattened to 1-D.
y = b.argmax(dim=2).view(-1)

# Captured cell output (the expression that produced it is not shown;
# presumably the shape of b.argmax(dim=2) before flattening - confirm).
torch.Size([5, 1146])