In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader
import numpy as np
import os
import matplotlib.pyplot as plt
from models import *
from configs import cfg
import pandas as pd
from nltk.translate import bleu_score
import pickle

In [2]:
def load_data(fname):
    # From the csv file given by filename and return a pandas DataFrame of the read csv.
    
    # Define path to data
    # Note: Path relative to KHIEM's files, make changes to relative path if necessary
    dataPath = 'BeerAdvocatePA4/' + fname
    
    # Read csv into pandas frame
    df = pd.read_csv(dataPath)
    
    # Return frame
    return df

def char2oh(padded,translate, beers):
    # Each row has form: beer style index||overall||character indices
    # TODO: Onehot the beer style and concatenate to overall
    # Onehot the character indices and concatenate the beerstyle||overall to each onehotted char
    
    # Each row's beer has form 1x#ofpossiblebeers
    beerstyles = np.zeros((padded.shape[0],len(beers)))
    # Each row's review has form max(lenofsequence) x #ofpossiblecharacters
    # Since we padded each review, they all have the same number of characters
    # Subtract two since we know the first two values aren't characters
    chars = np.zeros((padded.shape[0], (padded.shape[1] - 2), len(translate)))
    
    # First two columns are beerstyle indices and overalls
    bsidx = padded[:,0]
    ovrl = padded[:,1]
    
    # The rest are characters
    ch = padded[:,2:]
    
    # Index with bsidx
    beerstyles[np.arange(padded.shape[0]), bsidx.astype(int)] = 1
    
    igrid = np.mgrid[0:padded.shape[0], 0:(padded.shape[1]-2)]
    # Index with ch, we use meshgrid since this is a 3d array
    chars[igrid[0], igrid[1], ch.astype(int)] = 1
    
    # Concatenate overall and beer style
    meta_data = np.c_[ovrl, beerstyles]
    
    # Tile and reshape meta_data so we have a copy for each one of the characters
    tiled_meta = np.tile(meta_data, padded.shape[1] - 2).reshape(padded.shape[0], (padded.shape[1] - 2), -1)
    
    # Concatenate the items 
    return np.c_[tiled_meta, chars]
    

def process_train_data(data):
    # TODO: Input is a pandas DataFrame and return a numpy array (or a torch Tensor/ Variable)
    # that has all features (including characters in one hot encoded form).

    # Get the dictionary to translate between ASCII and onehot index
    with open("ASCII2oneHot.pkl", "rb") as f:
        translate = pickle.load(f)
    
    # Get the dictionary to translate between beer style and index
    with open("BeerDict.pkl", "rb") as f:
        beers = pickle.load(f)
        
    # List of reviews to onehot after translation
    toOnehot = []
    
    # For each review, convert to list of its translated characters
    # Translated means ord(c) -> onehot index
    # Also translate the beer style to its index value
    # Concatenate all the data and convert to tensor
    for idx,rev in data.iterrows():
        toOnehot.append(torch.Tensor([beers[rev['beer/style']]] + [rev['review/overall']] + 
                                     [translate[ord(x)] for x in list(chr(0)+rev['review/text']+chr(1))]))
    
    # Pad all smaller sentences with 1s to signify <EOS>
    padded = torch.nn.utils.rnn.pad_sequence(sorted(toOnehot, key = lambda x: len(x), reverse=True), 
                                           batch_first=True, padding_value=translate[1])

    # Take the array padded sentences and one-hot the characters.
    # Beer style also gets one-hot
    # Overall does not
    
    # Parse out labels
    padded = char2oh(np.array(padded), translate, beers)
    
    return padded
    
    
def train_valid_split(data):
    # TODO: Takes in train data as dataframe and
    # splits it into training and validation data.
    
    # List of indices of the data
    ind = np.arange(len(data))
    
    # Randomize the split
    np.random.shuffle(ind)
    
    # Where to split the indices
    # We'll take first 20% for validation, the rest for training
    split = int(0.2*len(data))
    
    # Split the indices
    vIndices = ind[0:split]
    tIndices = ind[split:]
    
    # Group the indices into their frames then return those
    validation_data = data.iloc[vIndices]
    train_data = data.iloc[tIndices]
    
    return train_data,validation_data
    
def process_test_data(data):
    # TODO: Takes in pandas DataFrame and returns a numpy array (or a torch Tensor/ Variable)
    # that has all input features. Note that test data does not contain any review so you don't
    # have to worry about one hot encoding the data.
    raise NotImplementedError

    
def pad_data(orig_data):
    # TODO: Since you will be training in batches and training sample of each batch may have reviews
    # of varying lengths, you will need to pad your data so that all samples have reviews of length
    # equal to the longest review in a batch. You will pad all the sequences with <EOS> character 
    # representation in one hot encoding.
    raise NotImplementedError
    

def getBatchIter(data, batchSize):
    # TODO: Returns a list of batches of indices
    # The list of batch indices will be used to index into the
    # corresponding data frame to extract the data
    
    # List of all possible indices
    ind = np.arange(len(data))
    
    # Calculate how many batches of batchSize would fit into
    # into the length of the data
    numBatches = int(len(data)/batchSize)
    
    # Split the array of indices into roughly equivalent batch sized batches
    batchedInd = np.array_split(ind, numBatches)
    
    return batchedInd
    
    
def train(model, X_train, X_valid, cfg):
    # TODO: Train the model!
    # Datas are given as pandas data frame. Call process on-line as we train to
    # get the data and label
    
    # Size of each batch
    batchSize = 64
    
    # Create the batch iterator for the data
    trainIter = getBatchIter(X_train, batchSize)
    validIter = getBatchIter(X_valid, batchSize)
    
    # Training loop
    for batch_count, batchInd in enumerate(trainIter,0):
        # Get the dataframe for the batch
        batchFrame = X_train.iloc[batchInd]
        
        # Process the batch for data and labels
        corpus, labels = process_train_data(batchFrame)
        
        
    
    
def generate(model, X_test, cfg):
    # TODO: Given n rows in test data, generate a list of n strings, where each string is the review
    # corresponding to each input row in test data.
    raise NotImplementedError
    
    
def save_to_file(outputs, fname):
    # TODO: Given the list of generated review outputs and output file name, save all these reviews to
    # the file in .txt format.
    raise NotImplementedError
    

In [3]:
if __name__ == "__main__":
    train_data_fname = "Beeradvocate_Train.csv"
    test_data_fname = "Beeradvocate_Test.csv"
    out_fname = "model_outputs.out"
    
    train_data = load_data(train_data_fname) # Generating the pandas DataFrame
    test_data = load_data(test_data_fname) # Generating the pandas DataFrame
    X_train, X_valid = train_valid_split(train_data) # Splitting the train data into train-valid data
    
    model = baselineLSTM(cfg) # Replace this with model = <your model name>(cfg)
    if cfg['cuda']:
        computing_device = torch.device("cuda")
    else:
        computing_device = torch.device("cpu")
    model.to(computing_device)
    
    train(model, X_train, X_valid, cfg) # Train the model
    outputs = generate(model, X_test, cfg) # Generate the outputs for test data
    save_to_file(outputs, out_fname) # Save the generated outputs to a file

ValueError: too many values to unpack (expected 2)

In [88]:
   train_data, train_labels = process_train_data(train_data) # Converting DataFrame to numpy array
    X_test = process_test_data(test_data) # Converting DataFrame to numpy array

Unnamed: 0.1,Unnamed: 0,beer/name,beer/beerId,beer/brewerId,beer/ABV,beer/style,review/appearance,review/aroma,review/palate,review/taste,review/overall,review/time,review/profileName,review/text
855192,855192,All Others Pale,41009,13307,6.0,American Pale Ale (APA),4.5,4.0,4.0,4.0,4.0,1238360333,mikesgroove,My first beer in nearly two weeks as I have be...
865928,865928,Mamma Mia! Pizza Beer,41127,16836,4.6,Herbed / Spiced Beer,3.0,1.5,2.5,1.5,1.5,1224865806,nightcrawler,Bought this to give it a try. What the heck.\t...
872855,872855,Stone 07.07.07 Vertical Epic Ale,37326,147,8.4,Belgian Strong Pale Ale,4.0,4.0,3.5,4.0,4.0,1184961073,dblinkhorn,Poured from a 22oz bomber into a tulip glass.\...
62170,62170,100% Barrel Fermented Autumn Maple,61151,16866,10.0,Fruit / Vegetable Beer,4.0,4.0,4.0,4.0,4.0,1305407811,black13,Thanks to Vitese for sending me this bottle. I...
159341,159341,IPA (India Pale Ale),9088,3818,7.3,American IPA,3.5,4.0,4.0,3.5,4.5,1275452010,jzeilinger,"A - Pours a clear orange color, creamy white h..."
671221,671221,Cascazilla,18721,651,7.0,American Amber / Red Ale,4.0,3.5,4.0,4.0,4.0,1292727882,coalcracker,Appearance: Pours a truly amber color with sub...
422916,422916,Great Lakes Edmund Fitzgerald Porter,226,73,5.8,American Porter,4.0,4.5,4.0,4.5,4.0,1252161432,mhatters,Served on-tap at TJs in Paoli\t\tPours a deep ...
1136812,1136812,Pliny The Younger,21690,863,11.0,American Double / Imperial IPA,4.0,4.5,5.0,5.0,5.0,1132707447,kmpitz2,Pours a hazy peach color with a 2 finger white...
1139348,1139348,Aud Blonde,27057,863,4.5,American Blonde Ale,3.5,4.0,4.0,3.5,4.0,1298097672,BerkeleyBeerMan,Drank this on tap at the Russian river brewing...
848207,848207,Naughty Hildegard ESB,56157,18858,6.5,Extra Special / Strong Bitter (ESB),3.5,3.5,3.5,2.5,2.0,1309057236,northyorksammy,Fresh bottle from rutager. Pours tan brown bod...


In [3]:
n = pd.read_csv('BeerAdvocatePA4/Beeradvocate_Train.csv', chunksize = 1000)

In [None]:
for idx,chnk in enumerate(n,0):
    print(idx)
    res = process_train_data(chnk)
    with open('Processed/chnk'+str(idx)+'.prc', 'wb+') as f:
        pickle.dump(res,f, pickle.HIGHEST_PROTOCOL)
    del res

0
1
2
