## Get Results
    Now that the data has been cleaned and the model has been validated,
    we trained the model on 90% of of a 500,000 entry sub sample and analyze the results
    
    In order to train model on all of the data, it may be necessary to implement Spark on AWS. 
    Using the entire data set breaks the jupyter kernel

In [1]:
import numpy as np
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import cPickle
from time import time
from sklearn.utils import shuffle
import pyspark
% matplotlib inline

In [2]:
from pyspark.mllib.recommendation import ALS
import math

#### Import Data

In [3]:
# number of nodes in local spark cluster
n_nodes = 3
sc = pyspark.SparkContext(master = "local[{}]".format(n_nodes))

In [48]:
def load_data():
    home = "/Users/Alexander/WINE/data/"
    pseudo_ratings_path = home + "ratings_df.pkl"
    wine_path = home + "wine_df_red_white_only.pkl"

    products_df = pd.read_pickle(wine_path)
    rating_df = pd.read_pickle(pseudo_ratings_path)

    return rating_df, products_df 

In [16]:
def get_rdds():
    def get_df_values_by_col(df):
        cust_tags = df.CustomerHash.values
        wine_productKeys = df.ProductKey.values
        ratings = df.Ratings.values
        return cust_tags, wine_productKeys, ratings

    def create_userHash_userid_pairs(cust_tags):
        # create int:cust_tag key value pairs
        # spark can't read string user ids
        index_to_int = np.arange(0, len(cust_tags) * 100, 100)
        return  index_to_int, [ [tag_int, tag_hash] for tag_hash, tag_int in zip(cust_tags, index_to_int)]
    
    
    def load_data():
        home = "/Users/Alexander/WINE/data/"
        pseudo_ratings_path = home + "ratings_df.pkl"
        wine_path = home + "wine_df_red_white_only.pkl"

        products_df = pd.read_pickle(wine_path)
        rating_df = pd.read_pickle(pseudo_ratings_path)

        return rating_df, products_df 
    
    rating_df, products_df  =  load_data()
    
    # Spark keeps crashing when using full data set
    # Perhaps running Sparking on AWS cluster will help
    rating_df = rating_df.sample(n = 500000)

    # move customerHashs, productKeys,and ratings each to their own list
    cust_tags, wine_productKeys, ratings = get_df_values_by_col(rating_df)

    # generate a numeric index for each customerHash
    index_to_int, cust_tag_bridge = create_userHash_userid_pairs(cust_tags)
    
    # move data from df to list
    complete_data = rating_df.values.tolist()
    # move data to rdd
    complete_data_rdd = sc.parallelize(complete_data)
    
    
    # move customerHash, customer numeric key/value pairs to rdd
    cust_tag_bridge_rdd = sc.parallelize(cust_tag_bridge)
    # formate customer (hash, numeric)
    cust_tag_bridge_rdd = cust_tag_bridge_rdd.map(lambda row: (row[1],row[0]))
    
    # formate data rdd
    complete_data_rdd = complete_data_rdd.map(lambda row: (row[0], (row[1], row[2])) )
    
    
    # join data with customer rdd
    complete_data_id_bridge_rdd = \
    complete_data_rdd.join(cust_tag_bridge_rdd)

    # format -->  (user_id, wine_id, rating)
    clean_data_rdd =\
    complete_data_id_bridge_rdd.map(lambda row: (row[1][1], row[1][0][0], row[1][0][1] )   )

    

    
    # create products_rdd
    products_rdd = sc.parallelize(products_df.values.tolist())
    # format --> (productKey, (productID, Appellation, Varietal, Vinyard) )
    products_rdd = products_rdd.map(lambda row: (row[0], (row[1], row[2], row[3], row[4]) )   )
    

        
    return clean_data_rdd, products_rdd, cust_tag_bridge_rdd

In [17]:
start = time()
clean_data_rdd, products_rdd, cust_tag_bridge_rdd = get_rdds()
end = time()
print "Time Elapsed = {:.3}".format(end - start)

Time Elapsed = 7.27


#### Split Data

In [19]:
# split data into sets in appropriate proprotions, i.e. weights = [6, 2, 2]
data_proportions = [9,1]
training_RDD, test_RDD = clean_data_rdd.randomSplit(data_proportions, seed=0)

In [20]:
# format data so that the key is (user_id, wine_feat)
test_for_predict_RDD = test_RDD.map(lambda x: (x[0], x[1]))

#### Get Predicted Ratings
    Get Predictions for a subset of users 

In [23]:
def get_predicted_ratings(training_RDD, test_for_predict_RDD, test_RDD):
    seed = 5L
    iterations = 10
    regularization_parameter = 0.1
    rank = 12

    model = ALS.train(training_RDD, 
                      rank = rank, 
                      seed=seed, 
                      iterations=iterations,
                      lambda_=regularization_parameter,
                      nonnegative=True)

    # (r[0], r[1]), r[2]) --> user_id, wine_id, rating 
    predictions = model.predictAll(test_for_predict_RDD).map(lambda r: ((r[0], r[1]), r[2]))

    # combine predictions and validation sets
    rates_and_preds = test_RDD.map(lambda r: ((int(r[0]), int(r[1])), float(r[2])))\
                                    .join(predictions)
    # get RMSE for each rank
    error = math.sqrt(rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean())
    
    return model, predictions, error

In [24]:
start = time()
model, predictions, error = get_predicted_ratings(training_RDD, test_for_predict_RDD, test_RDD)
end = time()
print "Time Elapsed = {:.3} mins".format((end - start)/60)

Time Elapsed = 1.42 mins


#### Get user hash tags and wine features 
    Keep productKey for comparison of user's actual wine ratings

In [100]:
def get_userTags_and_wineFeatures(predictions, products_rdd, cust_tag_bridge_rdd):
    # format --> (wine_id, (user_tag, rating)  )
    
    cust_tag_bridge_rdd = cust_tag_bridge_rdd.map(lambda row: (row[1], row[0]))
    
    wineID_userTag_ratings_rdd = \
    predictions.map(lambda row: (row[0][0], (row[0][1], row[1]) ) )\
               .join(cust_tag_bridge_rdd)\
               .map(lambda row: (row[1][0][0],  (row[1][1], row[1][0][1])  ) )

    # format --> (wine_id, appellation, varietal, vineyard, user_tag, rating)
    complete_prediction_data =\
    wineID_userTag_ratings_rdd.join(products_rdd)\
                              .map(lambda row: (row[0], 
                                                row[1][1][1], 
                                                row[1][1][2], 
                                                row[1][1][3], 
                                                row[1][0][0], 
                                                row[1][0][1]  ) ) 
    return complete_prediction_data

In [101]:
complete_prediction_data =  get_userTags_and_wineFeatures(predictions, products_rdd, cust_tag_bridge_rdd)

In [114]:
# format --> (productKey, appellation, varietal, vineyard, user_tag, rating)
complete_prediction_data.take(3)

[(117,
  'Napa Valley',
  'Chardonnay',
  'Merryvale',
  '36A105FB2E9D336055D713C48D9833BA',
  3.657579313539558),
 (117,
  'Napa Valley',
  'Chardonnay',
  'Merryvale',
  'EC93E61EEDB7B34D2B811A19B70DACF2',
  3.918407184735406),
 (117,
  'Napa Valley',
  'Chardonnay',
  'Merryvale',
  'AD097C5FCE9B8398B854599EBF951395',
  3.7539968825897954)]

### Compare Predicted Ratings with True Ratings
    This can only be done for wines that the users have purchased

In [50]:
rating_df, products_df  = load_data()

In [57]:
new_df = rating_df.merge(products_df, how="left", on="ProductKey")

In [113]:
new_df.head()

Unnamed: 0,CustomerHash,ProductKey,Ratings,ProductId,Appellation,Varietal,Vineyard
0,D3BE5BC2AA8796FCD402023CD6E5A6AC,95350,5,97912.0,Russian River,Pinot Noir,DeLoach
1,64F6A2CEC420680A259172BF4CF17CE2,95375,4,107505.0,Russian River,Pinot Noir,Dutton Goldfield
2,6131DC57D6050C3EB674DD8722C83B2B,79964,5,,,,
3,525CD8F0BF94E814F16F4FA947B995B6,102020,5,121010.0,Central Coast,Cabernet Sauvignon,Estancia
4,710DBFAAB43DF7B5E304ED0D26B7966A,129946,3,124678.0,Other Italian,Pinot Gris/Grigio,Stella


In [140]:
test_user = "AD097C5FCE9B8398B854599EBF951395"
test_user_df = new_df[new_df.CustomerHash.isin([test_user])]

In [141]:
product_keys  = test_user_df.ProductKey.values.tolist()

In [142]:
test_user_pred_ratings_rdd = \
complete_prediction_data.filter(lambda row: row[4] ==  test_user)\
                        .filter(lambda row: row[0] in product_keys)

In [143]:
pred_product_ids = test_user_pred_ratings_rdd.distinct().map(lambda row: row[0]).take(5)

The pandas data frame contains data on purchases that have been made by the test user. The print out from the 
test_user_pred_ratings_rdd are the predicted ratings for the same wines that the user has purchased. We can see
that the predicted ratings are very close to the user's actual ratings. 

In [162]:
new_df[new_df.CustomerHash == test_user].sort_values(by="ProductKey").head()

Unnamed: 0,CustomerHash,ProductKey,Ratings,ProductId,Appellation,Varietal,Vineyard
834001,AD097C5FCE9B8398B854599EBF951395,117,4,103977,Napa Valley,Chardonnay,Merryvale
68115,AD097C5FCE9B8398B854599EBF951395,166,4,92313,Other Australia,Other Red Blends,Marquis Philips
958850,AD097C5FCE9B8398B854599EBF951395,166,4,92313,Other Australia,Other Red Blends,Marquis Philips
219889,AD097C5FCE9B8398B854599EBF951395,215,2,93707,Chile,Other Red Blends,Primus
1156324,AD097C5FCE9B8398B854599EBF951395,1087,5,91207,Central Coast,Chardonnay,Bernardus


In [177]:
# format --> (ProductKey, Varietal, Rating)
top_k = 10
test_user_pred_ratings_rdd.map(lambda row: (row[0], (row[2], row[5]))).distinct().takeOrdered(top_k, key=lambda row: row[0])

[(117, ('Chardonnay', 3.7539968825897954)),
 (117, ('Chardonnay', 3.7705673216345224)),
 (166, ('Other Red Blends', 3.834389607136281)),
 (166, ('Other Red Blends', 3.8308637324094796)),
 (166, ('Other Red Blends', 3.844799951590561)),
 (166, ('Other Red Blends', 3.84687994174966)),
 (215, ('Other Red Blends', 1.8613410652567588)),
 (215, ('Other Red Blends', 1.8570075323548823)),
 (215, ('Other Red Blends', 1.8590335103351197)),
 (215, ('Other Red Blends', 1.8543899555036105))]