In [108]:
import pyspark as ps
from pyspark.mllib.recommendation import ALS
from pyspark.sql.types import StructField, StructType, IntegerType
import psycopg2
import os
import pandas as pd
import numpy as np
import datetime

In [3]:
# database connection settings are read from the environment so that no
# credentials live in the notebook; a missing variable raises KeyError
dbname, host, username, password = (
    os.environ[var_name]
    for var_name in ('CAPSTONE_DB_DBNAME',
                     'CAPSTONE_DB_HOST',
                     'CAPSTONE_DB_USERNAME',
                     'CAPSTONE_DB_PASSWORD')
)

In [4]:
# open the database connection and a cursor for the queries below
# keyword arguments are used instead of a hand-formatted DSN string: psycopg2
# escapes each value itself, so a password (or any other setting) containing
# spaces, quotes or backslashes can no longer corrupt the connection string
conn = psycopg2.connect(dbname=dbname, host=host, user=username, password=password)
cursor = conn.cursor()

In [29]:
# select the (deck_id, cardstorm_id, card_count) columns of every row in decks;
# the result stays on the cursor until a later cell calls fetchall()
decks_query = '''SELECT deck_id, cardstorm_id, card_count
                  FROM decks'''
cursor.execute(decks_query)

In [30]:
# schema shared by every (deck_id, cardstorm_id, card_count) spark df
decks_schema = StructType([StructField('deck_id', IntegerType()),
                           StructField('cardstorm_id', IntegerType()),
                           StructField('card_count', IntegerType())])

# `spark` is only predefined when the kernel is started through the pyspark
# launcher; getOrCreate() reuses that session when it exists and builds one
# otherwise, so the notebook also runs on a plain fresh kernel
spark = ps.sql.SparkSession.builder.getOrCreate()

# the rows come from the SELECT on decks executed on `cursor` in the previous
# cell; once fetched they are gone, so re-running this cell on its own would
# silently build an empty df -- fail loudly instead
deck_rows = cursor.fetchall()
if not deck_rows:
    raise ValueError('no deck rows on the cursor - re-run the SELECT on decks first')

# create spark df for decks
decks_df = spark.createDataFrame(data=deck_rows, schema=decks_schema)

In [31]:
# look up every card id that does not occur in the decks table
# NOTE(review): NOT IN matches nothing at all if decks.cardstorm_id ever holds
# a NULL -- confirm that column is declared NOT NULL
unused_query = '''SELECT cardstorm_id
                  FROM cards
                  WHERE cardstorm_id NOT IN (SELECT DISTINCT cardstorm_id FROM decks)'''
cursor.execute(unused_query)

# every fetched row is a 1-tuple; unpack it to keep only the bare id
unused_ids = [cardstorm_id for (cardstorm_id,) in cursor.fetchall()]

In [32]:
# build one placeholder (deck_id, cardstorm_id, card_count) row per unused card
# the sentinel deck_id of -1 makes the fake rows easy to identify later
unused_cards = [(-1, unused_id, 1) for unused_id in unused_ids]

In [33]:
# make a new spark df from the placeholder rows for unused cards; it reuses
# decks_schema so that its columns line up with those of decks_df
unused_df = spark.createDataFrame(data=unused_cards, schema=decks_schema)

In [34]:
# merge both dataframes into one: the real deck rows followed by the
# placeholder rows (union() pairs columns by position; both dfs were built
# with decks_schema, so the columns agree)
complete_decks_df = decks_df.union(unused_df)

In [37]:
# create and train the ALS model
# using implicit train: deck_id is the user, cardstorm_id the product and
# card_count the implicit rating (column order of complete_decks_df)
# rank is 10 for now, needs to be tuned later
# a fixed seed makes the random initialisation repeatable, so re-running the
# notebook does not produce a different set of factors every time
model = ALS.trainImplicit(ratings=complete_decks_df, rank=10, seed=42)

In [38]:
# get the product features matrix out of the model (V)
# the result is a spark RDD; the cells below convert it to pandas and treat
# its first field as the cardstorm_id and its second as the feature values
product_features = model.productFeatures()

In [76]:
# two-step conversion: spark RDD -> spark DF -> pandas DF
# (the id column of the spark DF comes out named '_1')
product_features_sdf = product_features.toDF()
pd_product_features = product_features_sdf.toPandas()

In [77]:
# re-key the features by card in a single chain:
#   1. column '_1' becomes 'cardstorm_id'
#   2. that column is promoted to the index (duplicate ids raise an error)
#   3. the rows are ordered by cardstorm_id
pd_product_features = (
    pd_product_features
    .rename(columns={'_1': 'cardstorm_id'})
    .set_index(keys='cardstorm_id', verify_integrity=True)
    .sort_index()
)

In [104]:
# each row of the frame has a single cell that holds the whole feature list
# of one card; unpack those cells and stack them into one numpy array
# (rows stay in the cardstorm_id order of the sorted index)
product_matrix = np.array([row[0] for row in pd_product_features.values])

In [107]:
# inspect the matrix (numpy abbreviates large arrays with "...")
# NOTE(review): the many identical rows are presumably the unused cards, which
# only ever appear in the fake deck -1 -- TODO confirm
product_matrix

array([[  7.25691734e-07,  -1.56810677e-06,   2.00104250e-06, ...,
          1.34571025e-07,  -1.13234114e-06,   9.34421109e-08],
       [  7.25691734e-07,  -1.56810677e-06,   2.00104250e-06, ...,
          1.34571025e-07,  -1.13234114e-06,   9.34421109e-08],
       [  7.25691734e-07,  -1.56810677e-06,   2.00104250e-06, ...,
          1.34571025e-07,  -1.13234114e-06,   9.34421109e-08],
       ..., 
       [  7.25691734e-07,  -1.56810677e-06,   2.00104250e-06, ...,
          1.34571025e-07,  -1.13234114e-06,   9.34421109e-08],
       [  7.25691734e-07,  -1.56810677e-06,   2.00104250e-06, ...,
          1.34571025e-07,  -1.13234114e-06,   9.34421109e-08],
       [ -1.46551538e-05,  -2.49797507e-04,   5.84725931e-04, ...,
         -2.21382070e-04,   1.13817063e-04,   3.42667074e-04]])

In [None]:
# DDL for the table that stores one row of latent features per card.
# The statement used to stop mid-column ("feature_") with an open parenthesis;
# it is completed here with one `feature_N real` column per latent factor, so
# n_features must equal the ALS rank used for training (10).
# NOTE(review): this cell only builds the statement; it still has to be run
# with cursor.execute(query) and committed.
n_features = 10
feature_columns = ',\n           '.join(
    'feature_{} real'.format(i) for i in range(1, n_features + 1))
query = '''CREATE TABLE product_matrices
          (cardstorm_id int REFERENCES cards(cardstorm_id),
           {})'''.format(feature_columns)

In [None]:
# NOTE(review): this cell is unfinished -- in the lines shown it only builds
# strings and never executes anything against the database.
#  * enumerate() yields (position, features) pairs, so `row` is a 2-tuple and
#    len(row) is always 2 rather than the number of features
#  * the position (counted from 1) is presumably meant to be the cardstorm_id;
#    that only holds if the ids are exactly 1..N -- TODO confirm, otherwise
#    take the ids from the index of pd_product_features
#  * the INSERT statement has an incomplete column list and no VALUES clause,
#    and `template` is never interpolated into it
for row in enumerate(product_matrix, start=1):
    template = ', '.join(['%s'] * len(row))
    query = '''INSERT INTO product_matrices (cardstorm_id, )'''