In [1]:
#Import needed Packages from python : 
    
import numpy as np
import pandas as pd
from pyspark.mllib.util import MLUtils
from pyspark.ml.linalg import Vectors
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.mllib.regression import LabeledPoint
from pyspark.context import SparkContext , SparkConf
from pyspark.sql.session import SparkSession
from pyspark.sql import SQLContext


In [2]:
#initialize our spark application 
conf = SparkConf().set("spark.sql.catalogImplementation","hive")
sc = SparkContext('local', 'test', conf=conf)
spark = SparkSession(sc)
sqlContext = SQLContext(sc)


In [4]:
#Import data : 
aisles = spark.read.csv("aisles.csv", header=True, inferSchema=True)
departments = spark.read.csv("departments.csv", header=True, inferSchema=True)
order_products_prior = spark.read.csv("order_products__prior.csv", header=True, inferSchema=True)
order_products_train = spark.read.csv("order_products__train.csv", header=True, inferSchema=True)
orders = spark.read.csv("orders.csv", header=True, inferSchema=True)
products = spark.read.csv("products.csv", header=True, inferSchema=True)

aisles.createOrReplaceTempView("aisles")
departments.createOrReplaceTempView("departments")
order_products_prior.createOrReplaceTempView("order_products_prior")
order_products_train.createOrReplaceTempView("order_products_train")
orders.createOrReplaceTempView("orders")
products.createOrReplaceTempView("products")


In [41]:
#Create SQL table joining all data :
sqlContext.sql('''
create table master_table as
(select op.*,p.product_name,p.aisle_id,p.department_id,d.department from
 (select * from order_products_train 
 union
 select * from order_products_prior) as op
 inner join products as p
 on op.product_id = p.product_id
 inner join departments as d
 on p.department_id = d.department_id)''')

done


In [5]:
# Organize the data by shopping basket
from pyspark.sql.functions import collect_set, col, count
rawData = spark.sql("select p.product_name, o.order_id from products p inner join order_products_train o where o.product_id = p.product_id")
baskets = rawData.groupBy('order_id').agg(collect_set('product_name').alias('items'))
baskets.createOrReplaceTempView('baskets')

In [8]:
#Train the model on the data 
from pyspark.ml.fpm import FPGrowth
fpGrowth = FPGrowth(itemsCol="items", minSupport=0.001, minConfidence=0)
model = fpGrowth.fit(baskets)


In [9]:
#Display frequent itemsets.
model.freqItemsets.show()



+--------------------+-----+
|               items| freq|
+--------------------+-----+
|[Organic Tomato B...|  772|
|[Organic Tomato B...|  175|
|[Organic Tomato B...|  144|
|[Organic Tomato B...|  179|
|[Organic Spinach ...|  475|
|[Whole Milk Ricot...|  347|
| [Medium Salsa Roja]|  275|
|    [Ground Buffalo]|  231|
|       [Tonic Water]|  194|
|[Original Coconut...|  173|
|[Low-Fat Strawber...|  152|
|[Organic SprouTof...|  137|
|            [Banana]|18726|
|[Fruit Punch Spor...|  275|
|[Kitchen Cheese E...|  230|
|[Country White Br...|  194|
|[Soft & Smooth Wh...|  173|
|[Natural Liquid L...|  152|
|[Bag of Organic B...|15480|
|[Organic Large Gr...|  769|
+--------------------+-----+
only showing top 20 rows



In [10]:
#Get frequent itemsets.
mostfreq = model.freqItemsets.rdd.map(lambda r : r["items"]).take(200)



In [11]:
#Merge frequent itemsets 

flat_list = [item for sublist in mostfreq for item in sublist]
res = []
for i in flat_list: 
    if i not in res: 
        res.append(i)
df = pd.DataFrame(res,columns=['product_name'])
df.to_csv("perfectproducts.csv")        


In [12]:
#Display generated association rules.
model.associationRules.show()

+--------------------+--------------------+-------------------+------------------+
|          antecedent|          consequent|         confidence|              lift|
+--------------------+--------------------+-------------------+------------------+
|[Broccoli Crown, ...|            [Banana]| 0.3690773067331671|2.5860442347085395|
|   [Sugar Snap Peas]|[Bag of Organic B...| 0.2207001522070015|1.8706619038067482|
|[Organic Red Onio...|[Bag of Organic B...|0.34673366834170855|2.9389262202485296|
|[Organic Red Onio...|[Organic Baby Spi...|0.22780569514237856|3.0550038280801664|
|[85% Lean Ground ...|            [Banana]|  0.235202492211838|1.6480125921511828|
|[Organic Small Bu...|[Bag of Organic B...|0.36633663366336633| 3.105081612812444|
|[Organic Zucchini...|       [Large Lemon]|0.34615384615384615| 5.583097725875844|
|[Organic Zucchini...|[Organic Baby Spi...|0.29554655870445345|3.9634473038688296|
|[Organic Zucchini...|             [Limes]|0.15368421052631578| 3.342408682073159|
|[Or

In [13]:
#Run an example
query = spark.createDataFrame([
    (1,  ['Organic Tomato Basil Pasta Sauce', 'Medium Navel Orange'])], ["order_id", "items"])
# transform examines the input items against all the association rules and summarize the consequents as prediction
model.transform(query).show()

+--------+--------------------+--------------------+
|order_id|               items|          prediction|
+--------+--------------------+--------------------+
|       1|[Organic Tomato B...|[Bag of Organic B...|
+--------+--------------------+--------------------+



In [14]:
#Test the Example
model.transform(query).rdd.map(lambda r : r["prediction"]).take(1)[0]


['Bag of Organic Bananas',
 'Organic Baby Spinach',
 'Banana',
 'Organic Strawberries',
 'Large Lemon']

In [None]:
#Initizalize Web Application
from flask import Flask, make_response, render_template , jsonify , request  , url_for
import random 
app = Flask(__name__)
order = []
near = dict()
#Recommendation API
@app.route('/market', methods=['POST','GET'])
def product():
    order = []
    related = []
    if request.method == "POST":
        name = request.form.get('name')
        order.append(name)
        orders = [(random.randint(1,100),order)]
        query = spark.createDataFrame(orders, ["order_id", "items"])
        related = model.transform(query).rdd.map(lambda r : r["prediction"]).take(1)[0]
        print(related)
    df = pd.read_csv("goodproducts.csv", usecols=["product_name","link"])
    df = df.dropna()    
    keys = df["product_name"]
    values = df["link"]
    dictAll = dict(zip(keys, values))
    a_subset = {elt: dictAll[elt] for elt in related if elt in dictAll.keys()}
    print(len(a_subset))
    return render_template('index.html', near = a_subset , dictAll = dictAll, order = order)


#ElasticSearchAPI : 
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search
es = Elasticsearch()
@app.route('/search')
def search():
    tag = request.args.get('tag')
    df = pd.read_csv("goodproducts.csv", usecols=["product_name","link"])
    allnames = df["product_name"]
    allnames = [i for i in allnames if tag in i] 
    print(allnames)
    dictAll = { "" : "" }
    for name in allnames:
        logs_index = "bigdata"
        s = Search(using= es, index=logs_index).query("match", tags=name)
        res = s[0:1].execute()
        for hit in res['hits']['hits']:
            one = hit["_source"]
            link = "http://farm"+one['flickr_farm']+".staticflickr.com/"+one['flickr_server']+"/"+one['id']+"_"+one['flickr_secret']+".jpg"
            dictAll[name] = link
        dictAll.pop("", None)
        #print(dictAll)
    return render_template('index.html', dictAll = dictAll, near= dict(),order = order)


if __name__ == '__main__':
    app.run()

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: off


 * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)
127.0.0.1 - - [27/Jan/2021 14:06:13] "[37mGET /search?tag=Tomato HTTP/1.1[0m" 200 -


['Organic Tomato Basil Pasta Sauce', 'Organic Tomato Cluster', 'Roma Tomato', 'Red Vine Tomato', 'Organic Diced Tomatoes', 'Organic Grape Tomatoes', 'Organic Roma Tomato']


127.0.0.1 - - [27/Jan/2021 14:06:19] "[37mGET /search?tag=Banana HTTP/1.1[0m" 200 -


['Organic Banana', 'Bag of Organic Bananas', 'Banana']


127.0.0.1 - - [27/Jan/2021 14:06:28] "[37mGET /search?tag=Orange HTTP/1.1[0m" 200 -


['Medium Navel Orange', 'Organic Navel Orange', 'Orange Bell Pepper']
['Organic Tomato Basil Pasta Sauce', 'Organic Reduced Fat 2% Milk', 'Organic Basil', 'Organic Cilantro', 'Organic Roasted Turkey Breast', 'Organic White Onions', 'Organic Carrot Bunch', 'Organic Bell Pepper', 'Organic Banana', 'Organic Tomato Cluster', 'Organic Hass Avocado', 'Organic Large Extra Fancy Fuji Apple', 'Organic Whole String Cheese', 'Organic Gala Apples', 'Bag of Organic Bananas', 'Organic Peeled Whole Baby Carrots', 'Organic Broccoli Florets', 'Organic Kiwi', 'Organic Whole Milk', 'Organic Strawberries', 'Organic Baby Spinach', 'Michigan Organic Kale', 'Organic Italian Parsley Bunch', 'Organic Avocado', 'Organic Yellow Onion', 'Organic Romaine', 'Organic Whole Strawberries', 'Organic Fat Free Milk', 'Organic Extra Firm Tofu', 'Organic Garbanzo Beans', 'Organic Broccoli Crowns', 'Organic Sour Cream', 'Organic Garnet Sweet Potato (Yam)', 'Organic Granny Smith Apple', 'Organic Zucchini', 'Organic Cucumber'

127.0.0.1 - - [27/Jan/2021 14:06:50] "[37mGET /search?tag=Organic HTTP/1.1[0m" 200 -


['Organic Tomato Basil Pasta Sauce', 'Organic Reduced Fat 2% Milk', 'Organic Basil', 'Organic Cilantro', 'Organic Roasted Turkey Breast', 'Organic White Onions', 'Organic Carrot Bunch', 'Organic Bell Pepper', 'Organic Banana', 'Organic Tomato Cluster', 'Organic Hass Avocado', 'Organic Large Extra Fancy Fuji Apple', 'Organic Whole String Cheese', 'Organic Gala Apples', 'Bag of Organic Bananas', 'Organic Peeled Whole Baby Carrots', 'Organic Broccoli Florets', 'Organic Kiwi', 'Organic Whole Milk', 'Organic Strawberries', 'Organic Baby Spinach', 'Michigan Organic Kale', 'Organic Italian Parsley Bunch', 'Organic Avocado', 'Organic Yellow Onion', 'Organic Romaine', 'Organic Whole Strawberries', 'Organic Fat Free Milk', 'Organic Extra Firm Tofu', 'Organic Garbanzo Beans', 'Organic Broccoli Crowns', 'Organic Sour Cream', 'Organic Garnet Sweet Potato (Yam)', 'Organic Granny Smith Apple', 'Organic Zucchini', 'Organic Cucumber', 'Organic Lacinato (Dinosaur) Kale', 'Organic Spring Mix', 'Organic B

127.0.0.1 - - [27/Jan/2021 14:07:07] "[37mGET /search?tag=Organic HTTP/1.1[0m" 200 -
127.0.0.1 - - [27/Jan/2021 14:18:36] "[37mGET /market HTTP/1.1[0m" 200 -


0


127.0.0.1 - - [27/Jan/2021 14:18:50] "[37mPOST /market HTTP/1.1[0m" 200 -


['Organic Cilantro', 'Limes', 'Asparagus', 'Organic Whole Milk', 'Organic Strawberries', 'Organic Avocado', 'Organic Yellow Onion', 'Organic Zucchini', 'Large Lemon', 'Seedless Red Grapes', 'Strawberries', 'Organic Blueberries', 'Bag of Organic Bananas', 'Organic Red Onion', 'Organic Grape Tomatoes', 'Organic Raspberries', 'Organic Cucumber', 'Organic Baby Spinach', 'Organic Baby Carrots', 'Organic Garlic', 'Original Hummus', 'Organic Lemon', 'Organic Hass Avocado', 'Banana', 'Organic Small Bunch Celery', 'Organic Garnet Sweet Potato (Yam)', 'Organic Tomato Cluster', 'Organic Peeled Whole Baby Carrots', 'Organic Red Bell Pepper', 'Organic Granny Smith Apple', 'Organic Whole String Cheese', 'Organic Navel Orange', "Organic D'Anjou Pears", 'Organic Kiwi']
34


127.0.0.1 - - [27/Jan/2021 14:18:51] "[37mGET /market HTTP/1.1[0m" 200 -


0


127.0.0.1 - - [27/Jan/2021 14:18:54] "[37mGET /market HTTP/1.1[0m" 200 -


0


127.0.0.1 - - [27/Jan/2021 14:19:01] "[37mPOST /market HTTP/1.1[0m" 200 -


['Limes', 'Asparagus', 'Organic Whole Milk', 'Yellow Onions', 'Organic Strawberries', 'Organic Avocado', 'Organic Yellow Onion', 'Large Lemon', 'Organic Zucchini', 'Honeycrisp Apple', 'Strawberries', 'Seedless Red Grapes', 'Bag of Organic Bananas', 'Organic Blueberries', 'Organic Red Onion', 'Organic Raspberries', 'Organic Grape Tomatoes', 'Organic Baby Carrots', 'Organic Cucumber', 'Organic Baby Spinach', 'Organic Garlic', 'Organic Hass Avocado', 'Banana', 'Organic Lemon', 'Small Hass Avocado', 'Broccoli Crown', 'Organic Baby Arugula', 'Red Peppers', 'Organic Large Extra Fancy Fuji Apple', 'Organic Gala Apples', 'Fresh Cauliflower', 'Michigan Organic Kale', 'Organic Small Bunch Celery', 'Organic Garnet Sweet Potato (Yam)', 'Organic Tomato Cluster', 'Green Bell Pepper', 'Carrots', 'Organic Peeled Whole Baby Carrots', 'Organic Italian Parsley Bunch', 'Organic Red Bell Pepper', 'Organic Granny Smith Apple', 'Red Vine Tomato', 'Unsweetened Almondmilk', 'Organic Ginger Root', 'Garlic', 'Ja

127.0.0.1 - - [27/Jan/2021 14:19:12] "[37mGET /market HTTP/1.1[0m" 200 -


0
