<div style="text-align: center;">
  <h1>Project Big Data Management</h1>
  <h3>Amazon Recommendation System Using Machine Learning</h3>
  <h3>Nhu Minh Quang Nguyen</h3>
  <h3>Duc Long Nguyen</h3>
</div>


### Import modules

In [1]:
import pandas as pd
import matplotlib.pyplot as plt


dataset = pd.read_csv("amazon-groceries.csv")
  

### Dataset 

In [2]:
dataset.head(10)

Unnamed: 0,Member_number,Date,itemDescription
0,1808,21-07-2015,tropical fruit
1,2552,05-01-2015,whole milk
2,2300,19-09-2015,pip fruit
3,1187,12-12-2015,other vegetables
4,3037,01-02-2015,whole milk
5,4941,14-02-2015,rolls/buns
6,4501,08-05-2015,other vegetables
7,3803,23-12-2015,pot plants
8,2762,20-03-2015,whole milk
9,4119,12-02-2015,tropical fruit


### Dataset overview

In [3]:
dataset.shape

(38765, 3)

In [4]:
dataset.describe()

Unnamed: 0,Member_number
count,38765.0
mean,3003.641868
std,1153.611031
min,1000.0
25%,2002.0
50%,3005.0
75%,4007.0
max,5000.0


### Sort dataset to find find which member buy together

In [19]:
dataset.sort_values(by=["Member_number"]).head(10)

Unnamed: 0,Member_number,Date,itemDescription
1629,1000,27-05-2015,soda
13331,1000,24-06-2014,whole milk
8395,1000,15-03-2015,whole milk
4843,1000,15-03-2015,sausage
17778,1000,27-05-2015,pickled vegetables
2047,1000,24-07-2015,canned beer
24544,1000,15-03-2015,yogurt
18196,1000,24-07-2015,misc. beverages
32851,1000,24-06-2014,salty snack
6388,1000,25-11-2015,sausage


In [5]:
sorted_dataset = dataset.sort_values(by=["Member_number", "Date"])

In [6]:
sorted_dataset.head(10)

Unnamed: 0,Member_number,Date,itemDescription
4843,1000,15-03-2015,sausage
8395,1000,15-03-2015,whole milk
20992,1000,15-03-2015,semi-finished bread
24544,1000,15-03-2015,yogurt
13331,1000,24-06-2014,whole milk
29480,1000,24-06-2014,pastry
32851,1000,24-06-2014,salty snack
2047,1000,24-07-2015,canned beer
18196,1000,24-07-2015,misc. beverages
6388,1000,25-11-2015,sausage


### Group items that bought together by: a member buy multiple products on same date

In [7]:
grouped_data = sorted_dataset.groupby(["Member_number", "Date"])['itemDescription'].apply(list).reset_index()
grouped_data = grouped_data.rename(columns={"itemDescription": "purchase"})


### Now we have a dataset for items that are purchased together

In [22]:
grouped_data.head(11)

Unnamed: 0,Member_number,Date,purchase
0,1000,15-03-2015,"[sausage, whole milk, semi-finished bread, yog..."
1,1000,24-06-2014,"[whole milk, pastry, salty snack]"
2,1000,24-07-2015,"[canned beer, misc. beverages]"
3,1000,25-11-2015,"[sausage, hygiene articles]"
4,1000,27-05-2015,"[soda, pickled vegetables]"
5,1001,02-05-2015,"[frankfurter, curd]"
6,1001,07-02-2014,"[sausage, whole milk, rolls/buns]"
7,1001,12-12-2014,"[whole milk, soda]"
8,1001,14-04-2015,"[beef, white bread]"
9,1001,20-01-2015,"[frankfurter, soda, whipped/sour cream]"


## Get purchase column

In [9]:
purchases = list(grouped_data.purchase)
purchases[-10:-1]

[['rolls/buns', 'curd'],
 ['semi-finished bread', 'newspapers'],
 ['butter milk', 'whipped/sour cream'],
 ['berries', 'onions'],
 ['other vegetables', 'detergent'],
 ['tropical fruit',
  'berries',
  'other vegetables',
  'yogurt',
  'kitchen towels',
  'napkins'],
 ['bottled water', 'herbs'],
 ['fruit/vegetable juice', 'onions'],
 ['soda', 'root vegetables', 'semi-finished bread']]

In [10]:
from collections import defaultdict
from itertools import combinations

class Apriori:

  def __init__(self):
    self.supports = None
    self.pair_supports = None

  def fit(self, item_lists: list[list[str]]):
    single_support = defaultdict(int)
    pair_support = defaultdict(int)
    triple_support = defaultdict(int)
    
    for cart in item_lists:
      # Calculate single item support
      for item in cart:
        single_support[item] += 1
      
      # Calculate pair support
      for pair in combinations(cart, 2):
        pair = tuple(sorted(pair))  # Ensure pairs are in a consistent order
        pair_support[pair] += 1
      
      for triple in combinations(cart, 3):
        try:
          tripple = tuple(sorted(pair))
          triple_support[triple] += 1
        except:
          pass 
        
    
    # Convert defaultdicts to regular dictionaries
    self.supports = dict(single_support)
    self.pair_supports = dict(pair_support)
    self.tripple_supports = dict(triple_support)
  
  def confidence(self, a, b):
    pair  = tuple(sorted([a, b]))
    assert pair in self.pair_supports, "Can't find"
    return self.pair_supports[pair] /self.supports[b]
  
  def lift(self, a, b):
    return self.confidence(a, b) / self.supports[b]
  
  def recommend(self, a:str):
    
    tops = [(0, ""), (0, "")]  # List of tuples (confidence, item)

    a_alike = [] 
    for b in self.supports.keys():
      if b == a:
        continue  # Skip the same item
      if a.lower() in b.lower():
        a_alike.append(b)
      try:
        conf = self.confidence(a, b)
        # Insert in the correct position to keep the list sorted by confidence
        if conf > tops[0][0]:
            tops = [(conf, b), tops[0]]
        elif conf > tops[1][0]:
            tops[1] = (conf, b)
      except:
        pass
    
    if tops[0][0] != 0:
      for conf, item in tops:
        print(f"Item: {item}, Confidence: {conf * 100:.2f}%")
    if tops[0][0] == 0:
      message = ""
      if a_alike:
        message = f"Apri: {a} is not in item list, did you mean: {', '.join(a_alike)}"
      else:
        message = f"Currently we don't have {a}, you can try our other products: whole milk, chocolate, rolls/bun"
      return message
      
    
    return [item for _, item in tops]

In [11]:
apriori = Apriori()

# train
apriori.fit(purchases)


### Save pretrained model for later usage in personal apps

In [12]:
# uncomment the code below to save pretrained model
# import dill as pickle
 
# pickle.dump(apriori, open("models/apriori-beta.pkl", 'wb'))

### Make predictions

In [13]:
apriori.recommend("whole milk")

Item: rubbing alcohol, Confidence: 40.00%
Item: brandy, Confidence: 39.47%


['rubbing alcohol', 'brandy']

In [14]:
apriori.recommend("rubbing alcohol")

Item: sparkling wine, Confidence: 2.17%
Item: butter milk, Confidence: 0.38%


['sparkling wine', 'butter milk']

In [15]:
apriori.recommend("soda")

Item: bags, Confidence: 25.00%
Item: artif. sweetener, Confidence: 24.14%


['bags', 'artif. sweetener']

In [16]:
apriori.recommend("bottled beer")

Item: fish, Confidence: 10.34%
Item: nut snack, Confidence: 9.09%


['fish', 'nut snack']

In [17]:
apriori.recommend("root vegetables")

Item: whisky, Confidence: 25.00%
Item: decalcifier, Confidence: 22.22%


['whisky', 'decalcifier']

In [23]:
apriori.recommend("frozen vegetables")

Item: salad dressing, Confidence: 16.67%
Item: cookware, Confidence: 11.76%


['salad dressing', 'cookware']