# 02807 Final project: Recommendation system
A recommendation system for products in the __Digital Music__ category on __Amazon__. Products are suggested based on a short description entered by the user.
[**Data source**](https://cseweb.ucsd.edu/~jmcauley/datasets/amazon_v2/)

## Data processing

In [None]:
# Imports
import json
import gzip
import spacy
import warnings
import os
import pandas as pd
import numpy as np
import torch
from collections import Counter, defaultdict
from lxml import html, etree
from nrclex import NRCLex
from transformers import AutoTokenizer, AutoModelWithLMHead

In [None]:
# Download dataset if it is not downloaded yet
if not os.path.exists('Dataset/meta_Digital_Music.json.gz'):
    !wget https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_v2/metaFiles2/meta_Digital_Music.json.gz -P ./Dataset
else:
    print('Dataset already downloaded.')

__Data format__
   * `asin`: ID of the product, e.g. 0000031852
   * `title`: name of the product
   * `feature`: bullet-point format features of the product
   * `description`: description of the product
   * `price`: price in US dollars (at time of crawl)
   * `imageURL`: url of the product image
   * `imageURLHighRes`: url of the high resolution product image
   * `related`: related products (also bought, also viewed, bought together, buy after viewing)
   * `salesRank`: sales rank information
   * `brand`: brand name
   * `categories`: list of categories the product belongs to
   * `tech1`: the first technical detail table of the product
   * `tech2`: the second technical detail table of the product
   * `similar`: similar product table

_Note that several attributes are usually left blank for each product (which attributes are missing differs from product to product)._


In [None]:
### Load the meta data
# Every line of the gzip archive is parsed as one JSON object describing a
# single product; the available fields are listed in the "Data format" section.
# NOTE(review): the field list that used to sit here ("overall", "reviewerID",
# "reviewText", "unixReviewTime", ...) matches review records, not the product
# metadata file that is loaded below.
with gzip.open('Dataset/meta_Digital_Music.json.gz') as f:
    data = [json.loads(raw_line.strip()) for raw_line in f]

# One list entry per product, so the length is the total number of products
print("Total number of items in the dataset: ", len(data))

In [None]:
# Convert the list of product records into a pandas dataframe (one row each)
df = pd.DataFrame(data)


def _join_description(parts):
    """Collapse the fragments of one product description into a single string.

    Args:
        parts: the raw ``description`` value of a record. Normally a list of
            strings, but a record without the field shows up as NaN.

    Returns:
        str: the fragments joined with ". "; a plain string is returned as-is
        (joining it would interleave ". " between its characters) and any
        other value, such as NaN, yields ''.
    """
    if isinstance(parts, (list, tuple)):
        return ". ".join(parts)
    if isinstance(parts, str):
        return parts
    return ""


# Change list to strings
df['description'] = df['description'].apply(_join_description)

# Many descriptions (and other features) contain HTML markup.
# strip_html turns such a string into plain text that is easier to analyse.
def strip_html(s):
    """Return the text content of an HTML fragment.

    Args:
        s: the string to convert; may be empty, whitespace-only or None.

    Returns:
        str: the text content of the parsed fragment, or '' when ``s`` is
        falsy, consists only of whitespace, or cannot be parsed.
    """
    if s and not s.isspace():
        try:
            parsed = html.fromstring(s)
            return str(parsed.text_content())
        except etree.ParserError:
            # Cause not fully understood; it seems to happen on some
            # effectively empty description strings, so fall back to ''.
            return ''
    return ''

# Replace every description with its plain-text rendering
df['description'] = df['description'].apply(strip_html)

# Filter out descriptions shorter than 100 chars.
# .copy() turns the filtered result into an independent frame, so the in-place
# cleanup and the column assignments further down do not operate on a slice of
# the unfiltered frame (which pandas reports with SettingWithCopyWarning).
long_enough = df['description'].map(len) >= 100
df = df[long_enough].copy()

df.head()

In [None]:
# Report how many products passed the length filter and preview three of them
print("Total number of products after filtering out: ", len(df))
print("First three product description")
for position in range(3):
    product = df.iloc[position]
    print()
    print(product.title)
    print(product.description)

In [None]:
# Remove empty columns: treat empty strings as missing values, then drop every
# column that holds no value at all. The result is rebound instead of using
# inplace=True, because df comes from a boolean filter and mutating such a
# frame in place is what pandas reports with SettingWithCopyWarning.
df = df.replace("", np.nan).dropna(how='all', axis=1)

# Display final cleaned up pandas dataframe
df.head()

## Adding emotion characteristics of the description

In [None]:
# Applying NRCLex emotions
def _nrc_emotion_scores(text):
    """Return the raw NRCLex emotion scores computed for ``text``."""
    return NRCLex(text).raw_emotion_scores


df['emotion_nrc'] = df['description'].apply(_nrc_emotion_scores)

In [None]:
# Load the Spacy affect model; warnings are silenced while loading because the
# model triggers a warning about an old version of spacy
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    nlp_affect = spacy.load('Spacy-Affect-Model/affect_ner')


def _spacy_emotion_counts(text):
    """Count the lower-cased labels of the entities the affect model finds in ``text``."""
    labels = (entity.label_.lower() for entity in nlp_affect(text).ents)
    return Counter(labels)


# Applying Spacy affect model emotions
df['emotion_spacy'] = df['description'].apply(_spacy_emotion_counts)

In [None]:
# Transformer method for emotion recognition
EMOTION_MODEL_NAME = "mrm8488/t5-base-finetuned-emotion"

# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = AutoTokenizer.from_pretrained(EMOTION_MODEL_NAME)
# NOTE(review): AutoModelWithLMHead is reported as deprecated by recent
# transformers releases; AutoModelForSeq2SeqLM may be the intended class for
# this T5 checkpoint - confirm against the installed version before changing.
model = AutoModelWithLMHead.from_pretrained(EMOTION_MODEL_NAME)
model.to(device)

def get_emotion(text):
    """Generate an emotion label for ``text`` with the loaded transformer model.

    Args:
        text (str): text to classify; the literal '</s>' suffix is appended
            before the text is encoded.

    Returns:
        str: the decoded first generated sequence, exactly as the tokenizer
        returns it (nothing is stripped here).
    """
    encoded = tokenizer.encode(text + '</s>', return_tensors='pt')
    generated = model.generate(input_ids=encoded.to(device), max_length=2)
    # A single text is passed in, so only the first generated sequence matters
    return tokenizer.decode(generated[0])

In [None]:
# Applying the transformer method on our dataset.
# Only the first 1000 descriptions are labelled; the other rows receive NaN in
# the new column.
def _transformer_label(text):
    """Return the transformer label for ``text`` without its first six characters."""
    # The slice drops a fixed six-character prefix of the decoded output
    # (presumably the '<pad> ' token text - TODO confirm).
    return get_emotion(text)[6:]


df['emotion_transformer'] = df['description'][:1000].apply(_transformer_label)

In [None]:
# Extracting most significant emotion of a particular description
def get_most_significant_emotion(emotions):
    """Return the emotion with the highest score.

    Args:
        emotions: mapping from emotion name to score (e.g. a dict or Counter).

    Returns:
        The key with the largest value (the first such key on ties), or
        ``None`` when the mapping is empty.
    """
    return max(emotions, key=emotions.get, default=None)

# Store the dominant emotion of each description for both lexicon-based methods
df['most_significant_emotion_nrc'] = df['emotion_nrc'].apply(get_most_significant_emotion)
df['most_significant_emotion_spacy'] = df['emotion_spacy'].apply(get_most_significant_emotion)

df.head()

## Similar items

In [None]:
# Keep a handle on the description column for the shingling cells below
dfdescription = df['description']

# Collect the distinct descriptions seen for every product ID (asin) and print
# each row whose description was already recorded for the same asin
descr = defaultdict(list)

for idx, asin, description in zip(df.index, df['asin'], df['description']):
    known = descr[asin]
    if description not in known:
        known.append(description)
    else:
        print(idx, asin, description)

In [None]:
# Show the asin -> list-of-descriptions mapping as the cell output
descr

In [None]:
# Sanity check: print the type of the container stored for every asin
for value in descr.values():
    print(type(value))

In [None]:
# Show the mapping via display() (not imported in this file; presumably the
# IPython/Jupyter built-in)
display(descr)

In [None]:
def shingle(aString, q, delimiter=' '):
    """
    Split a string into its unique q-shingles (runs of q consecutive tokens).

    Input:
        - aString (str): string to split into shingles
        - q (int): number of consecutive tokens that form one shingle
        - delimiter (str): string of the delimiter to consider to split the input string (default: space);
          pass '' to build the shingles from single characters
    Return: list of unique shingles, ordered by their first occurrence in aString
            (empty when aString holds fewer than q tokens)
    """
    # An empty delimiter means character-level shingles: the string itself is
    # then used as the token sequence.
    tokens = aString.split(delimiter) if delimiter != '' else aString
    candidates = (
        delimiter.join(tokens[start:start + q])
        for start in range(len(tokens) - q + 1)
    )
    # dict.fromkeys removes duplicates while keeping insertion order, so the
    # result is reproducible between runs; list(set(...)) returns the shingles
    # in an order that changes with string hash randomisation.
    return list(dict.fromkeys(candidates))

In [None]:
# Worked example: shingle the first product description with q = 2
# (alternative toy input: "Latin rhythms that will get your kids singing in Spanish. ")
ex_string = dfdescription.iloc[0]
q = 2
ex_shingles = shingle(ex_string, q)
print('\nInitial string:', ex_string)
print(f'>> Shingles with q = {q} :',ex_shingles)

In [None]:
# Drop repeated descriptions and print the count before and after.
# The result is rebound instead of using inplace=True: dfdescription was taken
# from a DataFrame column, and mutating such a Series in place can emit
# SettingWithCopyWarning, with an effect on df that depends on the pandas
# version. Rebinding leaves df untouched in every case.
print(len(dfdescription))
dfdescription = dfdescription.drop_duplicates()
print(len(dfdescription))

In [None]:
# Preview the first rows of df as the cell output
df.head()

In [None]:
# Left-join df with its own ['asin', 'description'] columns on 'asin'.
# NOTE(review): this comment used to describe merging the descriptions into
# the reviews data, but no reviews frame is involved here - this is a
# self-merge, so 'description' exists on both sides and pandas will suffix the
# two copies (default '_x' / '_y'). Confirm which frame was meant to be on the
# left-hand side.

merged_df = df.merge(df[['asin', 'description']], on='asin', how='left')

In [None]:
# Inspect the rows at positions 15 to 199 of the merged frame
merged_df.iloc[15:200]