# Section 2: Sentence matching

## Approach 1 - sentence level matching
The first approach splits each review into sentence-level data and then performs matching on a 'user -> review sentence' basis. It is hypothesised that, given the complex nature of reviews, this will perform better than a document-level match.

Features:
*   Vector space model - count vectors (1-4gram)
*   Vector space model - tfidf vectors (1-4gram)
*   Topic model - LSA (Latent Semantic Analysis)
*   Topic model - LDA (Latent Dirichlet Allocation)
*   Embeddings - pre-trained
*   Embeddings - corpus-trained

Distance measures:
*   Standard distance measures e.g. cosine etc.
*   Non-standard distance measures e.g. Word Mover's Distance (WMD)


## Standard distance measures

In [0]:
from math import*
from decimal import Decimal
 
class Similarity:
    """Distance and similarity measures over numeric sequences.

    The distance measures and the cosine numerator pair elements with
    ``zip``, so surplus elements of the longer input are ignored when the
    two inputs differ in length.
    """

    def euclidean_distance(self, x, y):
        """Return the Euclidean (L2) distance between ``x`` and ``y``."""
        return sqrt(sum(pow(a - b, 2) for a, b in zip(x, y)))

    def manhattan_distance(self, x, y):
        """Return the Manhattan (L1) distance between ``x`` and ``y``."""
        return sum(abs(a - b) for a, b in zip(x, y))

    def minkowski_distance(self, x, y, p_value):
        """Return the Minkowski distance of order ``p_value``.

        The result is a ``Decimal`` rounded to 3 places (see ``nth_root``).
        """
        total = sum(pow(abs(a - b), p_value) for a, b in zip(x, y))
        return self.nth_root(total, p_value)

    def cosine_similarity(self, x, y):
        """Return the cosine similarity of ``x`` and ``y``, rounded to 3 places.

        The norms are deliberately NOT rounded before dividing: rounding
        them first turns the norm of a small-magnitude vector into 0.0
        (raising ZeroDivisionError) and skews the result for others.
        Returns 0.0 when either vector has zero norm, since the angle is
        undefined in that case.
        """
        numerator = sum(a * b for a, b in zip(x, y))
        norm_x = sqrt(sum(a * a for a in x))
        norm_y = sqrt(sum(b * b for b in y))
        denominator = norm_x * norm_y
        if denominator == 0:
            return 0.0
        return round(numerator / float(denominator), 3)

    def jaccard_similarity(self, x, y):
        """Return |set(x) & set(y)| / |set(x) | set(y)|.

        Raises:
            ZeroDivisionError: if both inputs are empty.
        """
        set_x, set_y = set(x), set(y)
        intersection_cardinality = len(set_x & set_y)
        union_cardinality = len(set_x | set_y)
        return intersection_cardinality / float(union_cardinality)

    def nth_root(self, value, n_root):
        """Return the ``n_root``-th root of ``value`` as a Decimal rounded to 3 places."""
        root_value = 1 / float(n_root)
        return round(Decimal(value) ** Decimal(root_value), 3)

    def square_rooted(self, x):
        """Return the Euclidean norm of ``x`` rounded to 3 places."""
        return round(sqrt(sum(a * a for a in x)), 3)


## Vector space features and standard distance measures

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer
# Shared Similarity instance; its euclidean_distance method is the default
# `dist` argument of distance_vectors below.
measures = Similarity()
# Fetch the NLTK resources used in this section before anything reads them.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords', quiet=True, raise_on_error=True)
# English stop words as a set, used for membership tests in LemmaTokenizer._stem.
stop_words = set(nltk.corpus.stopwords.words('english'))
# The same stop words run through word_tokenize; this list is what gets passed
# to the vectorizer's stop_words argument (presumably so it lines up with the
# tokens LemmaTokenizer emits - confirm).
tokenized_stop_words = nltk.word_tokenize(' '.join(nltk.corpus.stopwords.words('english')))
 
class LemmaTokenizer(object):
    """Callable tokenizer: NLTK word tokenisation followed by Porter stemming.

    Despite the name, tokens are stemmed with ``PorterStemmer`` rather than
    lemmatised. Instances are meant to be passed as a vectorizer's
    ``tokenizer`` argument.
    """

    def __init__(self):
        # word_tokenize needs the 'punkt' resource; fail loudly if it is missing.
        nltk.download('punkt', quiet=True, raise_on_error=True)
        self.stemmer = nltk.stem.PorterStemmer()

    def _stem(self, token):
        """Return ``token`` stemmed, leaving stop words untouched."""
        # Stop words are passed through as-is, which solves the error
        # "UserWarning: Your stop_words may be inconsistent with your preprocessing."
        return token if token in stop_words else self.stemmer.stem(token)

    def __call__(self, line):
        """Split ``line`` into word tokens and return the list of stems."""
        return [self._stem(word) for word in nltk.word_tokenize(line)]


def distance_vectors(corpus, stringlist, vect=TfidfVectorizer, dist=measures.euclidean_distance):
    """Print the three corpus entries closest to a query in vector space.

    Args:
        corpus: Sized collection of documents; the vectorizer is fitted on it.
        stringlist: List of query strings. Only the first one is scored.
        vect: Vectorizer class to instantiate (e.g. ``TfidfVectorizer`` or
            ``CountVectorizer``). It is built with 1-4 gram word features,
            ``LemmaTokenizer`` and the tokenised stop-word list.
        dist: Callable ``dist(query_vector, document_vector)`` returning a
            score. Results are ranked by ascending score, so this should be
            a distance (smaller = closer); passing a similarity would rank
            the least similar documents first.

    Returns:
        None. Columns 1-2 (positional) of the three best-scoring rows of
        the module-level ``album_corpus`` are printed, so ``corpus`` is
        expected to be row-aligned with ``album_corpus``.
    """
    ###vectorizer
    t_vectorizer = vect(tokenizer=LemmaTokenizer(),
                        strip_accents='unicode',
                        stop_words=tokenized_stop_words,
                        lowercase=True,
                        ngram_range=(1, 4),
                        analyzer='word')

    X_t = t_vectorizer.fit_transform(corpus)
    # Densify the query once: it is the same for every document, and
    # rebuilding the dense array inside the loop costs a full-vocabulary
    # allocation per corpus entry.
    query_vector = t_vectorizer.transform(stringlist).toarray()[0]

    ###similarity calculation
    # Corpus rows are densified one at a time to keep memory use bounded.
    scores = [dist(query_vector, X_t[i].toarray()[0])
              for i in range(X_t.shape[0])]

    ###print top 3 most similar
    indices = np.array(scores).argsort()[0:3]
    for i in indices:
        values = album_corpus.iloc[i][1:3]
        print(values.values)

In [0]:
###mount drive
from google.colab import drive
drive.mount('/content/gdrive')

###change directory
%cd gdrive/My Drive/Colab Notebooks/album_reviews

album_corpus = pickle.load( open( "album_corpus.pkl", "rb" ) )

In [0]:
###vector space test - working well! returning logical results
# Runs with the function defaults: TfidfVectorizer features ranked by Euclidean distance.
distance_vectors(corpus=album_corpus['review_text'],stringlist=['a heavy drum section followed by uplifting chorus, with rock and roll influences'])


## Embedding features and non-standard distance measures

Word Mover's Distance (WMD) is an embedding-specific distance measure: it assesses the "distance" between two documents in a meaningful way, even when they have no words in common, by using the word2vec embeddings of their words.

In [0]:
import gensim
from gensim.models import Word2Vec
from gensim.models import Phrases
from nltk.corpus import stopwords
from nltk import download
download('stopwords')
# This rebinds the module-level stop_words that LemmaTokenizer._stem and
# distance_embeddings test membership against. Keep it a set, as in the
# earlier definition, so those lookups stay O(1) and the type does not
# change depending on which cell ran last.
stop_words = set(stopwords.words('english'))

def distance_embeddings(corpus, stringlist, trained=False):
    """Print the three corpus entries closest to a query by Word Mover's Distance.

    Args:
        corpus: Sized collection of document strings.
        stringlist: List of query strings. Only the first one is scored.
        trained: If false (default), a Word2Vec model is fitted on ``corpus``
            itself. If true, the pre-trained GoogleNews vectors are loaded
            from 'GoogleNews-vectors-negative300.bin.gz' in the working
            directory.

    Returns:
        None. Columns 1-2 (positional) of the three lowest-distance rows of
        the module-level ``album_corpus`` are printed, so ``corpus`` is
        expected to be row-aligned with ``album_corpus``.
    """

    ###pre-processing
    def pre_processor(documents):
        """Lower-case, whitespace-split and drop stop words from each document."""
        return [
            [w for w in doc.lower().split() if w not in stop_words]
            for doc in documents
        ]

    pp_corpus = pre_processor(corpus)
    # wmdistance compares lists of tokens. The query must therefore go through
    # the same pre-processing as the corpus; a raw string would be iterated
    # character by character and produce meaningless distances.
    query_tokens = pre_processor([stringlist[0]])[0]

    ###word embeddings
    if not trained:
        # NOTE: `size` and `iter` are the gensim 3.x keyword names.
        # wmdistance lives on the KeyedVectors held in `.wv`.
        word_vectors = Word2Vec(pp_corpus, min_count=2, size=100, window=5, iter=100).wv
    else:
        word_vectors = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)

    ###similarity calculation
    scores = [word_vectors.wmdistance(query_tokens, doc_tokens)
              for doc_tokens in pp_corpus]

    ###print top 3 most similar
    indices = np.array(scores).argsort()[0:3]
    for i in indices:
        values = album_corpus.iloc[i][1:3]
        print(values.values)

In [0]:
###this is not working well...
# Runs with the default trained=False, i.e. a Word2Vec model fitted on the corpus itself.
distance_embeddings(corpus=album_corpus['review_text'],stringlist=['a heavy drum section followed by uplifting chorus, with rock and roll influences'])


# Next steps

lookup list for UMG artists \\
alternative websites - any decent music.com \\
lyric search \\
artist cluster \\