<a href="https://colab.research.google.com/github/AudioburstResearch/sequitur-g2p/blob/master/evaluate_duplicates_filtering_local.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import json
import numpy as np
import pandas as pd
import requests
from dataclasses import dataclass
import datetime
from typing import List
import en_core_web_sm
import spacy

In [None]:
# TF-Hub handle of the Universal Sentence Encoder (version 4 in the URL).
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
# Loaded once at module level; embed() calls this object directly.
model = hub.load(module_url)
# spaCy English pipeline; the functions below read `.ents` from its output.
nlp = spacy.load("en_core_web_sm")


def embed(input):
    """Return the output of the module-level ``model`` for ``input``.

    This is a thin wrapper: ``input`` is forwarded unchanged to the encoder
    loaded from TF-Hub, and whatever the encoder returns is handed back.
    """
    embeddings = model(input)
    return embeddings


def text_similarity(existing_titles, new_title):
    """Score existing texts against a new text via embedding inner products.

    Args:
        existing_titles: batch of texts passed to ``embed``.
        new_title: batch of texts passed to ``embed`` (callers in this file
            pass a one-element list).

    Returns:
        ``np.inner`` of the two embedding batches.
        # presumably shaped (len(existing_titles), len(new_title)) -- confirm
        # against the encoder's output shape.
    """
    existing_vectors = embed(existing_titles)
    new_vectors = embed(new_title)
    return np.inner(existing_vectors, new_vectors)


In [None]:
# Tab-separated tagging sheet; the path is relative to the working directory
# (presumably a mounted Google Drive folder in Colab -- confirm).
path = 'drive/My Drive/bursts_list_for_tagging_200_210 - 21 April.tsv'
# Later cells read the columns `playlist_id`, `date`, `burst_id` and `story`.
data = pd.read_csv(path, encoding='utf8', sep = '\t')

In [None]:
data

In [None]:
@dataclass
class PlaylistBurst:
    """One burst as consumed by the duplicate-filtering experiments."""

    # Identifier of the burst (also used as the key in ``id_to_burst``).
    burst_id: str
    # Body text of the burst.
    text: str
    # Title of the burst.
    title: str
    # Entities followed by keywords, as assembled in get_playlist_burst().
    watson_ents: List[str]
    # Publication date string taken from the API response.
    publication_date: str

    def to_json(self):
        """Serialize the instance attributes to a JSON object string."""
        return json.dumps(vars(self))

    @classmethod
    def from_json(cls, json_str):
        """Build an instance from a JSON object whose keys are the field names."""
        return cls(**json.loads(json_str))

    def __str__(self):
        """Return title, text and burst id, each on its own line."""
        parts = (self.title, self.text, self.burst_id)
        return '\n'.join(str(part) for part in parts)

In [None]:
def get_playlist_burst(burst_id):
    """Fetch one burst from the Audioburst search API as a ``PlaylistBurst``.

    Args:
        burst_id: identifier substituted into the ``burstId`` query parameter.

    Returns:
        A ``PlaylistBurst`` built from the first entry of the response's
        ``bursts`` list. ``watson_ents`` holds the entry's ``entities``
        followed by its ``keywords``.

    Raises:
        LookupError: if the response JSON contains an ``error`` key.
    """
    sapi_url = "http://sapi.audioburst.com/v2/burst?burstId={0}&appKey=search.audioburst".format(burst_id)
    # get the data from sapi
    response = requests.get(sapi_url)
    payload = json.loads(response.text)
    if 'error' in payload:
        # The original fell through to `return burst_1` with the name never
        # bound, which surfaced as an opaque UnboundLocalError. Fail with a
        # message that says which burst could not be fetched instead.
        raise LookupError(
            "burst {0} could not be fetched: {1}".format(burst_id, payload['error'])
        )

    burst_data = payload['bursts'][0]
    # Copy before extending so the parsed response is not mutated in place.
    ents = list(burst_data['entities'])
    ents.extend(burst_data['keywords'])
    return PlaylistBurst(
        title=burst_data['title'],
        text=burst_data['text'],
        burst_id=burst_id,
        watson_ents=ents,
        publication_date=burst_data['publicationDateISO'],
    )


In [None]:

def evaluate_duplicates_filtering_for_playlist(playlist_frame, id_to_burst):
    """Count how many same-story neighbours the similarity rule would filter.

    Rows sharing a ``story`` value are compared pairwise in frame order
    (each row with the next one in its story group). A pair counts as
    "filtered" when title or text similarity passes the module-level
    thresholds AND the two texts share at least one spaCy entity string;
    otherwise it counts as "not filtered".

    Prints either a "no dups" notice with (bursts, stories), or
    (bursts, stories, bursts - stories, filtered, not_filtered).

    Args:
        playlist_frame: DataFrame with ``story`` and ``burst_id`` columns.
        id_to_burst: mapping from burst id to an object exposing ``text``
            and ``title``.

    Returns:
        ``(id_to_burst, number_of_rows, number_of_unique_stories)``; the
        mapping is returned unchanged.
    """
    stories = playlist_frame.story.unique()

    # Every row has its own story: nothing could be a duplicate.
    if playlist_frame.shape[0] == len(stories):
        print("no dups in this playlist")
        print(playlist_frame.shape[0], len(stories))
        return id_to_burst, playlist_frame.shape[0], len(stories)

    filtered = 0
    not_filtered = 0
    for story in stories:
        same_story = playlist_frame.loc[playlist_frame['story'] == story]
        if same_story.shape[0] <= 1:
            continue

        # Walk adjacent pairs of rows inside the story group.
        group_rows = [row for _, row in same_story.iterrows()]
        for earlier_row, later_row in zip(group_rows[:-1], group_rows[1:]):
            burst_1 = id_to_burst[earlier_row.burst_id]
            burst_2 = id_to_burst[later_row.burst_id]
            texts_similarity = text_similarity([burst_1.text], [burst_2.text])
            titles_similarity = text_similarity([burst_1.title], [burst_2.title])

            if not (titles_similarity >= titles_similarity_threshold
                    or texts_similarity > text_similarity_threshold):
                not_filtered += 1
                continue

            # Similar enough: additionally require a shared named entity.
            first_entities = [ent.text for ent in nlp(burst_1.text).ents]
            second_entities = [ent.text for ent in nlp(burst_2.text).ents]
            shared = [name for name in first_entities if name in second_entities]
            if len(shared) >= 1:
                filtered += 1
            else:
                not_filtered += 1

    print( playlist_frame.shape[0], len(stories), playlist_frame.shape[0] - len(stories) , filtered, not_filtered )
    return id_to_burst, playlist_frame.shape[0], len(stories)

In [None]:
# the function that will be used in the server
# the function that will be used in the server
def add_burst_to_playlist(new_burst, playlist_bursts):
    """Return ids of playlist bursts that duplicate ``new_burst``.

    Bursts whose title or text is the empty string are ignored. A remaining
    burst is reported when its title similarity reaches
    ``titles_similarity_threshold`` OR its text similarity exceeds
    ``text_similarity_threshold``, AND its text shares at least one spaCy
    entity string with ``new_burst.text``.

    Reads the module-level names ``titles_similarity_threshold``,
    ``text_similarity_threshold``, ``text_scope``, ``text_similarity`` and
    ``nlp``.

    Args:
        new_burst: object exposing ``title`` and ``text``.
        playlist_bursts: iterable of objects exposing ``title``, ``text``
            and ``burst_id``.

    Returns:
        List of ``burst_id`` values, ordered by descending
        (title similarity + text similarity). Empty when nothing matches or
        when no playlist burst has both a title and a text.
    """
    valid_bursts = [
        burst for burst in playlist_bursts
        if burst.title != "" and burst.text != ""
    ]
    if not valid_bursts:
        # Nothing to compare against; min() below would raise on an empty list.
        return []

    # NOTE(review): the shortest text is measured in characters here but is
    # used below as a word count -- confirm this is intended.
    min_text_len = min(len(burst.text) for burst in valid_bursts)
    min_text_len = min(min_text_len, text_scope)
    titles = [burst.title for burst in valid_bursts]
    texts = [' '.join(burst.text.split()[:min_text_len]) for burst in valid_bursts]
    titles_similarity = text_similarity(titles, [new_burst.title])
    texts_similarity = text_similarity(texts, [new_burst.text])

    new_entities = None  # computed lazily, once, on the first similarity hit
    should_remove = []
    for i, burst in enumerate(valid_bursts):
        # NOTE(review): an earlier comment said this was changed to "and",
        # but the two similarity checks are combined with "or".
        if titles_similarity[i] >= titles_similarity_threshold or texts_similarity[i] > text_similarity_threshold:
            if new_entities is None:
                new_entities = {ent.text for ent in nlp(new_burst.text).ents}
            # `i` indexes the validated list, so the entities must come from
            # the validated burst too (the original read playlist_bursts[i],
            # which is a different burst once any entry has been dropped).
            if any(ent.text in new_entities for ent in nlp(burst.text).ents):
                should_remove.append(
                    [burst.burst_id, titles_similarity[i][0], texts_similarity[i][0]]
                )

    should_remove = sorted(should_remove, key=lambda x: (x[1] + x[2]), reverse=True)
    return [x[0] for x in should_remove]

In [None]:
def simulate_burst_inseration_to_playlist(playlist_frame, id_to_burst):
    """Replay a playlist in publication order and score duplicate removals.

    A ``pub_dates`` column is written onto ``playlist_frame`` (in place) and
    the rows are processed sorted by it. Each burst after the first is passed
    to ``add_burst_to_playlist`` together with the bursts kept so far. When
    that call reports duplicates, the top-ranked one is removed from the
    running playlist; the removal counts as a true positive if it has the
    same ``story`` as the incoming burst, otherwise as a false positive. The
    incoming burst is always appended.

    Args:
        playlist_frame: DataFrame with ``burst_id`` and ``story`` columns.
        id_to_burst: mapping from burst id to an object exposing
            ``publication_date`` and ``burst_id``.

    Returns:
        ``(true_positive, false_positive)``; the pair is also printed.
    """
    # adding pub dates to the dataframe
    pub_dates = [
        id_to_burst[row.burst_id].publication_date
        for _, row in playlist_frame.iterrows()
    ]
    playlist_frame['pub_dates'] = pub_dates
    # sorting the playlist by publication dates
    playlist_frame = playlist_frame.sort_values(by=['pub_dates'])

    # the earliest burst seeds the running playlist
    current_playlist = [id_to_burst[playlist_frame.iloc[0].burst_id]]
    true_positive = 0
    false_positive = 0

    for position in range(1, len(pub_dates)):
        incoming_row = playlist_frame.iloc[position]
        new_burst = id_to_burst[incoming_row.burst_id]
        res = add_burst_to_playlist(new_burst=new_burst, playlist_bursts=current_playlist)
        if len(res) >= 1:
            evicted_id = res[0]
            new_burst_story = incoming_row.story
            removed_burst_story = playlist_frame.loc[playlist_frame['burst_id'] == evicted_id].iloc[0].story
            if new_burst_story == removed_burst_story:
                true_positive += 1
            else:
                false_positive += 1
            # Last slot holding the evicted id; -1 (the final slot) if none does.
            index = max(
                slot if evicted_id == kept.burst_id else -1
                for slot, kept in enumerate(current_playlist)
            )
            current_playlist.pop(index)
        current_playlist.append(new_burst)

    print(true_positive, false_positive)
    return true_positive, false_positive

In [None]:
def create_id_to_burst(all_playlists_frame):
    """Map every ``burst_id`` in the frame to its fetched burst.

    ``get_playlist_burst`` is called once per row, so an id that appears in
    several rows is fetched again and the last result is kept.

    Args:
        all_playlists_frame: DataFrame with a ``burst_id`` column.

    Returns:
        Dict from burst id to the value returned by ``get_playlist_burst``.
    """
    return {
        row.burst_id: get_playlist_burst(row.burst_id)
        for _, row in all_playlists_frame.iterrows()
    }

In [None]:
id_to_burst

In [None]:
def plot_playlist_size_and_stories(playlist_frame):
    """Placeholder: this function was declared without a body.

    The signature alone is a syntax error, so the cell could not run. Going
    by the name it was meant to plot playlist size against story count, but
    no implementation exists -- confirm the intent before filling it in.

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError("plot_playlist_size_and_stories was never implemented")

Get the number of bursts within every playlist,
number of stories,
how many bursts should have been filtered,
how many were filtered,
how many possible comparisons we can have within the playlist and how many of them were positive

In [None]:
# A playlist is identified by its id combined with its date.
data['unique_playlist'] = data.apply(lambda row: str(row.playlist_id) +
                                  row.date, axis = 1)

playlist_names = data.unique_playlist.unique()
title_thresholds = [0.4, 0.45, 0.5, 0.55, 0.6, 0.65]
text_thresholds = [0.4, 0.45, 0.5, 0.55, 0.6, 0.65]
text_scopes = [20, 30, 40, 50]
id_to_burst = create_id_to_burst(data)

# Grid search. The loop variables double as the module-level settings read by
# add_burst_to_playlist() and evaluate_duplicates_filtering_for_playlist(),
# so their names must not change.
for titles_similarity_threshold in title_thresholds:
    for text_similarity_threshold in text_thresholds:
        for text_scope in text_scopes:
            true_positive_total = 0
            false_positive_total = 0
            # Sum over playlists of (bursts - stories). Replaces the
            # hard-coded 103, which equals this sum in the recorded output.
            expected_duplicates_total = 0
            print(titles_similarity_threshold, text_similarity_threshold, text_scope)
            for playlist in playlist_names:
                playlist_frame = data.loc[data['unique_playlist'] == playlist]
                playlist_frame = playlist_frame.reset_index()
                id_to_burst, playlist_len, num_of_stories = evaluate_duplicates_filtering_for_playlist(playlist_frame, id_to_burst)
                true_positive, false_positive = simulate_burst_inseration_to_playlist(playlist_frame, id_to_burst)
                true_positive_total += true_positive
                false_positive_total += false_positive
                expected_duplicates_total += playlist_len - num_of_stories
            # Share of expected duplicates that were removed correctly.
            print(true_positive_total / expected_duplicates_total
                  if expected_duplicates_total else float('nan'))
            # False positives per true positive; NaN instead of a
            # ZeroDivisionError when a setting yields no true positives.
            print(false_positive_total / true_positive_total
                  if true_positive_total else float('nan'))

Streaming output truncated to the last 5000 lines.
11 11
0 1
10 7 3 3 0
2 3
10 8 2 2 0
2 0
13 12 1 0 1
0 1
14 10 4 2 2
2 3
no dups in this playlist
15 15
0 3
12 10 2 2 0
0 5
11 9 2 1 1
1 2
16 14 2 0 2
0 5
16 11 5 4 1
4 0
12 11 1 1 0
1 1
10 9 1 0 1
0 3
11 6 5 1 4
2 0
13 8 5 4 1
3 4
12 11 1 1 0
1 3
26 19 7 7 0
5 2
20 15 5 5 0
4 2
19 17 2 2 0
2 2
12 10 2 1 1
2 2
0.6893203883495146
1.1267605633802817
0.5 0.65 40
17 14 3 3 0
3 4
24 21 3 2 1
0 7
36 20 16 10 6
14 3
18 14 4 4 0
4 3
14 13 1 0 1
0 3
23 19 4 2 2
2 6
15 14 1 0 1
0 1
17 13 4 4 0
4 2
16 13 3 3 0
3 1
18 14 4 3 1
2 5
15 11 4 3 1
3 2
20 14 6 6 0
5 2
no dups in this playlist
11 11
0 1
10 7 3 3 0
2 3
10 8 2 2 0
2 0
13 12 1 0 1
0 1
14 10 4 2 2
2 3
no dups in this playlist
15 15
0 3
12 10 2 2 0
0 5
11 9 2 1 1
1 2
16 14 2 0 2
0 5
16 11 5 4 1
4 0
12 11 1 1 0
1 1
10 9 1 0 1
0 3
11 6 5 1 4
2 0
13 8 5 4 1
3 4
12 11 1 1 0
1 3
26 19 7 7 0
6 2
20 15 5 5 0
4 2
19 17 2 2 0
2 2
12 10 2 1 1
2 2
0.6990291262135923
1.125
0.5 0.65 50
17 14 

In [None]:
# True positives over the 103 duplicates hard-coded for this dataset.
print(true_positive_total/103)
# False positives per true positive; NaN rather than a ZeroDivisionError when
# the last grid setting produced no true positives.
print(false_positive_total/true_positive_total if true_positive_total else float('nan'))

In [None]:
# Re-run the insertion simulation on one playlist (index 3 of playlist_names).
playlist_frame = data.loc[data['unique_playlist'] == playlist_names[3]]
playlist_frame = playlist_frame.reset_index()
# NOTE(review): the simulation returns a (true_positive, false_positive)
# tuple, so after this line `playlist_frame` no longer holds a DataFrame.
playlist_frame = simulate_burst_inseration_to_playlist(playlist_frame, id_to_burst)

#

In [None]:
playlist_frame

In [None]:
id_to_burst

In [None]:
playlist = playlist_names[2]
playlist_frame = data.loc[data['unique_playlist'] == playlist]
# Print every unordered pair of row labels in this playlist. The frame keeps
# the labels of `data`, so pairs are built from positions in the label list.
# The original sliced with `playlist_frame[indx1+1:]`, which treats the label
# as a position and yields wrong or empty pairs once labels and positions
# differ.
row_labels = list(playlist_frame.index)
for position, indx1 in enumerate(row_labels[:-1]):
    for indx2 in row_labels[position + 1:]:
        print(indx1, indx2)
