In [None]:
# Mounts Google Drive at /content/drive (requires the google.colab package)
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


**Functions to read the CSV files with the anime synopses**


To save computation time, tokenization is not repeated at run time: whenever tokenized text is needed, previously computed tokenized versions of the synopses are read from file instead.

In [None]:
import nltk
import pandas as pd
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from ast import literal_eval


def reader(path, num_sim_shows=1, show=False):
  """Reads the shows CSV at `path` and returns a cleaned DataFrame.

  Keeps the columns 'uid', 'title', 'synopsis' and 'similar_shows', drops rows
  whose 'similar_shows' or 'synopsis' is null, parses 'similar_shows' from its
  string form into a Python list, removes list entries that are not a 'uid' of
  the remaining rows, and finally drops rows left with fewer than
  `num_sim_shows` entries.

  Args:
    path: location of the CSV file, passed to pd.read_csv.
    num_sim_shows: minimum number of similar shows a row must keep to survive.
    show: when True, prints the frame shape after each cleaning step and the
      number of similar-show entries that were removed.

  Returns:
    The cleaned DataFrame with a fresh 0..n-1 index.
  """
  df = pd.read_csv(path)
  df = df[['uid', 'title', 'synopsis', 'similar_shows']]
  if show:
    print(df.shape, 'shows without cleaning')
  df = df[pd.notna(df['similar_shows'])]
  if show:
    print(df.shape, 'shows after removing null values for similar shows')

  # Discards shows without synopsis. The explicit copy makes the column
  # assignments below operate on an independent frame instead of a slice
  # (avoids pandas' SettingWithCopyWarning).
  df = df[pd.notna(df['synopsis'])].copy()
  if show:
    print(df.shape, 'shows after removing null values for synopsis')

  # Strips the quotes around each id, then converts the cell from String to List
  df['similar_shows'] = df['similar_shows'].str.replace("'", '', regex=False).map(literal_eval)

  # Removes similar shows not in dataset (set gives constant-time membership tests)
  known_uids = set(df['uid'])
  total_sim_shows = sum(df['similar_shows'].str.len())
  df['similar_shows'] = df['similar_shows'].map(lambda sims: [uid for uid in sims if uid in known_uids])

  if show:
    print('total similar shows:', total_sim_shows)
    print('total removed shows:', total_sim_shows - sum(df['similar_shows'].str.len()))

  # Removes shows without at least 'num_sim_shows' number of similar shows.
  # NOTE(review): rows dropped here can still be listed in the 'similar_shows'
  # of rows that survive, because the membership filter above ran first.
  df = df[df['similar_shows'].str.len() >= num_sim_shows]
  if show:
    print(df.shape, 'shows with at least', num_sim_shows, 'similar shows')
  df = df.reset_index(drop=True)

  return df

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


**Creates Bag of Words (Tfidf Matrix)**

This approach benefits from tokenization, so the pre-tokenized versions of the synopses are loaded from file.

In [None]:
def read_tfidf(path, show=True):
  """Loads the tokenized shows dataset used for the TF-IDF representation.

  Args:
    path: directory containing 'tokenized_animes_with_userrecs.csv'.
    show: when True, prints the number of shows and one sample synopsis.

  Returns:
    The DataFrame produced by `reader` for the tokenized CSV.
  """
  # Reads tokenized dataset of anime shows
  df = reader(path + '/tokenized_animes_with_userrecs.csv', num_sim_shows=1)
  if show:
    print('Dataset with', df.shape[0], 'different shows')
    if not df.empty:
      # Row 55 is the usual sample; fall back to the last row when the dataset
      # is shorter, instead of failing with a KeyError.
      sample = min(55, df.shape[0] - 1)
      print('\n' + 'Sample Synopsis\n', 'Show:', df.loc[sample, 'title'] + '\n' + df.loc[sample, 'synopsis'])
    print('----------------------------------------------------------------------------------------------------------------------------------------------------------' + '\n')
  return df

In [None]:
# Creates a TF-IDF matrix
def TfidfGenerator(df, show=True):
  """Fits a TfidfVectorizer on df['synopsis'] and returns the dense TF-IDF matrix.

  Args:
    df: DataFrame with a 'synopsis' column of strings.
    show: when True, prints the head and the shape of the matrix.

  Returns:
    A tuple (matrix, df): a 2-D numpy array with one row per synopsis and one
    column per vocabulary term, and the input frame unchanged.
  """
  synopses = list(df['synopsis'])
  vectorizer = TfidfVectorizer()
  # toarray() produces the dense ndarray directly; the previous detour through
  # nested Python lists and a DataFrame gave the same values at a much higher
  # memory and time cost.
  tfidf_values = vectorizer.fit_transform(synopses).toarray()

  if show:
    # The labelled frame is only needed for display
    tfidf_matrix = pd.DataFrame(tfidf_values, columns=vectorizer.get_feature_names_out())
    print('Language Encodings Head')
    print(tfidf_matrix.head())
    print('Shape of dataset of Language Encodings:', tfidf_matrix.shape)
    print('----------------------------------------------------------------------------------------------------------------------------------------------------------' + '\n')
  return tfidf_values, df

**Creates Dynamic Embedding using a Pre-Trained Universal Sentence Encoder (USE)**


This method performs best without tokenization, so the untokenized versions of the show synopses are loaded from file.

In [None]:
def read_USE(path, show=True):
  """Loads the untokenized shows dataset and splits every synopsis into pieces.

  Each synopsis has '\\r\\n' sequences and the '  [Written by MAL Rewrite]'
  credit removed, is split on '.', and has empty pieces discarded, so the
  'synopsis' column ends up holding a list of strings per show.

  Args:
    path: directory containing 'animes_with_userrecs_cleaned.csv'.
    show: when True, prints the number of shows and one sample synopsis.

  Returns:
    The DataFrame produced by `reader`, with 'synopsis' converted to lists.
  """
  # Reads untokenized dataset of anime shows
  df = reader(path +'/animes_with_userrecs_cleaned.csv', num_sim_shows=1)
  df['synopsis'] = df['synopsis'].replace(r'\r\n', '', regex=True)
  df['synopsis'] = df['synopsis'].str.replace('  [Written by MAL Rewrite]', '', regex=False)
  df['synopsis'] = df['synopsis'].str.split('.')
  df['synopsis'] = df['synopsis'].apply(lambda x: [y for y in x if y != ''])
  if show:
    print('Dataset with', df.shape[0], 'different shows')
    if not df.empty:
      # Row 55 is the usual sample; fall back to the last row when the dataset
      # is shorter, instead of failing with a KeyError.
      sample = min(55, df.shape[0] - 1)
      print('\n', 'Sample Synopsis\n', 'Show:', df.loc[sample, 'title'])
      print(df.loc[sample, 'synopsis'])
    print('----------------------------------------------------------------------------------------------------------------------------------------------------------' + '\n')
  return df

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np

def USEGenerator(df, show=True):
  """Encodes every synopsis in df with the pre-trained Universal Sentence Encoder.

  Args:
    df: DataFrame whose 'synopsis' column holds the model input for each show.
    show: when True, prints the first five encodings and the matrix shape.

  Returns:
    A tuple (USE_matrix, df): a numpy array with one encoding per show, and
    the input frame unchanged.
  """
  # Fetches the pre-trained USE model from TF Hub
  encoder = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

  # One vector per show: element [0] of the model output for that synopsis.
  # NOTE(review): when a synopsis is a list of sentences this presumably keeps
  # only the first sentence's embedding - confirm that this is intended.
  USE_matrix = np.array([encoder(sentences)[0].numpy() for sentences in list(df['synopsis'])])

  if not show:
    return USE_matrix, df

  print('Language Encodings Head')
  print(USE_matrix[0:5])
  print('Shape of dataset of Language Encodings:', USE_matrix.shape)
  print('----------------------------------------------------------------------------------------------------------------------------------------------------------' + '\n')
  return USE_matrix, df

**Creates Training/Testing dataset**


Because the small size of this project's dataset allows for Leave-One-Out Cross-Validation, the training and testing datasets are the same.



In [None]:
def create_dataset(test_size, df, text_encodings):
  """Builds the query set from the first `test_size` shows.

  For every query row, the uids in 'similar_shows' are replaced by the index
  labels of those shows in `df`, and a 'true' column receives an array of
  length len(df) holding 1.0 at those positions and 0.0 elsewhere.

  Args:
    test_size: number of leading rows of `df` / `text_encodings` to use.
    df: shows DataFrame with 'uid' and 'similar_shows' columns. Its index
      labels are used as positions into the 'true' arrays, so a 0..n-1 index
      is assumed.
    text_encodings: encodings aligned row-by-row with `df` (array or DataFrame).

  Returns:
    A tuple (dataset, dataset_matrix): the query frame with the added columns
    'true', 'pred', 'recs' and 'recs_dist', and the matching encoding rows.

  Raises:
    IndexError: if a query lists a similar-show uid that is not present in `df`.
  """
  dataset = df[:test_size]
  dataset = dataset.reset_index(drop=True)
  dataset_matrix = text_encodings[:test_size]
  if isinstance(dataset_matrix, pd.DataFrame):
    # reset_index returns a new frame, so the result has to be kept
    dataset_matrix = dataset_matrix.reset_index(drop=True)

  # Maps each uid to the index label of its first occurrence in df, replacing a
  # full scan of df for every single similar show
  uid_to_row = {}
  for row_label, uid in zip(df.index, df['uid']):
    uid_to_row.setdefault(uid, row_label)

  def _blank_column():
    # Placeholder column of empty strings; the explicit object dtype lets the
    # cells hold arrays later regardless of pandas' string dtype inference
    return pd.Series([''] * dataset.shape[0], index=dataset.index, dtype=object)

  # Creates array of true relevance labels with 0s for true negatives (non-relevant shows) and 1s for true positives (relevant shows)
  dataset.insert(dataset.shape[1], 'true', _blank_column())
  for i in range(dataset.shape[0]):
    sim_uids = dataset.at[i, 'similar_shows']
    unknown = [uid for uid in sim_uids if uid not in uid_to_row]
    if unknown:
      raise IndexError('similar_shows of row ' + str(i) + ' references uids missing from df: ' + str(unknown))
    dataset.at[i, 'similar_shows'] = [uid_to_row[uid] for uid in sim_uids]
    trues = np.zeros(df.shape[0])
    trues[dataset.at[i, 'similar_shows']] = 1.0
    dataset.at[i, 'true'] = trues

  # Initializes auxiliary columns for optional tasks
  dataset.insert(dataset.shape[1], 'pred', _blank_column())
  dataset.insert(dataset.shape[1], 'recs', _blank_column())
  dataset.insert(dataset.shape[1], 'recs_dist', _blank_column())
  return dataset, dataset_matrix

**Retrieval Method**

Retrieves K shows via Nearest Neighbor search with Cosine Distance

In [None]:
import numpy as np
import sklearn.neighbors
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_distances

def retrieve_knn(dataset, dataset_matrix, text_encodings, n_neighbors=30, show=False):
  """Retrieves the nearest neighbours of every query and builds query-document pairs.

  Args:
    dataset: query frame with 'synopsis', 'similar_shows' (row indices of the
      relevant shows) and a 'pred' column; 'pred' is overwritten per row.
    dataset_matrix: encodings of the queries, one row per row of `dataset`.
    text_encodings: encodings of all shows; the KNN index is fitted on these.
      Row i is taken to be the encoding of query i.
    n_neighbors: number of neighbours to retrieve per query. The first
      neighbour is skipped when building pairs (it is expected to be the
      query itself).
    show: when True, prints the size and head of the pairs frame.

  Returns:
    A tuple (X, dataset, knnbr): the pairs frame with columns 'doc_id',
    'query', 'show_id', 'pred', 'distance' and 'true'; the query frame with
    'pred' filled in; and the fitted NearestNeighbors object.
  """
  # Initializes KNN algorithm
  knnbr = NearestNeighbors(n_neighbors=n_neighbors, metric='cosine', algorithm ='brute').fit(text_encodings)

  # The set of K Nearest Neighbors is added as recommendations to each show in the test set
  distances, indices = knnbr.kneighbors(dataset_matrix)

  # Construct training set
  X = []
  for i, show_recs in enumerate(indices):
    # Creates additional array of predictions for Mean Average Precision computation
    preds = np.zeros(text_encodings.shape[0])
    preds[show_recs] = 1
    dataset.at[i,'pred'] = preds

    query_text = dataset.loc[i, 'synopsis']
    relevant = dataset.loc[i, 'similar_shows']
    # Sets give constant-time membership tests inside the loops below
    relevant_set = set(relevant)
    retrieved_set = set(show_recs.tolist())

    # Creates query-document pairings as entries from retrieved shows, each with ids, relevance prediction, distances,
    # and ground truth labels
    for j, rec in enumerate(show_recs[1:]):
      entry = [i, query_text, rec, 1, distances[i][j+1], int(rec in relevant_set)]
      X.append(entry)

    # Creates query-document pairings as entries from relevant but non-retrieved shows, each with ids, relevance prediction,
    # distances, and ground truth labels
    for true in relevant:
      if true in retrieved_set:
        continue
      # Distance is measured against the relevant show itself (not against the
      # loop counter) and stored as a plain scalar, like the retrieved entries
      distance = cosine_distances([text_encodings[i]], [text_encodings[true]])[0][0]
      entry = [i, query_text, true, 0, distance, 1]
      X.append(entry)

  # Naming the columns at construction also works when no pair was produced
  X = pd.DataFrame(X, columns=['doc_id', 'query', 'show_id', 'pred', 'distance', 'true'])

  if show:
    print('Constructed training set with', X.shape[0], 'query-document pairings')
    print(X.head())

  return X, dataset, knnbr

**Metrics**

Computes precision, recall and the F1 measure

In [None]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

def get_precision_recall_f1(dataset):
  """Computes precision, recall and F1 from the 'true' and 'pred' columns.

  Args:
    dataset: frame with binary 'true' (ground truth) and 'pred' (prediction)
      columns, one row per query-document pair.

  Returns:
    A tuple (precision, recall, f1_score). F1 is 0.0 when precision and
    recall are both zero, where the harmonic mean would be 0/0.
  """
  precision = precision_score(dataset['true'], dataset['pred'])
  recall = recall_score(dataset['true'], dataset['pred'])
  denominator = precision + recall
  # Guard the 0/0 case (no true positive at all)
  f1_score = 2 * (precision * recall) / denominator if denominator > 0 else 0.0
  return precision, recall, f1_score

Mean Average Precision

In [None]:
from sklearn.metrics import average_precision_score
from sklearn.metrics import PrecisionRecallDisplay

def get_mean_average_precision(dataset, retrieval_size):
  """Computes the mean of the per-query average precision scores.

  Args:
    dataset: query frame whose 'true' and 'pred' cells each hold an array with
      one entry per show. An 'average_precision' column is added to it.
    retrieval_size: number of shows retrieved per query.

  Returns:
    The mean average precision over all rows of `dataset`, or a message
    string when `retrieval_size` is smaller than len(dataset) - 1.
  """
  if retrieval_size < dataset.shape[0]-1:
    return "Too small retrieval size for computing MAP"

  # Scores every query; scoring only the first row would make the "mean"
  # below the average precision of a single query
  dataset['average_precision'] = [
      average_precision_score(dataset.loc[i, 'true'], dataset.loc[i, 'pred'])
      for i in dataset.index
  ]

  mean_average_precision = dataset['average_precision'].mean()

  return mean_average_precision

R-Precision

In [None]:
def get_R_Precision(dataset, retrieval_size):
  """Computes the mean R-Precision over all queries.

  For a query with R relevant pairs (true == 1), R-Precision is the share of
  relevant pairs among its R closest retrieved pairs (pred == 1, ordered by
  'distance').

  Args:
    dataset: pairs frame with 'doc_id', 'pred', 'distance' and 'true' columns.
    retrieval_size: number of shows retrieved per query.

  Returns:
    The mean R-Precision, or a message string when `retrieval_size` is below
    200. Queries without any relevant pair are left out of the mean; the
    result is NaN when no query has one.
  """
  if retrieval_size < 200:
    return "Too small retrieval size for computing R-Precision"

  R_Precision = []
  # One pass over the frame, grouped by query id, instead of masking the whole
  # frame once per query; this also handles ids that are not exactly 0..n-1
  for _, pairs in dataset.groupby('doc_id', sort=True):
    rels_len = int((pairs['true'] == 1).sum())
    if rels_len == 0:
      # R-Precision is undefined without relevant shows (would be 0/0)
      continue
    recs = pairs[pairs['pred'] == 1].sort_values(by='distance')
    R_Precision.append(recs['true'].iloc[:rels_len].sum() / rels_len)

  if not R_Precision:
    return float('nan')
  return np.array(R_Precision).mean()

**Experiment**

Retrieves documents for a specified retrieval size and evaluates performance of the system

In [None]:
def experiment(path, retrieval_size=30, test_set_size=3147, text_encodings='tfidf'):
  """Runs retrieval for one text representation and prints the evaluation metrics.

  Args:
    path: directory holding the dataset CSV files.
    retrieval_size: number of shows to retrieve per query.
    test_set_size: number of leading shows used as queries.
    text_encodings: 'tfidf' selects the TF-IDF pipeline; any other value
      selects the Universal Sentence Encoder pipeline.
  """
  # Picks the loader/encoder pair for the requested representation
  if text_encodings == 'tfidf':
    load_shows, encode_shows = read_tfidf, TfidfGenerator
  else:
    load_shows, encode_shows = read_USE, USEGenerator
  encodings, shows = encode_shows(load_shows(path))

  # Builds the query set and its encodings
  queries, query_matrix = create_dataset(test_set_size, shows, encodings)

  # Retrieves results with KNN; one extra neighbour is requested on top of retrieval_size
  pairs, scored_queries, _ = retrieve_knn(queries, query_matrix, encodings, retrieval_size+1)

  # Evaluation metrics. MAP and R-Precision hand back a message string instead
  # of a number when retrieval_size is below their respective thresholds.
  precision, recall, f1 = get_precision_recall_f1(pairs)
  mean_average_precision = get_mean_average_precision(scored_queries, retrieval_size)
  R_Precision = get_R_Precision(pairs, retrieval_size)

  size_label = str(retrieval_size)
  print('PRECISION @ ' + size_label + ':',  precision)
  print('RECALL @ ' + size_label + ':', recall)
  print('F1 SCORE @ ' + size_label + ':', f1)
  print('Mean Average Precision:', mean_average_precision)
  print('R_Precision:', R_Precision)
  print('----------------------------------------------------------------------------------------------------------------------------------------------------------' + '\n')

In [None]:
text_encoding = 'USE'     # Select between 'USE' and 'tfidf'
path = '/content/drive/MyDrive/SynopsisIRSystem'
experiment(path, retrieval_size=50, text_encodings=text_encoding)

Dataset with 3147 different shows

 Sample Synopsis
 Show: Slam Dunk
['Hanamichi Sakuragi, infamous for this temper, massive height, and fire-red hair, enrolls in Shohoku High, hoping to finally get a girlfriend and break his record of being rejected 50 consecutive times in middle school', ' His notoriety precedes him, however, leading to him being avoided by most students', ' Soon, after certain events, Hanamichi is left with two unwavering thoughts: "I hate basketball," and "I desperately need a girlfriend', '"  One day, a girl named Haruko Akagi approaches him without any knowledge of his troublemaking and asks him if he likes basketball', ' Hanamichi immediately falls head over heels in love with her, blurting out a fervent affirmative', ' She then leads him to the gymnasium, where she asks him if he can do a slam dunk', ' In an attempt to impress Haruko, he makes the leap, but overshoots, instead slamming his head straight into the blackboard', " When Haruko informs the basketball

**OPTIONAL - Test Your Own Query**

This method supports testing a query written by the user. Just modify the string assigned to the *query* variable in the bottom cell.

In [None]:
import pandas as pd
from sklearn.metrics.pairwise import pairwise_distances
import textwrap

def test_your_own_query(query, path, retrieval_size, text_encodings='tfidf'):
  """Prints the shows whose synopses are closest to a free-text query.

  The query is encoded together with the show synopses (as row 0 of the
  corpus), compared to every show with cosine distance, and the closest shows
  are printed with title, distance and synopsis.

  Args:
    query: free-text description of the desired show.
    path: directory holding the dataset CSV files.
    retrieval_size: one more than the number of shows to print
      (retrieval_size - 1 shows are printed).
    text_encodings: 'tfidf' selects the TF-IDF pipeline; any other value
      selects the Universal Sentence Encoder pipeline.
  """
  # Extract desired language encodings. In both branches the query is
  # prepended to the corpus so that row 0 of the encodings is the query.
  if text_encodings == 'tfidf':
    # NOTE(review): the corpus is read from the tokenized CSV while the query
    # is used as typed - confirm whether the query needs the same preprocessing.
    corpus = read_tfidf(path, show=False)
    query_df = pd.DataFrame([{'synopsis': query}])
    corpus = pd.concat([query_df.reindex(columns=corpus.columns), corpus], ignore_index=True)
    encodings, _ = TfidfGenerator(corpus, show=False)
  else:
    corpus = read_USE(path, show=False)
    # read_USE stores each synopsis as a list of strings, so the query is
    # wrapped in a one-element list to be encoded as a whole
    query_df = pd.DataFrame([{'synopsis': [query]}])
    corpus = pd.concat([query_df.reindex(columns=corpus.columns), corpus], ignore_index=True)
    encodings, _ = USEGenerator(corpus, show=False)

  # Untokenized shows, used only for display. Assumes the same rows in the
  # same order as the encoded corpus without its query row.
  shows = reader(path+'/animes_with_userrecs_cleaned.csv', show=False)

  # Get distances from the query (row 0) to every show (rows 1..n), so that
  # position k of `distances` lines up with row k of `shows`
  distances = pairwise_distances(encodings[0].reshape(1, -1), encodings[1:], metric='cosine').ravel()

  # Sort and obtain K shortest distances (K Nearest Neighbors)
  n_results = max(retrieval_size - 1, 0)
  knn_indices = np.argsort(distances)[:n_results]
  # assign() builds a new frame, so no value is written into a slice of `shows`
  knn = shows.iloc[knn_indices].assign(distance=distances[knn_indices])

  for i, row in knn.iterrows():
    print(row['title'], '  ---- distance   ', row['distance'])
    print(textwrap.fill(row['synopsis']))
    print('----------------------------------------------------------------------------------------------------------------------------------------------------------' + '\n')

In [None]:
query = 'I want vampiros and werewolves, and castles, and shotguns and a red head'
retrieval_size = 10
path = '/content/drive/MyDrive/SynopsisIRSystem'

# One extra slot is requested because test_your_own_query prints one result fewer than the size it receives
test_your_own_query(query, path, retrieval_size+1, text_encodings='tfidf')

Morita-san wa Mukuchi.   ---- distance    0.9077696364670396
Morita Mayu, a high school girl. She is extremely reticent and her
silence and habit of looking at people's eyes straightly sometimes
cause misunderstanding. The reason behind it is not because she
doesn’t like to talk nor because she has nothing to say. The reason
she rarely speaks is due to the fact she thinks too much before
speaking, thus losing the timing to speak altogether. But she lives a
happy school life with her classmates.
----------------------------------------------------------------------------------------------------------------------------------------------------------

Double Decker! Doug & Kirill   ---- distance    0.9127615817395552
The once peaceful city-state of Lisvalletta has found itself beset by
a dangerous new drug called Anthem. The side effects of the drug allow
the user to enter a state of Overdrive, wherein they mutate into
superpowered beasts with inhuman abilities. With the police powerless
t

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  knn['distance'] = distances[knn_indices]
