<a href="https://colab.research.google.com/github/Blistt/Information-Retrieval-System-Synopses/blob/main/SynopsisIRSystem.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


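Function to **read** the CSV with the animes' synopses (already **tokenized**) and clean it, keeping only shows that have a synopsis and enough similar shows in the dataset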

In [None]:
# NOTE(review): no cell visible in this notebook uses word_tokenize, stopwords,
# PorterStemmer or np -- confirm before pruning any of these imports.
import nltk
import pandas as pd
# Downloads the NLTK 'punkt' resource when the cell runs.
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
# Downloads the NLTK 'stopwords' corpus when the cell runs.
nltk.download('stopwords')
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from ast import literal_eval

def reader(path, num_sim_shows):
  """Load the shows CSV and keep only the rows usable for evaluation.

  The frame is filtered in stages and its shape is printed after each one:
  rows with a null 'similar_shows' or 'synopsis' are dropped, every
  'similar_shows' cell is parsed from its string form into a list, ids that
  do not belong to a remaining show are removed from those lists, and
  finally shows with fewer than `num_sim_shows` similar shows are dropped.

  Args:
    path: Location of the CSV file. It must provide the 'uid', 'synopsis'
      and 'similar_shows' columns.
    num_sim_shows: Minimum number of in-dataset similar shows a show needs
      in order to be kept.

  Returns:
    The filtered DataFrame with a fresh 0..n-1 index.
  """
  df = pd.read_csv(path)
  print(df.shape, 'shows without cleaning')
  df = df[pd.notna(df['similar_shows'])]
  print(df.shape, 'shows after removing null values for similar shows')

  # Discards shows without synopsis. The explicit copy detaches the result
  # from the frame it was filtered from, so the column assignments below do
  # not write into a slice (pandas' SettingWithCopyWarning).
  df = df[pd.notna(df['synopsis'])].copy()
  print(df.shape, 'shows after removing null values for synopsis')

  # Converts csv cell values from String to List. Quotes are stripped first;
  # presumably the ids are stored quoted and must evaluate to the same type
  # as 'uid' -- confirm against the CSV.
  df['similar_shows'] = (
      df['similar_shows'].str.replace("'", '', regex=False).map(literal_eval)
  )

  # Removes similar shows not in dataset. A set gives constant-time
  # membership tests instead of rescanning a list of every uid per candidate.
  all_shows = set(df['uid'])
  total_sim_shows = df['similar_shows'].map(len).sum()
  df['similar_shows'] = df['similar_shows'].map(
      lambda shows: [uid for uid in shows if uid in all_shows])
  print('total similar shows:', total_sim_shows)
  print('total removed shows:', total_sim_shows - df['similar_shows'].map(len).sum())

  # Removes shows without at least 'num_sim_shows' number of similar shows
  df = df[df['similar_shows'].map(len) >= num_sim_shows]
  print(df.shape, 'shows with at least', num_sim_shows, 'similar shows')

  return df.reset_index(drop=True)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Load and clean the dataset, requiring at least one in-dataset similar show per title.
df = reader('/content/drive/MyDrive/SynopsisIRSystem/tokenized_animes_with_userrecs.csv', 1)

# Print the title and synopsis of the row labelled 387 as a sample.
sample_show = df.loc[387]
print('Sample Synopsis\n', 'Show:', sample_show['title'] + '\n' + sample_show['synopsis'])
print()

(3151, 14) shows without cleaning
(3151, 14) shows after removing null values for similar shows
(3151, 14) shows after removing null values for synopsis
total similar shows: 78981
total removed shows: 9692
(3147, 14) shows with at least 1 similar shows
Sample Synopsis
 Show: Sayonara Zetsubou Sensei
 nozomu itoshiki high school teacher pessimist even smallest misfortun send pit rage despair ; `` catastroph '' even lead suicid attempt . sayonara zetsub sensei satir slice-of-lif comedi set modern day , cover variou aspect japanes life cultur nozomu interact student : kiri komori , reclus refus leav school ; abiru kobushi , enigma frequent arriv class sever mysteri injuri ; hyper-optimist kafuuka fuura , nozomu 's polar opposit ; sever unusu girl , eccentr teacher . [ written mal rewrit ]



Creates Bag of Words (Tfidf Matrix)

In [None]:
# Creates a TF-IDF matrix
def TfidfGenerator(df):
  """Build the TF-IDF matrix for the synopses in `df`.

  Args:
    df: DataFrame with a 'synopsis' column of text documents.

  Returns:
    A DataFrame with one row per synopsis, in the order of `df`, and one
    column per term of the fitted vocabulary, holding the TF-IDF weights.
    Its first rows are also printed.
  """
  synopses = list(df['synopsis'])

  # Diagnostic: report the position and value of any synopsis that is a
  # float (presumably a NaN left over from a missing cell -- confirm).
  for i, sample in enumerate(synopses):
    if isinstance(sample, float):
      print(i)
      print(str(sample))

  vectorizer = TfidfVectorizer()
  vectors = vectorizer.fit_transform(synopses)
  feature_names = vectorizer.get_feature_names_out()

  # toarray() produces the dense ndarray in one step, so the weights are
  # never expanded into nested Python lists before the frame is built.
  tfidf_matrix = pd.DataFrame(vectors.toarray(), columns=feature_names)
  print(tfidf_matrix.head())
  return tfidf_matrix

In [None]:
# Build the TF-IDF matrix for the synopses of every show in df.
tfidf_matrix = TfidfGenerator(df)

    00  000  001  0015  0060  0068  0071  0079  0083  0087  ...  zutto  \
0  0.0  0.0  0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...    0.0   
1  0.0  0.0  0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...    0.0   
2  0.0  0.0  0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...    0.0   
3  0.0  0.0  0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...    0.0   
4  0.0  0.0  0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...    0.0   

   zvezda  zwei  éclair  élénoir  état  öhi  österreich  üso  ōtorii  
0     0.0   0.0     0.0      0.0   0.0  0.0         0.0  0.0     0.0  
1     0.0   0.0     0.0      0.0   0.0  0.0         0.0  0.0     0.0  
2     0.0   0.0     0.0      0.0   0.0  0.0         0.0  0.0     0.0  
3     0.0   0.0     0.0      0.0   0.0  0.0         0.0  0.0     0.0  
4     0.0   0.0     0.0      0.0   0.0  0.0         0.0  0.0     0.0  

[5 rows x 19008 columns]


Initializes the k-NN (k-nearest neighbors) algorithm

In [None]:
import numpy as np
from sklearn.neighbors import NearestNeighbors

# Number of neighbours requested from the index for each query.
number_of_shows = 100

# Configure the estimator, then fit it on the TF-IDF matrix. fit() returns the
# estimator itself, so `knnbr` is bound to the fitted model.
knnbr = NearestNeighbors(
    n_neighbors=number_of_shows,
    metric='euclidean',
    algorithm='ball_tree',
)
knnbr.fit(tfidf_matrix)
print('knn algorithm finished fitting td-idf matrix')

knn algorithm finished fitting td-idf matrix


Generates recommendations for a given set of queries (test_set)

In [None]:
# Number of leading shows used as evaluation queries.
test_size = 100

# The queries are the first `test_size` rows of both the dataset and the
# TF-IDF matrix. Slicing from row 0 leaves the matrix rows labelled
# 0..test_size-1 already, so it needs no index reset.
test_set = df[:test_size]
test_set = test_set.reset_index(drop=True)
test_set_matrix = tfidf_matrix[:test_size]


# The set of K Nearest Neighbors is added as recommendations to each show in the test set
knn = knnbr.kneighbors(test_set_matrix)
test_set.insert(test_set.shape[1], 'recs', '')
for i, show_recs in enumerate(knn[1]):
  # Adds only the id number. Position 0 is dropped on the assumption that the
  # nearest neighbour of every query is the query itself.
  test_set.at[i,'recs'] = (list(df.loc[show_recs, 'uid']))[1:]

print('Test set created with', test_set.shape[0], 'elements')

Test set created with 100 elements


Computes precision and recall

In [None]:
def get_precision_and_recall(test_set):
  """Compute the mean precision and mean recall of the recommendations.

  For each row, a hit is an entry of 'similar_shows' that also appears in
  'recs'. Precision is hits / len(recs) and recall is
  hits / len(similar_shows). The per-row values are written to the
  'precision' and 'recall' columns of `test_set` (the frame is modified in
  place) and their means are returned.

  Args:
    test_set: DataFrame whose 'recs' and 'similar_shows' cells are lists of
      hashable show ids.

  Returns:
    Tuple (mean precision, mean recall) over all rows.
  """
  precisions = []
  recalls = []
  # Walking the two columns together avoids label lookups, so the frame does
  # not need a 0..n-1 index.
  for recs, similar_shows in zip(test_set['recs'], test_set['similar_shows']):
    retrieved = set(recs)  # constant-time membership tests
    hits = sum(show in retrieved for show in similar_shows)
    # An empty list would divide by zero; with nothing retrieved (or nothing
    # relevant) the corresponding score is defined as 0.
    precisions.append(hits / len(recs) if len(recs) else 0.0)
    recalls.append(hits / len(similar_shows) if len(similar_shows) else 0.0)

  test_set['precision'] = pd.Series(precisions, index=test_set.index, dtype=float)
  test_set['recall'] = pd.Series(recalls, index=test_set.index, dtype=float)
  precision = test_set['precision'].mean()
  recall = test_set['recall'].mean()
  return precision, recall

In [None]:
# Mean precision and mean recall over the queries in test_set.
precision, recall = get_precision_and_recall(test_set)
print('PRECISION:', precision)
print('RECALL:', recall)

PRECISION: 0.07707070707070707
RECALL: 0.13419940220068086


Get Mean Average Precision

In [None]:
def get_mean_average_precision(test_set):
  """Compute the Mean Average Precision (MAP) of the recommendations.

  For one query, the average precision is the sum of precision@k over every
  rank k of 'recs' that holds a relevant show (one listed in
  'similar_shows'), divided by the number of relevant shows. Relevant shows
  that were never retrieved therefore contribute zero. MAP is the mean of
  that value over all queries.

  Args:
    test_set: DataFrame whose 'recs' cells are lists of show ids ordered
      from best to worst match, and whose 'similar_shows' cells list the
      relevant show ids of the query.

  Returns:
    The MAP as a float in [0, 1]. A query without relevant shows counts as
    an average precision of 0, and an empty `test_set` yields 0.0.
  """
  average_precisions = []

  # Iterates over all queries Q
  for recs, similar_shows in zip(test_set['recs'], test_set['similar_shows']):
    relevant = set(similar_shows)
    if not relevant:
      average_precisions.append(0.0)
      continue

    hits = 0
    precision_sum = 0.0
    found = set()
    for rank, show in enumerate(recs, start=1):
      # Each relevant show is counted once, at the first rank where it appears
      if show in relevant and show not in found:
        found.add(show)
        hits += 1
        precision_sum += hits / rank   # precision@rank
    average_precisions.append(precision_sum / len(relevant))

  if not average_precisions:
    return 0.0
  mean_average_precision = sum(average_precisions) / len(average_precisions)
  return mean_average_precision



In [None]:
# Mean Average Precision of the recommendations stored in test_set.
mean_average_precision = get_mean_average_precision(test_set)
print(mean_average_precision)

9.934086594178099e-07


Test cases of recommendations: shows list of recommendations for a given query

In [None]:
# Query the fitted index with the first three shows; knn[1] holds, for each
# query, the row positions of its neighbours.
knn = knnbr.kneighbors(tfidf_matrix[0:3])

for i, show in enumerate(knn[1]):
  print('Getting recommendations for:', df.loc[i,'title'])
  # Position 0 is left out by slicing; the remaining neighbours are numbered from 1.
  for j, index in enumerate(show[1:], start=1):
    print(j, df.loc[index, 'title'])
  print('-------------------------------------------------------------')

Getting recommendations for: Fullmetal Alchemist: Brotherhood
1 Fullmetal Alchemist
2 Joker Game
3 Escha & Logy no Atelier: Tasogare no Sora no Renkinjutsushi
4 Kaze no Shoujo Emily
5 Da Yu Hai Tang (Movie)
6 Oniichan dakedo Ai sae Areba Kankeinai yo ne!
7 Baccano!
8 No Game No Life: Zero
9 Solty Rei
10 Code Geass: Boukoku no Akito 2 - Hikisakareshi Yokuryuu
11 Cowboy Bebop: Tengoku no Tobira
12 Kyokou Suiri
13 Suisei no Gargantia
14 Kidou Keisatsu Patlabor the Movie
15 Garo: Vanishing Line
16 Blue Drop: Tenshi-tachi no Gikyoku
17 Dororo to Hyakkimaru
18 Kino no Tabi: The Beautiful World
19 Kidou Keisatsu Patlabor
20 Touch
21 Dororo
22 Cowboy Bebop
23 Kaiba
24 Gyo
25 Wan Jie Xian Zong
26 Loveless
27 Sword Art Online: Alicization
28 California Crisis: Tsuigeki no Juuka
29 Gosick
30 Choujin Gakuen Gowcaizer: The Voltage Fighters
31 Shisha no Teikoku
32 Fuyu no Semi
33 Galaxy Angel
34 Heisei Tanuki Gassen Ponpoko
35 Heroic Age
36 Omoide Poroporo
37 Suki de Suki de, Suki de The Animation
3

Evaluates the performance of the system across different numbers of recommended shows

In [None]:
def experiment(retrieval_sizes, test_size):
  """Print mean precision and recall for several neighbour-list sizes.

  For every size, a NearestNeighbors index is fitted on the notebook-level
  `tfidf_matrix`, the first `test_size` shows of the notebook-level `df` are
  used as queries, and the scores from get_precision_and_recall are printed.

  Args:
    retrieval_sizes: Iterable of `n_neighbors` values to evaluate. The first
      neighbour of each query is discarded, so a value of n scores n - 1
      recommendations.
    test_size: Number of leading rows of `df` used as queries.
  """
  # The query matrix does not depend on the retrieval size, so it is sliced
  # once. Slicing from row 0 keeps the rows labelled 0..test_size-1.
  test_set_matrix = tfidf_matrix[0:test_size]

  for retrieval_size in retrieval_sizes:
    knnbr = NearestNeighbors(n_neighbors = retrieval_size, metric = 'euclidean', algorithm = 'ball_tree').fit(tfidf_matrix)
    knn = knnbr.kneighbors(test_set_matrix)

    # A fresh frame per size: the scoring step adds columns to it.
    test_set = df[0:test_size].reset_index(drop=True)
    test_set.insert(test_set.shape[1], 'recs', '')

    for i, show_recs in enumerate(knn[1]):
      # Position 0 is dropped on the assumption that the nearest neighbour of
      # every query is the query itself.
      test_set.at[i,'recs'] = (list(df.loc[show_recs, 'uid']))[1:]

    precision, recall = get_precision_and_recall(test_set)
    print('PRECISION with', retrieval_size, 'recommendations:', precision)
    print('RECALL with', retrieval_size, 'recommendations:', recall)
    

In [None]:
# Evaluate retrieval sizes 21, 41 and 101 with a test size of 100.
experiment([21, 41, 101], 100)

PRECISION with 21 recommendations: 0.11100000000000002
RECALL with 21 recommendations: 0.04114814509122469
PRECISION with 41 recommendations: 0.09849999999999999
RECALL with 41 recommendations: 0.0678132186565307
PRECISION with 101 recommendations: 0.07680000000000001
RECALL with 101 recommendations: 0.1349263579618883
