# **Importing and Installing Dependencies**

In [1]:
import pandas as pd
import spacy
import re
import numpy as np
import gensim.downloader
from gensim.models import Word2Vec
import matplotlib.pyplot as plt
import pickle


In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
path = '/content/drive/MyDrive/Text Mining Project/'
with open(path+'anime_summarized.pkl', 'rb') as f:
    anime_df = pickle.load(f)

We divide the datasets in two parts, one with the genres and the other that does not have the genres.

In [None]:
anime_df_genre = anime_df[anime_df['Genres'].notnull()]
anime_df_nogenre = anime_df[anime_df['Genres'].isnull()]

# **Data Preprocessing**

In [None]:
anime_df_nogenre['text'] = anime_df_nogenre['Name'] + ' ' + anime_df_nogenre['Synopsis']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  anime_df_nogenre['text'] = anime_df_nogenre['Name'] + ' ' + anime_df_nogenre['Synopsis']


In [None]:
def concat_text(row):

  if not isinstance(row['Producers'], float):
    row['text'] = row['text'] + ' ' + row['Producers']

  if not isinstance(row['Studios'], float):
    row['text'] = row['text'] + ' ' + row['Studios']

  return row

anime_df_nogenre = anime_df_nogenre.apply(concat_text, axis = 1)

In [None]:
anime_df_nogenre.head()

Unnamed: 0,Name,Genres,Synopsis,Producers,Studios,summarized_synopsis,text
9261,Mahou no Princess Minky Momo,,"Momo is the princess of Fenarinarsa, a land of...",Yomiko Advertising,Ashi Productions,She's ready and willing to help anyone she can...,Mahou no Princess Minky Momo Momo is the princ...
9262,Hikaru no Go: Hokuto Hai e no Michi,,Hikaru is given a phone call asking him to app...,Dentsu,Pierrot,Hikaru is given a phone call asking him to app...,Hikaru no Go: Hokuto Hai e no Michi Hikaru is ...
9263,Shinshaku Sengoku Eiyuu Densetsu: Sanada Juu Y...,,"In the Fifth Year of the Keicho era, the Tokug...",Magic Capsule,T.P.O,Hidetada Tokugawa has come to the doorsteps of...,Shinshaku Sengoku Eiyuu Densetsu: Sanada Juu Y...
9264,Nitaboh,,"At a young age, Nitaroh is stricken with an il...",,WAO World,"In time, with the help of friends old and new,...","Nitaboh At a young age, Nitaroh is stricken wi..."
9265,Flag,,"In 20xx, a civil war broke out in a small coun...",Aniplex,The Answer Studio,But a picture taken by accident in the battlef...,"Flag In 20xx, a civil war broke out in a small..."


In [None]:
nlp = spacy.load("en_core_web_sm")
stop_words = set(nlp.Defaults.stop_words)
stop_words.add('new')
stop_words.add('find')
stop_words.add('world')
stop_words.add('year')
stop_words.add('life') # very common words

In [None]:
def clean_text(text):
  text = text.replace('\n',' ') #remove newlines
  text = text.lower() #lowercase
  text = re.sub(r"\([Ss]ource: [A-Za-z0-9]*\)", "", text) #remove the source of the synopsis
  text = re.sub(r"[^a-z\s]+"," ",text) #remove puntuaction and symbols
  text = " ".join(re.split("\s+", text, flags=re.UNICODE)) #remove multiple whitespaces
  text = re.sub(r'\b\w\b', ' ', text) #remove single letters
  return text

def preprocess_text(text):
  #lemmatize
  doc = nlp(text)
  text = " ".join([token.lemma_ for token in doc])
  text = text.split(' ')
  #remove stopwords
  lst=[]
  for token in text:
    if token not in stop_words:
        lst.append(token)
  text = " ".join([word for word in lst])
  return text

In [None]:
anime_df_nogenre['cleaned_synopsis'] = anime_df_nogenre['text'].apply(lambda x: clean_text(x))

In [None]:
anime_df_nogenre['preprocessed_synopsis'] = anime_df_nogenre['cleaned_synopsis'].apply(lambda x: preprocess_text(x))

In [None]:
anime_df_nogenre['preprocessed_synopsis'] = anime_df_nogenre['preprocessed_synopsis'].apply(lambda x: clean_text(x))

In [None]:
anime_df_nogenre.head()

Unnamed: 0,Name,Genres,Synopsis,Producers,Studios,summarized_synopsis,text,cleaned_synopsis,preprocessed_synopsis
9261,Mahou no Princess Minky Momo,,"Momo is the princess of Fenarinarsa, a land of...",Yomiko Advertising,Ashi Productions,She's ready and willing to help anyone she can...,Mahou no Princess Minky Momo Momo is the princ...,mahou no princess minky momo momo is the princ...,mahou princess minky momo momo princess fenari...
9262,Hikaru no Go: Hokuto Hai e no Michi,,Hikaru is given a phone call asking him to app...,Dentsu,Pierrot,Hikaru is given a phone call asking him to app...,Hikaru no Go: Hokuto Hai e no Michi Hikaru is ...,hikaru no go hokuto hai no michi hikaru is g...,hikaru hokuto hai michi hikaru phone ask appea...
9263,Shinshaku Sengoku Eiyuu Densetsu: Sanada Juu Y...,,"In the Fifth Year of the Keicho era, the Tokug...",Magic Capsule,T.P.O,Hidetada Tokugawa has come to the doorsteps of...,Shinshaku Sengoku Eiyuu Densetsu: Sanada Juu Y...,shinshaku sengoku eiyuu densetsu sanada juu yu...,shinshaku sengoku eiyuu densetsu sanada juu yu...
9264,Nitaboh,,"At a young age, Nitaroh is stricken with an il...",,WAO World,"In time, with the help of friends old and new,...","Nitaboh At a young age, Nitaroh is stricken wi...",nitaboh at young age nitaroh is stricken wit...,nitaboh young age nitaroh stricken illness lea...
9265,Flag,,"In 20xx, a civil war broke out in a small coun...",Aniplex,The Answer Studio,But a picture taken by accident in the battlef...,"Flag In 20xx, a civil war broke out in a small...",flag in xx civil war broke out in small co...,flag xx civil war break small country asia spi...


In [None]:
anime_df_nogenre['preprocessed_synopsis'] = anime_df_nogenre['preprocessed_synopsis'].apply(lambda x: clean_text(x))

## **Inference**

After the preprocessing, we can make inference on the genres that we do not have in our dataset.

We start by recalling the utils defined in the previous notebooks.

In [None]:
with open(path+ '/models/' + 'multilabel_binarizer.pkl', 'rb') as f:
    multilabel_binarizer = pickle.load(f)

In [None]:
with open(path+ '/models/' + 'tfidf_vectorizer.pkl', 'rb') as f:
    tfidf_vectorizer = pickle.load(f)

In [None]:
with open(path+ '/models/' + 'best_ml_model.pkl', 'rb') as f:
    clf = pickle.load(f)

We vectorize the synopsis with the *TF-IDF Vectorizer*.

In [None]:
X_vectorized = tfidf_vectorizer.transform(anime_df_nogenre['preprocessed_synopsis'])

In [None]:
X_vectorized.shape

(483, 1077)

We reconstruct the *Genres* column according to the predictions of the model.

In [None]:
anime_df_nogenre['Genres'] = list(multilabel_binarizer.inverse_transform(clf.predict(X_vectorized)))

In [None]:
anime_df_nogenre['Genres'] = anime_df_nogenre['Genres'].apply(lambda x: list(x))

We reconstruct the original dataframe.

In [None]:
anime_df = pd.concat([anime_df_genre,anime_df_nogenre[['Name','Genres','Synopsis','Producers','Studios','summarized_synopsis']]])

In [None]:
anime_df.head()

Unnamed: 0,Name,Genres,Synopsis,Producers,Studios,summarized_synopsis
0,Cowboy Bebop,"[Fantasy and Supernatural, Action and Adventure]","Crime is timeless. By the year 2071, humanity ...",Bandai Visual,Sunrise,"These new societies are plagued by murder, dru..."
1,Cowboy Bebop: Tengoku no Tobira,"[Fantasy and Supernatural, Action and Adventure]","Another day, another bounty—such is the life o...","Sunrise, Bandai Visual",Bones,"Through their individual investigations, they ..."
2,Trigun,"[Fantasy and Supernatural, Action and Adventure]","Vash the Stampede is the man with a $$60,000,0...",Victor Entertainment,Madhouse,With his crazy doughnut obsession and buffooni...
3,Witch Hunter Robin,"[Drama and Mistery, Action and Adventure, Fant...",Robin Sena is a powerful craft user drafted in...,"Bandai Visual, Dentsu, Victor Entertainment, T...",Sunrise,Robin Sena is a powerful craft user drafted in...
4,Bouken Ou Beet,"[Fantasy and Supernatural, Action and Adventure]",It is the dark century and the people are suff...,"TV Tokyo, Dentsu",Toei Animation,It is the dark century and the people are suff...


134 animes are still without genres because the classifier did not predict any genre for those observations. So, we remove those animes from the dataframe.

In [21]:
anime_df = anime_df[anime_df['Genres'].notnull()].reset_index(drop=True)

In [None]:
def make_genres(list_genres):
  genres = ''
  for genre in list_genres:
    genres += genre + ',' + ' '
  return genres

In [None]:
anime_df['Genres'] = anime_df['Genres'].apply(lambda x: make_genres(x))

In [None]:
anime_df['Genres'] = anime_df['Genres'].str.replace(r',\s*$', '', regex = True)

In [23]:
anime_df

Unnamed: 0,Name,Genres,Synopsis,Producers,Studios,summarized_synopsis,Image URL
0,Cowboy Bebop,"Fantasy and Supernatural, Action and Adventure","Crime is timeless. By the year 2071, humanity ...",Bandai Visual,Sunrise,"These new societies are plagued by murder, dru...",https://cdn.myanimelist.net/images/anime/4/196...
1,Cowboy Bebop: Tengoku no Tobira,"Fantasy and Supernatural, Action and Adventure","Another day, another bounty—such is the life o...","Sunrise, Bandai Visual",Bones,"Through their individual investigations, they ...",https://cdn.myanimelist.net/images/anime/1439/...
2,Trigun,"Fantasy and Supernatural, Action and Adventure","Vash the Stampede is the man with a $$60,000,0...",Victor Entertainment,Madhouse,With his crazy doughnut obsession and buffooni...,https://cdn.myanimelist.net/images/anime/7/203...
3,Witch Hunter Robin,"Drama and Mistery, Action and Adventure, Fanta...",Robin Sena is a powerful craft user drafted in...,"Bandai Visual, Dentsu, Victor Entertainment, T...",Sunrise,Robin Sena is a powerful craft user drafted in...,https://cdn.myanimelist.net/images/anime/10/19...
4,Bouken Ou Beet,"Fantasy and Supernatural, Action and Adventure",It is the dark century and the people are suff...,"TV Tokyo, Dentsu",Toei Animation,It is the dark century and the people are suff...,https://cdn.myanimelist.net/images/anime/7/215...
...,...,...,...,...,...,...,...
9605,Overtake!,Sport and Slice of Life,Freelance photographer Kouya Madoka is in the ...,"Lantis, F.M.F, Kadokawa",TROYCA,While working on a story at the Fuji Internati...,https://cdn.myanimelist.net/images/anime/1916/...
9606,Jianzhen da Heshang,"Action and Adventure, Sport and Slice of Life","In 742 A.D., the first year of Xuantianbao in ...",,,"In terms of cultural influence, the architectu...",https://cdn.myanimelist.net/images/anime/1391/...
9607,Chang An San Wan Li,Action and Adventure,"A few years after the Anshi Rebellion, the Tib...",,Light Chaser Animation Studios,"A few years after the Anshi Rebellion, the Tib...",https://cdn.myanimelist.net/images/anime/1580/...
9608,Pon no Michi,"Action and Adventure, Comedy and Romance, Spor...","Nashiko Jippensha, a high school girl living i...","Bit grooove promotion, SUPA LOVE",OLM,When Naoko learned that the parlor that her fa...,https://cdn.myanimelist.net/images/anime/1796/...


## **Saving the Dataframe**

In [24]:
with open(path+'anime_summarized.pkl', 'wb') as f:
    anime_df.to_pickle(f)