# Keyword Extraction

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the preprocessed book table from disk and preview its first rows
fe_data = pd.read_csv(filepath_or_buffer="data/preprocessed.csv")
fe_data.head(n=5)

Unnamed: 0,Id,Name,PublishYear,Language,Description,Title,bow
0,1020396,the gospel of john,1998,,what sets this commentary on the fourth gospel...,the_gospel_of_john,francis_j._moloney michael_glazier
1,1073868,hanslick on the musically beautiful: sixteen l...,2003,,the sixteen lectures by geoffrey payzant in th...,hanslick_on_the_musically_beautiful:_sixteen_l...,geoffrey_payzant 1-877275-49-2
2,1025976,microserfs,1997,fre,génération x 1018 n° 2508 qui a connu un gros ...,microserfs,douglas_coupland 10/18 fre
3,1045943,courir avec des ciseaux,2006,fre,roman autobiographique choc courir avec des ci...,courir_avec_des_ciseaux,augusten_burroughs 10/18 fre
4,1027805,affinités,2006,fre,pour tromper son ennui une demoiselle de la bo...,affinités,sarah_waters 10/18 fre


In [3]:
# Per column: number of rows minus number of distinct values
len(fe_data) - fe_data.nunique()

Id                 0
Name              85
PublishYear    34403
Language       34466
Description      435
Title            244
bow             2433
dtype: int64

---
### Consider only English books

In [4]:
# Temporary restriction: keep only rows whose Language is one of the English codes.
# The trailing .copy() gives an independent frame for the later column edits.
fe_data = fe_data.loc[
    fe_data["Language"].isin(["eng", "en-US", "en-GB"])
].copy()

In [5]:
# Language was only needed for the English-only filter; PublishYear and Title are
# not read by any later cell in this notebook. Name the axis explicitly via
# `columns=` and rebind the result instead of mutating with inplace=True.
fe_data = fe_data.drop(columns=["Language", "PublishYear", "Title"])

---
### Extract keywords from description using KeyBERT

In [7]:
from keybert import KeyBERT
# One shared extractor instance, reused by every get_keywords() call.
# NOTE(review): no model argument is passed, so KeyBERT falls back to whatever
# its default embedding model is — confirm that default is the intended one.
kw_model = KeyBERT()

def get_keywords(text):
    """Return the KeyBERT keywords of ``text`` joined into one space-separated string.

    Parameters
    ----------
    text : str
        Free-text description to extract keywords from. Anything that is not
        a string (for example the float NaN pandas uses for a missing value)
        or that contains only whitespace yields ``""`` without calling the
        model.

    Returns
    -------
    str
        The extracted single-word keywords (``keyphrase_ngram_range=(1, 1)``,
        English stop words removed), in the order the model returns them,
        separated by single spaces.
    """
    # Guard first: when this function is applied over a DataFrame column, a
    # missing description arrives as a non-string value, and there is nothing
    # to extract from blank text either.
    if not isinstance(text, str) or not text.strip():
        return ""

    scored = kw_model.extract_keywords(
        text, keyphrase_ngram_range=(1, 1), stop_words="english"
    )
    # Each result item carries the keyword at index 0 (presumably followed by
    # its relevance score); only the keyword text is kept.
    return " ".join(entry[0] for entry in scored)

In [8]:
# Run keyword extraction on each description and store the result in a new column
fe_data["keywords"] = fe_data["Description"].apply(get_keywords)

In [10]:
# Quick look at the first few extracted keyword strings
fe_data["keywords"].head(n=5)

12                  memphis egypt delta governor thebes
15               proverb picket bells christmas stories
21                      emma paris french shes sullivan
25    moomintroll comet moominvalley adventures adve...
27                     acheron greeks trojan troy helen
Name: keywords, dtype: object

In [11]:
# Build one combined text field per book: the bag-of-words tokens followed by the
# extracted keywords, separated by a space. Missing values are treated as empty
# strings so the join never sees a non-string NaN.
fe_data["keywords"] = (
    fe_data[["bow", "keywords"]]
    .fillna("")
    .agg(" ".join, axis=1)
)
# `bow` is now part of `keywords` and `Description` has already been reduced to
# keywords, so both can go. Rebind the result instead of using inplace=True.
fe_data = fe_data.drop(columns=["bow", "Description"])

---
### Remove duplicate book names

In [12]:
# Show the rows whose Name repeats an earlier row (first occurrences are not flagged)
fe_data.loc[fe_data.duplicated(subset="Name", keep="first")]

Unnamed: 0,Id,Name,keywords
3235,1061929,the moon is a harsh mistress,robert_a._heinlein berkley_medallion eng lehr...
3616,1061923,the moon is a harsh mistress,robert_a._heinlein blackstone_publishing eng ...
7574,1038823,blood and chocolate,annette_curtis_klause delacorte_press eng wer...
9045,1098850,decline and fall,evelyn_waugh everyman's_library eng librarian...
9139,1023491,the remains of the day,kazuo_ishiguro faber_and_faber_ltd. eng steve...
11884,1037476,the great and secret show (book of the art #1),book_of_the_art_#1 clive_barker harpercollins_...
11969,1072552,the tiger who came to tea,judith_kerr harpercollinschildren’sbooks eng ...
12558,1086403,something for the weekend (leo street #1),leo_street_#1 pauline_mclynn headline eng leo ...
20296,1004432,"playing with fire (inspector banks, #14)","inspector_banks,_#14 peter_robinson pan_macmil..."
21653,1050062,the anastasia syndrome and other stories,mary_higgins_clark pocket_books eng anastasia...


In [13]:
# Keep only the first row encountered for each book name
fe_data = fe_data.drop_duplicates(subset="Name", keep="first")

---
### Save final dataset

In [14]:
# Write the final table as a comma-separated file, leaving out the row index
fe_data.to_csv(path_or_buf="data/keywords.csv", sep=",", index=False)