# Keyword Extraction

In [45]:
import pandas as pd
import numpy as np

In [63]:
# Load the preprocessed book table from disk and preview its first rows.
PREPROCESSED_CSV = "data/preprocessed.csv"
fe_data = pd.read_csv(PREPROCESSED_CSV)
fe_data.head(5)

Unnamed: 0,Id,Name,Language,Description,bow
0,1020396,the gospel of john,,what sets this commentary on the fourth gospel...,the_gospel_of_john francis_j._moloney michael...
1,1073868,hanslick on the musically beautiful: sixteen l...,,the sixteen lectures by geoffrey payzant in th...,hanslick_on_the_musically_beautiful:_sixteen_l...
2,1025976,microserfs,fre,génération x 1018 n° 2508 qui a connu un gros ...,microserfs douglas_coupland 10/18 fre 1997
3,1045943,courir avec des ciseaux,fre,roman autobiographique choc courir avec des ci...,courir_avec_des_ciseaux augusten_burroughs 10...
4,1027805,affinités,fre,pour tromper son ennui une demoiselle de la bo...,affinités sarah_waters 10/18 fre 2006


In [56]:
# Per column: number of rows minus number of distinct values.
# nunique() skips NaN by default, so missing cells also count toward this surplus.
len(fe_data) - fe_data.nunique()

Id                 0
Name              85
Language       34466
Description      435
bow                3
dtype: int64

---
### Consider only English books

In [64]:
# Temporarily restrict processing to rows whose language code is one of the English variants.
english_codes = ["eng", "en-US", "en-GB"]
is_english = fe_data["Language"].isin(english_codes)
# .copy() detaches the result so later in-place edits do not touch the loaded frame.
fe_data = fe_data.loc[is_english].copy()

In [65]:
# Remove the language column in place; it is not used by any later cell.
fe_data.drop(columns=["Language"], inplace=True)

---
### Extract keywords from description using KeyBERT

In [66]:
from keybert import KeyBERT
# Build the extractor once; get_keywords reuses this single instance for every description.
# No model argument is passed, so KeyBERT falls back to its library default.
kw_model = KeyBERT()

def get_keywords(text):
    """Extract single-word keywords from ``text`` and return them space-separated.

    Parameters
    ----------
    text : str
        Free-text description to analyse. Non-string values (for example the
        float NaN pandas uses for a missing cell) and blank strings are
        accepted and yield an empty result instead of being sent to the model.

    Returns
    -------
    str
        The keywords joined by single spaces, in the order the model returned
        them; ``""`` when there is nothing to extract from.
    """
    # Guard before touching the model: a missing description is not a str and
    # cannot be tokenized. NOTE(review): how KeyBERT itself fails on such input
    # varies by version, so it is handled here rather than relied upon.
    if not isinstance(text, str) or not text.strip():
        return ""
    # Unigrams only, with English stop words excluded from the candidates.
    ranked = kw_model.extract_keywords(text, keyphrase_ngram_range=(1, 1), stop_words="english")
    # Each result item carries the keyword in first position; keep only that part.
    return " ".join(item[0] for item in ranked)

In [67]:
# Run keyword extraction on each description and store the result in a new column.
fe_data["keywords"] = fe_data["Description"].map(get_keywords)

In [69]:
# Put the bag-of-words tokens in front of the extracted keywords, separated by one space.
# Missing values on either side are treated as empty strings.
bow_text = fe_data["bow"].fillna("")
keyword_text = fe_data["keywords"].fillna("")
fe_data["keywords"] = bow_text + " " + keyword_text
# The two source columns are now folded into "keywords", so remove them in place.
fe_data.drop(columns=["bow", "Description"], inplace=True)

---
### Remove duplicated book names

In [70]:
# Display every row whose Name already appeared in an earlier row.
repeated_name = fe_data["Name"].duplicated(keep="first")
fe_data[repeated_name]

Unnamed: 0,Id,Name,bow
259,1061925,the moon is a harsh mistress,the_moon_is_a_harsh_mistress robert_a._heinle...
1472,1050054,the anastasia syndrome and other stories,the_anastasia_syndrome_and_other_stories mary...
1563,1097912,india and tibet,india_and_tibet francis_younghusband asian_ed...
3235,1061929,the moon is a harsh mistress,the_moon_is_a_harsh_mistress robert_a._heinle...
3616,1061923,the moon is a harsh mistress,the_moon_is_a_harsh_mistress robert_a._heinle...
6466,1038822,blood and chocolate,blood_and_chocolate annette_curtis_klause cor...
7071,1039376,decline and fall,decline_and_fall evelyn_waugh csa_word en-GB ...
7574,1038823,blood and chocolate,blood_and_chocolate annette_curtis_klause del...
8167,1041886,the red and the black,the_red_and_the_black stendhal dover_publicat...
9045,1098850,decline and fall,decline_and_fall evelyn_waugh everyman's_libr...


In [71]:
# Keep the first occurrence of each book name and discard the later repeats.
fe_data = fe_data[~fe_data["Name"].duplicated(keep="first")]

---
### Save final dataset

In [None]:
# Write the final table as comma-separated values, leaving the row index out of the file.
fe_data.to_csv("data/keywords.csv", index=False)