# Keyword Extraction

In [1]:
import pandas as pd
import numpy as np

In [3]:
# Colab-only: mount Google Drive at /content/drive so the project files stored there are reachable
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
cd /content/drive/MyDrive/nlp-for-book-recommendation-main/nlp-for-book-recommendation-main/

/content/drive/MyDrive/nlp-for-book-recommendation-main/nlp-for-book-recommendation-main


In [5]:
# Load only the needed columns of the preprocessed dataset and preview the first rows
fe_data = pd.read_csv(
    "data/preprocessed.csv",
    usecols=["Id", "Name", "Language", "Description", "bow"],
)
fe_data.head()

Unnamed: 0,Id,Name,Language,Description,bow
0,1020396,the gospel of john,eng,what sets this commentary on the fourth gospel...,francis_j._moloney michael_glazier eng
1,1073868,hanslick on the musically beautiful: sixteen l...,eng,the sixteen lectures by geoffrey payzant in th...,geoffrey_payzant 1-877275-49-2 eng
2,1025976,microserfs,fre,génération x 1018 n° 2508 qui a connu un gros ...,douglas_coupland 10/18 fre
3,1045943,courir avec des ciseaux,fre,roman autobiographique choc courir avec des ci...,augusten_burroughs 10/18 fre
4,1027805,affinités,fre,pour tromper son ennui une demoiselle de la bo...,sarah_waters 10/18 fre


In [6]:
# Per column: number of rows minus number of distinct values, i.e. how many rows repeat a value.
# NOTE: nunique() ignores NaN by default, so missing entries are counted in this surplus as well.
len(fe_data) - fe_data.nunique()

Unnamed: 0,0
Id,0
Name,85
Language,34257
Description,435
bow,2194


---
### Consider only English books

In [7]:
# Temporary restriction: keep only books whose language code is an English variant
english_codes = ["eng", "en-US", "en-GB"]
is_english = fe_data["Language"].isin(english_codes)
fe_data = fe_data[is_english].copy()

In [8]:
pip install keybert

Collecting keybert
  Downloading keybert-0.8.5-py3-none-any.whl.metadata (15 kB)
Downloading keybert-0.8.5-py3-none-any.whl (37 kB)
Installing collected packages: keybert
Successfully installed keybert-0.8.5


---
### Extract keywords from descriptions using KeyBERT

In [9]:
from keybert import KeyBERT
# Build the extractor once and reuse it for every row; with no arguments KeyBERT loads its
# default embedding model, which is downloaded from the Hugging Face Hub on first run
# (see the download progress in this cell's output).
kw_model = KeyBERT()

def get_keywords(text):
    """Return the KeyBERT keywords of ``text`` as one space-separated string.

    Parameters
    ----------
    text : str
        Free-text description to extract keywords from. Missing values
        (``None`` / ``NaN``) and blank strings are accepted and yield ``""``
        without invoking the model.

    Returns
    -------
    str
        Single-word keywords (English stop words excluded), joined by single
        spaces in the order the model returns them; ``""`` when there is
        nothing to extract.
    """
    # pandas represents a missing Description as float NaN, which the model cannot tokenize
    if not isinstance(text, str) or not text.strip():
        return ""
    extracted = kw_model.extract_keywords(text, keyphrase_ngram_range=(1, 1), stop_words="english")
    # Each result's first element is the keyword itself; the rest is discarded
    return " ".join(item[0] for item in extracted)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [10]:
# Run keyword extraction on every description (one model call per row, so this cell is slow)
fe_data["keywords"] = fe_data["Description"].map(get_keywords)

In [11]:
# Spot-check the extracted keywords for the first few rows
fe_data["keywords"].head()

Unnamed: 0,keywords
0,gospel narrative moloneys fourth commentary
1,musikalischschnen hanslicks hanslick musically...
6,nun memoir wars jewish war
8,nursery rockinghorse kentuckians anglosaxon ba...
9,railway father engineer perseverance machinery


In [12]:
# Prefix the extracted keywords with the `bow` tokens, treating missing values as empty strings.
# Vectorized string concatenation replaces the row-by-row ' '.join aggregation.
fe_data["keywords"] = fe_data["bow"].fillna("") + " " + fe_data["keywords"].fillna("")
# Both source columns are now folded into `keywords`; reassign instead of mutating in place
fe_data = fe_data.drop(columns=["bow", "Description"])

---
### Remove duplicated book names

In [13]:
# Rows whose Name has already appeared earlier in the frame (the first occurrence is not flagged)
repeated_name = fe_data.duplicated(subset=["Name"], keep="first")
fe_data[repeated_name]

Unnamed: 0,Id,Name,Language,keywords
1492,1078605,rising to the occasion,eng,linda_taylor arrow_books_ltd eng novel cathy ...
2941,1078370,ainsley harriott's low fat meals in minutes,eng,ainsley_harriott bbc_books eng ainsleys ainsl...
3235,1061929,the moon is a harsh mistress,eng,robert_a._heinlein berkley_medallion eng lehr...
3310,1061015,winds of autumn,eng,janette_oke bethany_house_publishers eng josh...
3616,1061923,the moon is a harsh mistress,eng,robert_a._heinlein blackstone_publishing eng ...
...,...,...,...,...
31859,1001528,natasha and other stories,eng,david_bezmozgis vintage eng bermans bella ber...
32246,1083746,among the thugs,eng,bill_buford w._w._norton_&_company eng buford...
33626,1041880,the red and the black,eng,stendhal wordsworth_editions eng priesthood a...
34093,1088848,the duke of flatbush,eng,duke_snider zebra eng baseballs dodgers baseb...


In [14]:
# Keep only the first row for each distinct Name
is_repeat = fe_data.duplicated(subset=["Name"], keep='first')
fe_data = fe_data[~is_repeat]

---
### Save final dataset

In [15]:
# Write the final table as comma-separated text, omitting the index column
fe_data.to_csv(
    "data/keywords.csv",
    sep=",",
    index=False,
)