# Content-Based Filtering and Feature Extraction

### 1. Import Libraries:

In [8]:
# Data Manipulation Libraries:
import numpy as np
import pandas as pd
# Data Visualization Libraries:
import matplotlib.pyplot as plt
import seaborn as sns
# Data Modeling Libraries:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import precision_score, recall_score, f1_score

### 2. Load the Data:

In [9]:
# Read the GoodReads book catalogue from the CSV in the working directory.
data = pd.read_csv(
    filepath_or_buffer='GoodReads_100k_books.csv',
)
# Print the schema summary: per-column non-null counts, dtypes and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 13 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   author        100000 non-null  object 
 1   bookformat    96772 non-null   object 
 2   desc          93228 non-null   object 
 3   genre         89533 non-null   object 
 4   img           96955 non-null   object 
 5   isbn          85518 non-null   object 
 6   isbn13        88565 non-null   object 
 7   link          100000 non-null  object 
 8   pages         100000 non-null  int64  
 9   rating        100000 non-null  float64
 10  reviews       100000 non-null  int64  
 11  title         99999 non-null   object 
 12  totalratings  100000 non-null  int64  
dtypes: float64(1), int64(3), object(9)
memory usage: 9.9+ MB


In [4]:
# Preview the first five rows (5 is the pandas default, spelled out here).
# NOTE(review): the `desc` of row 4 renders as mojibake in the captured output,
# which suggests the text was mis-encoded upstream of this notebook - confirm.
data.head(n=5)

Unnamed: 0,author,bookformat,desc,genre,img,isbn,isbn13,link,pages,rating,reviews,title,totalratings
0,Laurence M. Hauptman,Hardcover,Reveals that several hundred thousand Indians ...,"History,Military History,Civil War,American Hi...",https://i.gr-assets.com/images/S/compressed.ph...,002914180X,9780000000000.0,https://goodreads.com/book/show/1001053.Betwee...,0,3.52,5,Between Two Fires: American Indians in the Civ...,33
1,"Charlotte Fiell,Emmanuelle Dirix",Paperback,Fashion Sourcebook - 1920s is the first book i...,"Couture,Fashion,Historical,Art,Nonfiction",https://i.gr-assets.com/images/S/compressed.ph...,1906863482,9780000000000.0,https://goodreads.com/book/show/10010552-fashi...,576,4.51,6,Fashion Sourcebook 1920s,41
2,Andy Anderson,Paperback,The seminal history and analysis of the Hungar...,"Politics,History",https://i.gr-assets.com/images/S/compressed.ph...,948984147,9780000000000.0,https://goodreads.com/book/show/1001077.Hungar...,124,4.15,2,Hungary 56,26
3,Carlotta R. Anderson,Hardcover,"""All-American Anarchist"" chronicles the life a...","Labor,History",https://i.gr-assets.com/images/S/compressed.ph...,814327079,9780000000000.0,https://goodreads.com/book/show/1001079.All_Am...,324,3.83,1,All-American Anarchist: Joseph A. Labadie and ...,6
4,Jean Leveille,,"Aujourdâ€™hui, lâ€™oiseau nous invite Ã sa ta...",,https://i.gr-assets.com/images/S/compressed.ph...,2761920813,,https://goodreads.com/book/show/10010880-les-o...,177,4.0,1,Les oiseaux gourmands,1


In [5]:
# Summarise the object-dtype (text) columns: count, unique, top value and its frequency.
# NOTE(review): the captured output reports only 725 unique `isbn13` values across
# 88565 non-null rows, so the column looks precision-damaged in the source file -
# confirm before relying on it as an identifier.
data.describe(include=['O'])

Unnamed: 0,author,bookformat,desc,genre,img,isbn,isbn13,link,title
count,100000,96772,93228,89533,96955,85518,88565.0,100000,99999
unique,68767,202,92499,72129,96955,85518,725.0,100000,97588
top,Mi-Ri Hwang,Paperback,This scarce antiquarian book is a facsimile re...,Nonfiction,https://i.gr-assets.com/images/S/compressed.ph...,313382077,9780000000000.0,https://goodreads.com/book/show/1731941.Scrap_...,Love in the Mask
freq,137,53855,77,486,1,1,49096.0,1,32


In [10]:
# Feature extraction: turn each book's genre string into a TF-IDF vector.
# NOTE(review): with default settings the vectorizer tokenizes on word boundaries,
# so a comma-separated label such as "Military History" presumably contributes the
# separate terms "military" and "history" rather than one genre token - confirm this
# is the intended granularity.
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(
    # Books without a genre are mapped to an empty document instead of NaN.
    raw_documents=data.loc[:, "genre"].fillna(value=""),
)

In [11]:
# Save the TF-IDF matrix to CSV.
# The matrix is sparse; calling .toarray() on all of it at once materialises a dense
# (n_books x n_terms) float array purely to serialise it. Writing in row chunks
# produces the same file while keeping peak memory bounded by the chunk size.
def _save_sparse_as_csv(matrix, columns, path, chunk_rows=10_000):
    """Write a sparse matrix to a CSV file, densifying only `chunk_rows` rows at a time.

    Parameters
    ----------
    matrix : sparse matrix supporting row slicing and ``.toarray()``
        The values to write, one CSV row per matrix row.
    columns : sequence of str
        Header names, one per matrix column.
    path : str
        Destination file; it is overwritten if it already exists.
    chunk_rows : int, default 10_000
        Maximum number of rows converted to a dense array at once.
    """
    n_rows = matrix.shape[0]
    # max(..., 1) keeps one iteration for an empty matrix so a header-only file
    # is still written, matching a single to_csv call on an empty frame.
    for start in range(0, max(n_rows, 1), chunk_rows):
        is_first = start == 0
        dense_chunk = matrix[start:start + chunk_rows].toarray()
        pd.DataFrame(dense_chunk, columns=columns).to_csv(
            path,
            mode="w" if is_first else "a",  # first chunk truncates, later chunks append
            header=is_first,                # header is written exactly once
            index=False,
        )


_save_sparse_as_csv(tfidf_matrix, tfidf.get_feature_names_out(), "tfidf_matrix.csv")