In [52]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [53]:
x = ["hello hello my name is Diana", "hello my name is Zarko"]

In [54]:
vec = CountVectorizer()
x_cv = vec.fit_transform(x)

In [55]:
vec.get_feature_names_out()

array(['diana', 'hello', 'is', 'my', 'name', 'zarko'], dtype=object)

In [56]:
x_cv.toarray()

array([[1, 2, 1, 1, 1, 0],
       [0, 1, 1, 1, 1, 1]])

In [57]:
pd.DataFrame( data = x_cv.toarray(), 
             columns = vec.get_feature_names_out() )

Unnamed: 0,diana,hello,is,my,name,zarko
0,1,2,1,1,1,0
1,0,1,1,1,1,1


In [58]:
vec = TfidfVectorizer()
x_tfidf = vec.fit_transform(x)

pd.DataFrame( data = x_tfidf.toarray(),
             columns = vec.get_feature_names_out() )

Unnamed: 0,diana,hello,is,my,name,zarko
0,0.469132,0.667582,0.333791,0.333791,0.333791,0.0
1,0.0,0.40909,0.40909,0.40909,0.40909,0.574962


$$\text{tf} = \frac{N(\text{token})}{N(\text{tokens})}$$

$$\text{IDF} = \frac{N(\text{doc})}{N(\text{doc with token})}$$

### To calcule the cosine how much they are close. Coseno

In [59]:
cosine_similarity(x_tfidf)  

array([[1.        , 0.68275315],
       [0.68275315, 1.        ]])

In [60]:
df = pd.read_csv('/Users/dianaterraza/Desktop/NLP/Data/Books.csv',
                 on_bad_lines='skip',
                 sep=';')

df.head()

Unnamed: 0,ISBN,Title,Author,Year,Publisher
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton & Company


Bild the similarity 

In [61]:
user_ratings = pd.read_csv('/Users/dianaterraza/Desktop/NLP/Data/users-ratings.csv')
user_ratings.head()

Unnamed: 0,User-ID,Age,ISBN,Rating
0,243,,60915544,10
1,243,,60977493,7
2,243,,156006529,0
3,243,,316096199,0
4,243,,316601950,9


In [62]:
df = df[df['ISBN'].isin(user_ratings['ISBN'])]

### 1. Preprocess the data 

In [63]:
df.dropna(subset = ['Title', 'Author'], inplace = True) 

In [64]:
df.drop_duplicates(subset = ['Title', 'Author'], inplace = True)

In [65]:
df['Title'] = df['Title'].apply(lambda s: s.lower())
df['Author'] = df['Author'].apply(lambda s: s.lower())

df['text'] = df['Title'] + ' ' + df['Author']
df.head()

Unnamed: 0,ISBN,Title,Author,Year,Publisher,text
18,440234743,the testament,john grisham,1999,Dell,the testament john grisham
19,452264464,beloved (plume contemporary fiction),toni morrison,1994,Plume,beloved (plume contemporary fiction) toni morr...
26,971880107,wild animus,rich shapero,2004,Too Far,wild animus rich shapero
27,345402871,airframe,michael crichton,1997,Ballantine Books,airframe michael crichton
28,345417623,timeline,michael crichton,2000,Ballantine Books,timeline michael crichton


In [66]:
df.isnull().sum()

ISBN         0
Title        0
Author       0
Year         0
Publisher    0
text         0
dtype: int64

In [67]:
df.reset_index(inplace=True, drop=True)

In [68]:
tfidf = TfidfVectorizer(stop_words = 'english')
tfidf_matrix = tfidf.fit_transform(df['text'])

### 2. Apply vectorizer to the text 

In [69]:
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['text'])

In [70]:
tfidf_matrix.shape

(1798, 3143)

### 3. Cosine similarity 

In [71]:
cosine_sim = cosine_similarity(tfidf_matrix)

In [72]:
cosine_sim[18].sum()

np.float64(1.0)

The function that will recommend us the books

Homework: pass the user_id and also book_id list of recommendatios excluding those that user has already read. 

In [None]:
def get_recommendations(isbn, cosine_sim = cosine_sim, top_n = 10):
  idx = df[df['ISBN']==isbn].index
  sim_scores = cosine_sim[idx][0]

  sim_scores = sorted(list(enumerate(sim_scores)), #enumerate the tuples (index,score)
                      key = lambda x: x[1], #lambda each tuple is sorted based on the second element (x[1], which is the similarity score)
                      reverse=True) # reverse the list so the highest score is first
  #tuples (index, score) are sorted based on the score

  sim_scores = sim_scores[1:top_n+1] #1 start for the next that we already readed. 
  book_indicies = [i[0] for i in sim_scores]

  return df.iloc[book_indicies]

content base approach to the recommendations 

In [74]:
df[df['ISBN'] == '0060987529']

Unnamed: 0,ISBN,Title,Author,Year,Publisher,text
726,60987529,confessions of an ugly stepsister : a novel,gregory maguire,2000,Regan Books,confessions of an ugly stepsister : a novel gr...


In [75]:
df[df['ISBN'] == '0440234743']

Unnamed: 0,ISBN,Title,Author,Year,Publisher,text
0,440234743,the testament,john grisham,1999,Dell,the testament john grisham


In [76]:
get_recommendations('0440234743')

Unnamed: 0,ISBN,Title,Author,Year,Publisher,text
61,0385497466,the brethren,john grisham,2000,Doubleday,the brethren john grisham
154,0385424728,the chamber,john grisham,1994,Doubleday Books,the chamber john grisham
155,0385472951,the partner,john grisham,1997,Doubleday Books,the partner john grisham
229,044022165X,the rainmaker,john grisham,1996,Dell,the rainmaker john grisham
325,0385510438,the last juror,john grisham,2004,Doubleday,the last juror john grisham
394,0440236673,the brethren,john grisham,2000,Island,the brethren john grisham
509,0385424736,the rainmaker,john grisham,1995,Doubleday Books,the rainmaker john grisham
64,0385511612,bleachers,john grisham,2003,Doubleday,bleachers john grisham
99,038542471X,the client,john grisham,1993,Doubleday Books,the client john grisham
114,044021145X,the firm,john grisham,1992,Bantam Dell Publishing Group,the firm john grisham
