# Homework 1
***
## Importing Libraries

In [54]:
import pandas as pd
import nltk
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

## Reading Data

In [32]:
df = pd.read_csv('books_data.csv')
df.head()

Unnamed: 0,bookID,title,authors,average_rating
0,1,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling/Mary GrandPré,4.57
1,2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling/Mary GrandPré,4.49
2,4,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling,4.42
3,5,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling/Mary GrandPré,4.56
4,8,Harry Potter Boxed Set Books 1-5 (Harry Potte...,J.K. Rowling/Mary GrandPré,4.78


## Data Exploration

In [33]:
df.tail(5)

Unnamed: 0,bookID,title,authors,average_rating
11122,45631,Expelled from Eden: A William T. Vollmann Reader,William T. Vollmann/Larry McCaffery/Michael He...,4.06
11123,45633,You Bright and Risen Angels,William T. Vollmann,4.08
11124,45634,The Ice-Shirt (Seven Dreams #1),William T. Vollmann,3.96
11125,45639,Poor People,William T. Vollmann,3.72
11126,45641,Las aventuras de Tom Sawyer,Mark Twain,3.91


## Data get shape information

In [34]:
df.shape

(11127, 4)

## Data check for null values

In [35]:
df.isnull().sum()

bookID            0
title             0
authors           0
average_rating    0
dtype: int64

## Get only 7000 samples and drop bookID column

In [36]:
df =df.sample(7000).drop(['bookID', 'average_rating'], axis=1).reset_index(drop=True)

In [37]:
print(df.shape)
df.head()

(7000, 2)


Unnamed: 0,title,authors
0,The Names,Don DeLillo
1,The Awakening (Vampire Huntress #2),L.A. Banks
2,Relentless Desire,Sandra Brown
3,Sometimes a Great Notion,Ken Kesey/Charles Bowden
4,Jojo's Bizarre Adventure Tome 6: Jojo contre ...,Hirohiko Araki


## Text Preprocessing

In [38]:
df['text'] = df['title'].str.lower().replace(r'^\w\s', ' ').replace(r'\n', ' ', regex = True)

## Create tokenizitation function

In [39]:
stemmer = PorterStemmer()

def tokenization(txt):
    tokens = nltk.word_tokenize(txt)
    stemming = [stemmer.stem(w) for w in tokens]
    return " ".join(stemming)

## Tokenize titles

In [40]:
df['text'] = df['text'].apply(lambda x: tokenization(x))

## Create TFIDF Matrix and calculate cosine similarity

In [42]:
tfidvector = TfidfVectorizer(analyzer='word',stop_words='english')
matrix = tfidvector.fit_transform(df['text'])
similarity = cosine_similarity(matrix)

## Check first row


In [43]:
similarity[0]

array([0., 0., 0., ..., 0., 0., 0.])

In [47]:
df[df['title'] == "Sometimes a Great Notion"]

Unnamed: 0,title,authors,text
3,Sometimes a Great Notion,Ken Kesey/Charles Bowden,sometim a great notion


## Create recomendation function

In [50]:
def recommendation(song_df):
    idx = df[df['title'] == song_df].index[0]
    distances = sorted(list(enumerate(similarity[idx])),reverse=True,key=lambda x:x[1])

    songs = []
    for m_id in distances[1:21]:
        songs.append(df.iloc[m_id[0]].title)

    return songs

## Recommendation for 'Sometimes a Great Notion'

In [51]:
recommendation('Sometimes a Great Notion')

['The Great World',
 'The Alchemist’s Kitchen: Extraordinary Potions & Curious Notions',
 'The Great Gatsby',
 'The Great Gatsby',
 'The Great Gatsby',
 'The Great Gatsby',
 'The Great Gatsby',
 'The Great Gatsby',
 'The Great American Novel',
 'The Great House of God',
 'Great Expectations',
 'Great Expectations',
 'Ten Great Works of Philosophy',
 'Great Short Works',
 'The Great Divorce',
 'The Great and Secret Show (Book of the Art #1)',
 'The Great And Secret Show (Book of the Art #1)',
 'Discovering Great Artists: Hands-On Art for Children in the Styles of the Great Masters',
 'Great Jones Street',
 'Welcome to the Great Mysterious']