# Tags

In [1]:
import pandas as pd
import numpy as np
import collections

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import scipy
import math
import random
import sklearn
import string
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/cinny/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
def rmse(y,h):
    """RMSE
    Args:
        y: real y
        h: predicted y
    Returns:
        RMSE
    """
    a = y-h

    return np.sqrt(sum(a**2)/len(a))

In [3]:
df1 = pd.read_csv('books/Ratings.csv')
df1.columns = ['uid', 'isbn', 'rating']
df1.head(3)

Unnamed: 0,uid,isbn,rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0


In [4]:
df2 = pd.read_csv('books/Books_tags.csv', index_col=0)
df2.head(3)

Unnamed: 0,isbn,title,author,year,publisher,tag_name
0,067976397X,Corelli's Mandolin : A Novel,LOUIS DE BERNIERES,1995,Vintage,to-read
1,067976397X,Corelli's Mandolin : A Novel,LOUIS DE BERNIERES,1995,Vintage,favorites
2,067976397X,Corelli's Mandolin : A Novel,LOUIS DE BERNIERES,1995,Vintage,currently-reading


In [5]:
df = df1.merge(df2, on='isbn')
df.head(3)

Unnamed: 0,uid,isbn,rating,title,author,year,publisher,tag_name
0,276925,067172939X,0,Dark Angel (Casteel),V.C. Andrews,1990,Pocket,to-read
1,276925,067172939X,0,Dark Angel (Casteel),V.C. Andrews,1990,Pocket,favorites
2,276925,067172939X,0,Dark Angel (Casteel),V.C. Andrews,1990,Pocket,currently-reading


In [6]:
train, test = train_test_split(df, test_size=0.20, random_state=42)

In [7]:
## DEFINING THE TAIL
tailcomp = df.groupby(by='isbn', as_index=False).agg({'rating':pd.Series.count}).sort_values(by = 'rating', ascending = False)
tot = sum(tailcomp['rating'])
tailcomp['popshare']= [x/tot for x in tailcomp['rating']]
tailcomp['popshare']= tailcomp['popshare'].cumsum()
tailcomp['category']= ['Head' if x<0.95 else "Tail" for x in tailcomp['popshare']]

tail = tailcomp.loc[tailcomp.popshare >= 0.95]
tail

Unnamed: 0,isbn,rating,popshare,category
113,044022750X,700,0.950005,Tail
366,1931514585,700,0.950649,Tail
265,156389405X,700,0.951294,Tail
370,1931514941,700,0.951938,Tail
162,067973452X,700,0.952583,Tail
...,...,...,...,...
323,1578562589,100,0.999632,Tail
221,1400041988,100,0.999724,Tail
320,1576751740,100,0.999816,Tail
222,1400045371,100,0.999908,Tail


In [8]:
def get_words(message):
    """Get the normalized list of words from a message string.

    This function should split a message into words, normalize them, and return
    the resulting list. For splitting, you should split on spaces. For normalization,
    you should convert everything to lowercase.

    Args:
        message: A string containing an SMS message

    Returns:
       The list of normalized words from the message.
    """


    words = message
    words = words.split(" ")
    words = [x.lower() for x in words]

    return words

In [9]:
def create_dictionary(messages):
    """Create a dictionary mapping words to integer indices.

    Args:
        messages: A list of strings containing SMS messages

    Returns:
        A python dict mapping words to integers.
    """

    word_counts = collections.defaultdict(int)

    for message in messages:
        for word in set(get_words(message)):
            word_counts[word] += 1

    resulting_dictionary = {}

    for word, count in word_counts.items():
        if count >= 10 and word not in stopwords.words('english') and len(word) > 1:
            next_index = len(resulting_dictionary)
            resulting_dictionary[word] = next_index

    return resulting_dictionary

In [10]:
def transform_text(messages, word_dictionary):
    """Transform a list of text messages into a numpy array for further processing.

    Args:
        messages: A list of strings where each string is an SMS message.
        word_dictionary: A python dict mapping words to integers.

    Returns:
        A numpy array marking the words present in each message.
        Where the component (i,j) is the number of occurrences of the
        j-th vocabulary word in the i-th message.
    """

    A = np.zeros((len(messages), len(word_dictionary)))

    for i, message in enumerate(messages):
        for word in get_words(message):
            if word in word_dictionary:
                A[i, word_dictionary[word]] += 1

    return A

In [11]:
df['tag_name'] = df['tag_name'].str.replace(r'[^\w\s]','')
df['tag_name'] = df['tag_name'].str.replace('-',' ')
df['words'] = df['tag_name'] +' '+df['author']
word_dict = create_dictionary(df['words'])
#word_dict

In [12]:
len(word_dict)

4038

In [13]:
A = transform_text(df['words'], word_dict)

In [14]:
np.size(A, 0)

1086100

In [15]:
np.sum(A, axis=1).min()

1.0

In [16]:
A1 = np.sum((A>0), axis= 0)

IDF = np.log(np.size(A, 0)/A1)
IDF
len(IDF)

4038

In [17]:
TF = A / (np.sum(A, axis=1, keepdims=True))
np.shape(TF)

(1086100, 4038)

In [18]:
TFiDF = TF*IDF
np.shape(TFiDF)

(1086100, 4038)

In [19]:
# ids = np.argsort(TFiDF.mean(axis=1))[:5]
# reverse_dictionary = {i: word for word, i in word_dict.items()}
# [reverse_dictionary[i] for i in ids]

In [20]:
type(TFiDF)

numpy.ndarray

In [21]:
TFiDF

array([[1.78056332, 1.78056332, 1.53478063, ..., 0.        , 0.        ,
        0.        ],
       [1.78056332, 1.78056332, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [1.78056332, 1.78056332, 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 4.64646684,
        4.64646684],
       [0.        , 0.        , 0.        , ..., 0.        , 4.64646684,
        4.64646684],
       [0.        , 0.        , 0.        , ..., 0.        , 4.64646684,
        4.64646684]])

In [None]:
TFiDF = TFiDF / np.sqrt((np.sum(TFiDF**2, axis = 1, keepdims=True)+0.01))

In [None]:
SimC = np.dot(TFiDF, TFiDF.T)
SimC

In [None]:
np.sum(SimC, axis = 1).max()

In [None]:
np.fill_diagonal(SimC, 1)

In [None]:
BookSim =pd.DataFrame(SimC, columns=df['title'], index=df['title'])
BookSim.head(3)

In [None]:
#np.argsort(SimC[53, :])[-5:]
df.title[np.argsort(SimC[15, :])[-6:]]
#Sim20 = pd.DataFrame(Sim20, columns=finalbooks.title[101:200], index=finalbooks.title[50:60])

In [None]:
SimC[15, [15, 2252, 6977, 4642, 2796, 1700 ]]

In [None]:
df[df['isbn'].isin(np.argsort(SimC[15, :])[-6:]+1)]

In [None]:
# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(27.5, 22.5))

# Generate a custom diverging colormap followed by the correlation heatmap
cmap =sns.diverging_palette(20, 220, n=20000)

sns.heatmap(BookSim, cmap=cmap,center = 0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})

In [None]:
train

In [None]:
allpreds = []
train = train.sort_values(by=['isbn'])
for i in range(15000):
  bi = train['isbn'][train['uid'] == i+1]-1
  Simi = SimC[:, bi]
  ri = np.array(train[train['uid'] == i+1].sort_values(by=['isbn']).rating)
  predi = finalbooks.filter(['isbn'])
  predi['pred'] = np.sum(Simi*ri, axis=1)/(np.sum(Simi, axis=1)+0.01)
  predi['uid'] = i+1
  allpreds.append(predi)
  if (i+1)%1000 == 0: print("done: ", i+1)

In [None]:
predictions = np.concatenate(allpreds, axis=0 )

In [None]:
final = pd.DataFrame(predictions, columns=['isbn', 'pred', 'uid'])
final.head(3)

In [None]:
train['conc']=train['uid'].map(str)+train['isbn'].map(str)
final['conc']=final['uid'].map(str)+final['isbn'].map(str)
finalfin = final[~final.conc.isin(train.conc)]
finalfin.describe()

In [None]:
finalrank = test.merge(final, on=['isbn', 'uid'])
finalrank = finalrank.sort_values(by=['uid', 'pred'], ascending=False)
finalrank.head(5)

In [None]:
finallist = []
for i in range(15000):
    a = finalrank.loc[finalrank['uid'] == i+1]['rating'].tolist()
    finallist.append(a)
    if (i+1)%1000 == 0: print("done: ", i+1)

In [None]:
b = np.array([rmse(r, len(r)) for r in finallist])

facet, axes = plt.subplots(1, 1, figsize=(10, 3))
n, bins, patches = plt.hist(b, 200, facecolor='blue', alpha=0.5) #, log = True)   
plt.title('Distribution of NDGC among Users for the TFiDF model')
plt.show()

In [None]:
d = b[b == 1]
sum(d)/15000

In [None]:
top10 = finalfin.sort_values('pred',ascending = False).groupby('uid').head(10)
top50 = finalfin.sort_values('pred',ascending = False).groupby('uid').head(50)

print('RMSE: ', np.round(rmse(finalrank['pred'],finalrank['rating']), decimals=3))