## Run all - Check which data to use (practice,full, lemmatized) - Hyperparameters

In [585]:
reset -fs

In [586]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import pandas as pd
import numpy as np
import pickle

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

#sklearn
import sklearn
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

#visualizations
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

import spacy
import string

#### Function for displaying top words in topic

In [587]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    for idx, topic in enumerate(model.components_):
        if not topic_names or not topic_names[idx]:
            print("\nTopic ", idx)
        else:
            print("\nTopic: '",topic_names[idx],"'")
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

#### Function for lemmatization

In [588]:
def lemmatization(texts, allowed_postags=["NOUN", "ADJ", "VERB", "ADV"]):
    nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
    texts_out = []
    for text in texts:
        doc = nlp(text)
        new_text = []
        for token in doc:
            if token.pos_ in allowed_postags:
                new_text.append(token.lemma_)
        final = " ".join(new_text)
        texts_out.append(final)
    
    return texts_out  

### Importing dataset as books_df

In [589]:
books_df = pd.read_csv('./books_with_blurbs.csv')

In [590]:
books_df.head()

Unnamed: 0,ISBN,Title,Author,Year,Publisher,Blurb
0,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,"Here, for the first time in paperback, is an o..."
1,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,"The fascinating, true story of the world's dea..."
2,399135782,The Kitchen God's Wife,Amy Tan,1991,Putnam Pub Group,Winnie and Helen have kept each others worst s...
3,425176428,What If?: The World's Foremost Military Histor...,Robert Cowley,2000,Berkley Publishing Group,Historians and inquisitive laymen alike love t...
4,1881320189,Goodbye to the Buttermilk Sky,Julia Oliver,1994,River City Pub,This highly praised first novel by fiction wri...


In [591]:
corpus = [x for x in books_df.Blurb]

In [592]:
#smaller corpus for practicing model building
practice_corpus = corpus[:400]

In [593]:
corpus_lower = []
lower_split = []
for text in corpus:
    corpus_lower.append(text.lower())
    lower_split.append(text.lower().split(" "))

In [594]:
# lemmatized_texts = lemmatization(corpus_lower)

In [595]:
practice_lower = corpus_lower[:400]

In [596]:
practice_lemmatized_text = lemmatization(practice_lower)

### Appending to stopword list

In [597]:
stop_words = nltk.corpus.stopwords.words('english')
stop_words.extend(['orson', 'scott', 'card','short','stories'])

## Change max_df, min_df and corpus to run with all documents  min_df=65, max_df=.05)

In [598]:
count_vectorizer = CountVectorizer(stop_words=stop_words, min_df=65, max_df=.05)
doc_word = count_vectorizer.fit_transform(corpus)

In [599]:
tfidf_vectorizer = TfidfVectorizer(**count_vectorizer.get_params())    
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)



In [600]:
lda_tf = LatentDirichletAllocation(n_components=12, random_state=0)
lda_tf.fit(tfidf_matrix)

LatentDirichletAllocation(n_components=12, random_state=0)

In [601]:
pandas_cristo = pd.DataFrame(tfidf_matrix[6752])

In [602]:
type(lda_tf)

sklearn.decomposition._lda.LatentDirichletAllocation

In [603]:
pandas_cristo.head()

Unnamed: 0,0
0,"(0, 7439)\t0.14004132199774771\n (0, 7344)\..."


In [604]:
# lda_tf.save('models/lda_tf.model')

## practice on lemmatized text

In [605]:
# prac_lemmatized_tfidf = tfidf.fit_transform(practice_lemmatized_text)

In [606]:
# display_topics(lda_tf, tfidf.get_feature_names(), 15)   Did not include as many relevant topics
# didn't have cooking, French, German, Self help.....

### Topics and lists displayed

In [624]:
display_topics(lda_tf, tfidf_vectorizer.get_feature_names(), no_top_words=15, topic_names=['Cooking','German','Nazi/Horror','Buzzwords', 'Classic Literature/Fantasy', \
                                                                   'Mystery/Thriller', 'Self Improvement', 'Science/SciFi','How-To', \
                                                                   'Contemporary Fiction', 'French', 'Historical'])


Topic: ' Cooking '
recipes, collection, guide, food, includes, color, books, edition, cooking, photos, information, full, dishes, film, including

Topic: ' German '
und, der, die, den, ein, zu, sie, ist, von, das, mit, sich, er, auf, eine

Topic: ' Nazi/Horror '
vampire, nazi, jessie, german, jewish, holocaust, dracula, poland, hitler, vampires, camp, jews, concentration, killer, anita

Topic: ' Buzzwords '
purchase, released, available, 24, copies, jo, pages, 25, paperback, 2003, million, hardcover, 2002, 2001, 000

Topic: ' Classic Literature/Fantasy '
collection, fiction, published, classic, written, literary, edition, writers, tales, literature, award, volume, characters, poems, writer

Topic: ' Mystery/Thriller '
killer, dead, wife, case, mystery, husband, daughter, crime, dark, town, passion, soon, detective, truth, secrets

Topic: ' Self Improvement '
guide, offers, spiritual, self, practical, shows, god, use, personal, health, questions, learn, provides, today, information

To

### LDA Fit Transform for Document-Topic Matrix

In [608]:
transfor_lda_tf = LatentDirichletAllocation(n_components=12, random_state=0)
transformed_lda_matrix = transfor_lda_tf.fit_transform(tfidf_matrix)

In [609]:
transformed_lda_matrix.shape

(57510, 12)

In [610]:
lda_doc_topic_df = pd.DataFrame(transformed_lda_matrix, columns = ['Cooking','German','Nazi/Horror','Buzzwords', 'Classic Literature/Fantasy', \
                                                                   'Mystery/Thriller', 'Self Improvement', 'Science/SciFi','How-To', \
                                                                   'Contemporary Fiction', 'French', 'Historical'])

In [611]:
lda_doc_topic_df.head()

Unnamed: 0,Cooking,German,Nazi/Horror,Buzzwords,Classic Literature/Fantasy,Mystery/Thriller,Self Improvement,Science/SciFi,How-To,Contemporary Fiction,French,Historical
0,0.355381,0.019142,0.019142,0.019143,0.019143,0.019142,0.019143,0.453192,0.019143,0.019142,0.019142,0.019145
1,0.011183,0.011182,0.011182,0.011182,0.011182,0.011182,0.011184,0.876997,0.011182,0.011182,0.011182,0.011182
2,0.013579,0.013579,0.013579,0.013579,0.01358,0.85063,0.013579,0.013579,0.013579,0.013579,0.013579,0.01358
3,0.015376,0.015376,0.015376,0.015376,0.015377,0.015377,0.830862,0.015376,0.015376,0.015377,0.015376,0.015376
4,0.007992,0.007991,0.007992,0.007993,0.563376,0.007992,0.007992,0.007992,0.007992,0.356706,0.007991,0.007992


In [612]:
lda_doc_topic_df.to_csv('./lda_doc_topic.csv')

In [613]:
topic_document_df = lda_doc_topic_df.transpose()

In [614]:
topic_document_df.tail(12)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,57500,57501,57502,57503,57504,57505,57506,57507,57508,57509
Cooking,0.355381,0.011183,0.013579,0.015376,0.007992,0.010637,0.007387,0.0134,0.879637,0.016843,...,0.009308,0.009967,0.011783,0.01029,0.011747,0.013895,0.010491,0.009924,0.009845,0.018774
German,0.019142,0.011182,0.013579,0.015376,0.007991,0.010636,0.007386,0.013399,0.01094,0.016841,...,0.009308,0.009967,0.011782,0.01029,0.011746,0.013895,0.010491,0.009924,0.009845,0.018773
Nazi/Horror,0.019142,0.011182,0.013579,0.015376,0.007992,0.010636,0.007386,0.013399,0.01094,0.016842,...,0.009308,0.009967,0.011782,0.01029,0.011746,0.013895,0.010491,0.009925,0.009845,0.018773
Buzzwords,0.019143,0.011182,0.013579,0.015376,0.007993,0.010636,0.007386,0.013401,0.01094,0.016842,...,0.009308,0.009967,0.011782,0.01029,0.011746,0.013895,0.010491,0.009924,0.009845,0.018773
Classic Literature/Fantasy,0.019143,0.011182,0.01358,0.015377,0.563376,0.010637,0.394653,0.692872,0.010946,0.016843,...,0.009308,0.009967,0.011783,0.010293,0.011747,0.013895,0.010491,0.009925,0.009845,0.018774
Mystery/Thriller,0.019142,0.011182,0.85063,0.015377,0.007992,0.882997,0.531483,0.0134,0.010944,0.016842,...,0.839387,0.009967,0.011782,0.628665,0.011747,0.847152,0.884599,0.890829,0.009845,0.793483
Self Improvement,0.019143,0.011184,0.013579,0.830862,0.007992,0.010637,0.007387,0.173134,0.010942,0.016844,...,0.009308,0.459879,0.011783,0.010291,0.011747,0.013895,0.010491,0.009925,0.009845,0.018774
Science/SciFi,0.453192,0.876997,0.013579,0.015376,0.007992,0.010637,0.007386,0.013399,0.010941,0.016842,...,0.009309,0.440451,0.182254,0.010291,0.393426,0.013899,0.010492,0.009925,0.891707,0.018774
How-To,0.019143,0.011182,0.013579,0.015376,0.007992,0.010637,0.007386,0.013399,0.010942,0.016844,...,0.009308,0.009967,0.011783,0.01029,0.011746,0.013895,0.010491,0.009924,0.009845,0.018776
Contemporary Fiction,0.019142,0.011182,0.013579,0.015377,0.356706,0.010637,0.007387,0.0134,0.010945,0.814734,...,0.009308,0.009967,0.699923,0.010292,0.489109,0.013895,0.010491,0.009925,0.009845,0.018774


### Assigning topics to books

In [615]:
book_title_topic_df = pd.DataFrame(books_df.Title)

book_title_topic_df.head()

Unnamed: 0,Title
0,Decision in Normandy
1,Flu: The Story of the Great Influenza Pandemic...
2,The Kitchen God's Wife
3,What If?: The World's Foremost Military Histor...
4,Goodbye to the Buttermilk Sky


In [616]:
index_topic_dict = {}

for i in range(len(lda_doc_topic_df)):
    index_topic_dict[i] = lda_doc_topic_df.iloc[i].idxmax()


In [617]:
index_topic_series = pd.Series(index_topic_dict,index=index_topic_dict.keys())

In [618]:
book_title_topic_df['topic'] = index_topic_series
book_title_topic_df.head()

Unnamed: 0,Title,topic
0,Decision in Normandy,Science/SciFi
1,Flu: The Story of the Great Influenza Pandemic...,Science/SciFi
2,The Kitchen God's Wife,Mystery/Thriller
3,What If?: The World's Foremost Military Histor...,Self Improvement
4,Goodbye to the Buttermilk Sky,Classic Literature/Fantasy


In [619]:
book_title_topic_df.value_counts('topic')

topic
Mystery/Thriller              16878
Contemporary Fiction          11505
Classic Literature/Fantasy     8202
Self Improvement               6888
Science/SciFi                  5026
Cooking                        2333
Historical                     1966
French                         1790
How-To                         1433
German                          984
Buzzwords                       271
Nazi/Horror                     234
dtype: int64

### pyLDAvis visualization

In [620]:
pylda_model = pyLDAvis.sklearn.prepare(lda_tf, tfidf_matrix, tfidf_vectorizer)
pyLDAvis.save_html(pylda_model, 'lda.html')

  default_term_info = default_term_info.sort_values(


In [621]:
pyLDAvis.sklearn.prepare(lda_tf, tfidf_matrix, tfidf_vectorizer)

  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
