### Implementing TF-IDF
1. Documentation: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html


In [3]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
import re
from sklearn.feature_extraction.text import TfidfVectorizer


In [4]:
_ENGLISH_STOPWORDS = None  # lazily-built cache, filled by _english_stopwords()


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, building it only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))
    return _ENGLISH_STOPWORDS


def review_to_words(raw_review):
    """Strip HTML, keep only ASCII letters, lowercase, and drop English stop words.

    Example:
        clean_train_reviews = [review_to_words(text) for text in train['review']]

    Args:
        raw_review (str): raw text of a single review (one cell of the column,
            not the whole column). None and NaN (a missing cell in a pandas
            column) are treated as an empty review.

    Returns:
        str: the remaining lowercase words joined by single spaces; "" when
        nothing is left or the input was missing.
    """
    # Missing values: BeautifulSoup expects markup text, so short-circuit on
    # None / NaN instead of handing it a float. NaN is the only float that is
    # not equal to itself.
    is_nan = isinstance(raw_review, float) and raw_review != raw_review
    if raw_review is None or is_nan:
        return ""

    # 1. Remove any HTML tags. The parser is named explicitly so the result
    #    does not depend on which optional parsers happen to be installed.
    review_text = BeautifulSoup(raw_review, "html.parser").get_text()

    # 2. Replace everything that is not an ASCII letter (punctuation, digits) with a space
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)

    # 3. Lowercase and split on whitespace
    words_lst = letters_only.lower().split()

    # 4. Stop words as a set for fast membership tests; cached across calls
    #    because this function runs once per review.
    stops = _english_stopwords()

    # 5. Keep only the words that are not stop words
    meaningful_words = [w for w in words_lst if w not in stops]

    # 6. Join the remaining words back into a single string
    return " ".join(meaningful_words)

In [5]:
def clean_data(filepath, column="reviewText"):
    """Read a CSV file into a dataframe and clean one text column.

    Args:
        filepath (str): path of the CSV file.
        column (str): name of the column that holds the raw review text.
            Defaults to "reviewText".

    Returns:
        list[str]: one cleaned string per row (the result of
        review_to_words), in the order the rows appear in the file.
    """
    df_orig = pd.read_csv(filepath)

    # Iterate over the column's values directly instead of looking every row
    # up by its index label.
    return [review_to_words(text) for text in df_orig[column]]
# Preview only the first few cleaned reviews; displaying the whole list floods the cell output.
clean_data("Books_5_partition_1.csv")[:5]





['king mice cheese nancy gurney excellent children book one well remember childhood purchased daughter loves king trouble rude mice eating cheese consults wise men suggest cats chase away mice cats become nuisance wise men recommend king bring dogs chase cats away cycle goes mice finally brought back chase away elephants brought chase away lions chased away dogs story ends compromise friendship mice king story also teaches cause effect relationships pictures accompany story humorous memorable thrilled discover back print highly recommend children ages',
 'years later cheese government cheese mice objected king idea good manners species centric rebelled king blamed peasants forbade keep cats chase mice homes made things worse peasants could afford moved far away mice possible wait next chapter',
 'looking louis untermeyer book one',
 'lovely husband looking remembered',
 'rated five first book used introduce year old son poetry custard magic dragon read still remembers day charming mix 

In [6]:
# Count how many different words there are in the corpus
def words_in_corpus(clean_text):
    """Count how many different words occur in the corpus.

    Args:
        clean_text (list[str]): cleaned reviews, words separated by whitespace.

    Returns:
        str: the message 'number of words in the corpus N', where N is the
        number of distinct words.
    """
    words_set = set()
    for review in clean_text:
        # split() without an argument returns [] for an empty review and
        # ignores repeated whitespace, so '' is never counted as a word
        # (split(' ') would return [''] for an empty string).
        # update() grows the set in place instead of rebuilding it each time.
        words_set.update(review.split())

    return f'number of words in the corpus {len(words_set)}'

# NOTE(review): this calls clean_data again, so the CSV is re-read and every review re-cleaned here.
words_in_corpus(clean_data("Books_5_partition_1.csv"))



'number of words in the corpus 13389'

In [7]:
# TF-IDF configuration (see the scikit-learn TfidfVectorizer documentation).
vectorizer = TfidfVectorizer(
    # Keep only the 1000 terms with the highest term frequency across the
    # corpus; capping the vocabulary helps performance on large datasets.
    max_features=1000,
    # max_df=0.8 (currently disabled) would ignore terms that appear in more
    # than 80% of the reviews.
    # Ignore terms that appear in fewer than 5 reviews (document frequency,
    # not the raw number of occurrences).
    min_df=5,
    # Extract unigrams, bigrams and trigrams, i.e. sequences of 1 to 3
    # consecutive words, to capture some context around each word.
    ngram_range=(1, 3),
)

In [8]:
# Clean the reviews with clean_data (defined above), then fit TF-IDF on them.
clean_reviews = clean_data("Books_5_partition_1.csv")
vectors = vectorizer.fit_transform(clean_reviews)

# Terms the vectorizer kept as features; max_features caps how many there are.
feature_names = vectorizer.get_feature_names_out()

# toarray() converts the sparse TF-IDF matrix into a dense array of shape
# (number of reviews, number of features).
dense = vectors.toarray()

# Nested Python lists: one list of TF-IDF scores per review.
denselist = dense.tolist()

# print("vocabulary:", vectorizer.vocabulary_) # prints a dictionary mapping each term to its column index in the TF-IDF matrix



In [9]:
# For every TF-IDF row (one per review), keep the feature names whose score
# is above 0, i.e. the terms of our chosen vocabulary that actually occur in
# that review. The result is one keyword list per review.
all_keywords = [
    [feature_names[col] for col, score in enumerate(scores) if score > 0]
    for scores in denselist
]


In [11]:

# Example: the keywords extracted for the second review, a blank line,
# then the cleaned text of that same review for comparison.
print(all_keywords[1], end="\n\n")
example_2 = clean_data("Books_5_partition_1.csv")
print(example_2[1])

['away', 'chapter', 'could', 'far', 'good', 'government', 'idea', 'keep', 'king', 'later', 'made', 'next', 'possible', 'things', 'wait', 'worse', 'years']





years later cheese government cheese mice objected king idea good manners species centric rebelled king blamed peasants forbade keep cats chase mice homes made things worse peasants could afford moved far away mice possible wait next chapter


In [None]:
# NOTE(review): KMeans is not used in the cells visible here; presumably needed by a later clustering step.
from sklearn.cluster import KMeans

# vocabulary_ is the fitted vectorizer's term -> feature-index mapping (per the scikit-learn docs).
vocab = vectorizer.vocabulary_
# Number of terms in the fitted vocabulary.
print(len(vocab))