### Implementing TF-IDF
1. Documentation: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html


In [None]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
import re
from sklearn.feature_extraction.text import TfidfVectorizer


In [None]:
# Load the raw reviews into a DataFrame. Its 'reviewText' column is read again
# further down when the final result table is assembled.
df_read = pd.read_csv("Books_5_partition_1.csv")

In [None]:
# Cache for the NLTK stop-word list so the corpus file is read once instead of
# once per review.
_STOPWORDS_CACHE = {}


def _english_stopwords():
    """Return the English NLTK stop words as a frozenset, loading them once."""
    if "english" not in _STOPWORDS_CACHE:
        _STOPWORDS_CACHE["english"] = frozenset(stopwords.words("english"))
    return _STOPWORDS_CACHE["english"]


def review_to_words(raw_review):
    """Clean one raw review: strip HTML, keep letters only, drop stop words.

    Example:
        clean_reviews = [review_to_words(text) for text in train['review']]

    Args:
        raw_review (str): Raw review text, possibly containing HTML markup.
            Anything that is not ``str``/``bytes`` (for example the ``NaN``
            pandas produces for an empty CSV cell) is treated as an empty
            review.

    Returns:
        str: Lower-cased words separated by single spaces, with HTML tags,
        non-letter characters and English stop words removed. The string is
        empty when nothing is left.
    """
    # Missing values would otherwise fail inside BeautifulSoup; returning ""
    # keeps the output aligned row-for-row with the input.
    if not isinstance(raw_review, (str, bytes)):
        return ""

    # 1. Strip HTML tags. Naming the parser avoids bs4's "no parser was
    #    explicitly specified" warning and keeps the result independent of
    #    which optional parsers happen to be installed.
    review_text = BeautifulSoup(raw_review, "html.parser").get_text()

    # 2. Replace punctuation, digits and every other non-letter with a space.
    letters_only = re.sub(r"[^a-zA-Z]", " ", review_text)

    # 3. Lower-case and split on whitespace.
    words_lst = letters_only.lower().split()

    # 4. Drop stop words; the cached frozenset gives O(1) membership tests.
    stops = _english_stopwords()
    meaningful_words = [w for w in words_lst if w not in stops]

    # 5. Join the remaining words back into one space-separated string.
    return " ".join(meaningful_words)

In [None]:
def clean_data(filepath):
    """Read a CSV file and clean every entry of its ``reviewText`` column.

    Args:
        filepath (str): Path of the CSV file; it must contain a
            ``reviewText`` column.

    Returns:
        list: One cleaned string per row, in file order, as produced by
        ``review_to_words``.
    """
    df_orig = pd.read_csv(filepath)

    # Iterate over the column values directly instead of looking each one up
    # by label through range(len(...)).
    return [review_to_words(review) for review in df_orig['reviewText']]
# Preview the cleaned reviews; the returned list is not stored.
clean_data("Books_5_partition_1.csv")



In [None]:
# Count how many different words there are in the corpus
def words_in_corpus(clean_text):
    """Count the distinct words across all cleaned reviews.

    Args:
        clean_text (list): Cleaned reviews, each a string of
            whitespace-separated words.

    Returns:
        str: The message ``'number of words in the corpus <count>'``.
    """
    words_set = set()
    for review in clean_text:
        # split() without an argument yields no tokens for an empty review,
        # whereas split(' ') returns [''] and would count '' as a word.
        # update() grows the set in place instead of copying it per review.
        words_set.update(review.split())

    return f'number of words in the corpus {len(words_set)}'

# Report the vocabulary size of the cleaned corpus. Note that this call reads
# and cleans the CSV again through clean_data().
words_in_corpus(clean_data("Books_5_partition_1.csv"))

In [None]:
# NOTE: get_params() is a method of the estimator (vectorizer.get_params()),
# not a constructor argument; passing get_params=True raises a TypeError.
vectorizer = TfidfVectorizer(
    # Keep only the 1000 terms with the highest term frequency across the
    # corpus; useful for performance on large datasets.
    max_features=1000,
    # max_df=0.8,  # would ignore terms that appear in more than 80% of the documents
    # Ignore terms that appear in fewer than 5 documents.
    min_df=5,
    # Extract unigrams, bigrams and trigrams to capture some word context.
    ngram_range=(1, 3),
)

In [None]:
# Learn the vocabulary and IDF weights from the cleaned reviews (via the
# clean_data function above) and get back the TF-IDF document-term matrix
# (a sparse matrix).
vectors = vectorizer.fit_transform(clean_data("Books_5_partition_1.csv"))
# print(vectors)

# Terms behind the matrix columns; how many there are is capped by the
# max_features argument given to TfidfVectorizer.
feature_names = vectorizer.get_feature_names_out()
# print(feature_names)

# Convert the sparse matrix to a dense NumPy array (rows x feature_names).
dense = vectors.toarray()
# print(dense)

# The same values as nested Python lists.
denselist = dense.tolist()

# print("vocabulary:", vectorizer.vocabulary_)  # dictionary mapping each term to its column index

### Get TF-IDF scores sorted
##### extract the words with highest tf-idf score

In [None]:
# Rank the vocabulary by summed TF-IDF score
def top_words(top_rows):
    """Return the terms with the highest total TF-IDF score.

    Reads the module-level ``dense`` matrix and ``feature_names``.

    Args:
        top_rows (int): Number of highest-scoring rows to keep.

    Returns:
        DataFrame: Columns ``tf-idf-score`` and ``word``, sorted by score in
        descending order and cut to the first ``top_rows`` rows.
    """
    # Summing over axis 0 adds up all rows, giving one total per term.
    totals = np.sum(dense, axis=0)

    # Pair every total with its term, score first to match the column order.
    scored_terms = list(zip(totals, feature_names))

    ranking = pd.DataFrame(scored_terms, columns=["tf-idf-score", "word"])
    ranking = ranking.sort_values(by=['tf-idf-score'], ascending=False)

    return ranking.iloc[:top_rows]

# Show the ten terms with the highest summed TF-IDF score.
top_words(top_rows=10)

The code below is slow and needs to be improved,
but for now it works as it is.

In [None]:
# Go through every TF-IDF row (one per review) and collect:
#   * all_keywords: the vocabulary terms with a score above 0, i.e. the terms
#     from our chosen vocabulary that occur in that review
#   * hottest_word: the term(s) sharing the highest score in that review
# A review that contains no vocabulary term has an all-zero row. Its maximum
# is 0, which every feature matches, so without the `> 0` guard the whole
# vocabulary would be reported as its hottest word. Such reviews get [].
vocab = np.asarray(feature_names)  # array form allows boolean-mask selection

all_keywords = []
hottest_word = []  # stores the word(s) with the highest tf-idf score per document
for row in dense:
    # Boolean masks replace the manual index counter and per-value Python loop.
    all_keywords.append(vocab[row > 0].tolist())

    top_score = row.max()
    if top_score > 0:
        hottest_word.append(vocab[row == top_score].tolist())
    else:
        hottest_word.append([])

# print(hottest_word)

Example with NumPy, not quite finished

In [None]:
# Positions of every non-zero TF-IDF entry; np.nonzero returns one index array
# per dimension, so element [1] holds the column (feature) positions.
nonzero_indices = np.nonzero(dense)
# print(nonzero_indices[0])

# NOTE(review): this is a single flat list covering all reviews, not one
# keyword list per review.
np_all_keywords = [feature_names[column] for column in nonzero_indices[1]]
print(np_all_keywords)

# Column of the highest score in each row.
# NOTE(review): argmax returns the first maximum, so an all-zero row yields
# index 0 and therefore the first vocabulary term - confirm this is intended.
hottest_word_indices = dense.argmax(axis=1)
print(hottest_word_indices)

# Translate the winning column positions into words.
np_hottest_word = [feature_names[position] for position in hottest_word_indices]
print(len(np_hottest_word))

In [None]:
# Sanity check: print the three lengths. They should match, because these
# sequences are zipped together row by row when the result table is built.
print(len(all_keywords))
print(len(df_read['reviewText']))
print(len(hottest_word))

### Result:
We get a DataFrame with three columns: the review text, the TF-IDF keywords, and the hottest word(s).

In [None]:
# One (raw review, keyword list, hottest word list) tuple per review.
# zip() stops at the shortest input, so a length mismatch truncates silently.
list_rev_tf_idf = list(zip(df_read['reviewText'], all_keywords, hottest_word))

df_tf_idf = pd.DataFrame(list_rev_tf_idf, columns=['reviewText', 'tf-idf-summary', 'hottest_word'])

# NOTE: a commented-out debug print used to sit here. It had wrapped onto a
# second line without a leading '#', which made this cell a syntax error, so
# it was removed. It searched for rows whose 'hottest_word' held the entire
# vocabulary.
df_tf_idf.dtypes

# df_tf_idf_filtered.to_csv('controllingdata.csv')

# print(df_tf_idf['reviewText'][0])
# print()
# print(df_tf_idf['tf-idf-summary'][0])
# print()
# print(df_tf_idf['hottest_word'][0])

## KMeans Clustering (stretch)
##### maybe it can be useful

In [None]:
from sklearn.cluster import KMeans

# Number of clusters to fit.
true_k = 20

# NOTE(review): no random_state is passed, so the clustering may differ
# between runs - confirm whether reproducibility matters here.
model = KMeans(n_clusters=true_k, init="k-means++", max_iter=100)
model.fit(vectors)

# For each centroid: feature indices ordered from highest to lowest weight
# (ascending argsort along the feature axis, then reversed).
order_centroids = np.argsort(model.cluster_centers_, axis=1)[:, ::-1]
print(order_centroids)

In [None]:
# Write the ten highest-weighted terms of every cluster to cluster.txt:
# a "Cluster <n>" header, one indented term per line, then two blank lines.
with open("cluster.txt", "w", encoding="utf-8") as out_file:
    for cluster_id in range(true_k):
        out_file.write(f"Cluster {cluster_id}")
        out_file.write("\n")

        top_features = order_centroids[cluster_id, :10]
        out_file.writelines(
            ' %s' % feature_names[feature_idx] + "\n"
            for feature_idx in top_features
        )

        out_file.write("\n" * 2)