In [None]:
###KEYWORD EXTRACTION###

#Each book must have a non-empty description of at least 50 characters.

#SPACY PIPELINE FUNCTIONS
#Keyword extraction with TF-IDF

import csv
import re

import pandas as pd
import spacy
from pyspark.sql.functions import regexp_replace
from sklearn.feature_extraction.text import TfidfVectorizer

def scores_ordering(coo_matrix):
    """Rank the stored entries of a COO-style matrix by score.

    Args:
        coo_matrix: object exposing two parallel sequences, ``col`` (column,
            i.e. feature, index) and ``data`` (score of that entry).

    Returns:
        A list of ``(column_index, score)`` tuples sorted by descending score;
        equal scores are ordered by descending column index.
    """
    ranked = list(zip(coo_matrix.col, coo_matrix.data))
    # Sort on (score, column) so that ties on the score are resolved
    # deterministically by the column index.
    ranked.sort(key=lambda entry: (entry[1], entry[0]), reverse=True)
    return ranked

def top_extraction(feature_names, sorted_items, topn=5):
    """Map the first ``topn`` ranked entries to their feature names.

    Args:
        feature_names: sequence indexed by column index, giving the term
            associated with each column.
        sorted_items: sequence of ``(column_index, score)`` pairs, already
            ordered from best to worst.
        topn: maximum number of entries to keep (default 5).

    Returns:
        dict mapping each selected term to its score rounded to 3 decimals,
        in the same order as ``sorted_items``.
    """
    return {
        feature_names[column]: round(score, 3)
        for column, score in sorted_items[:topn]
    }

def get_keywords(vectorizer, feature_names, doc, k):
    """Return the top ``k`` keywords of a single description.

    Args:
        vectorizer: fitted vectorizer whose ``transform`` returns a sparse
            matrix supporting ``tocoo()``.
        feature_names: sequence mapping column index -> term.
        doc: the description text to score.
        k: maximum number of keywords to return.

    Returns:
        list of terms, best score first (at most ``k`` items).
    """
    # Score only this document, then rank its non-zero entries.
    doc_scores = vectorizer.transform([doc]).tocoo()
    ranked_entries = scores_ordering(doc_scores)
    # Iterating the dict yields its keys (the terms) in ranking order.
    return list(top_extraction(feature_names, ranked_entries, k))

def keywordCoverage(dictionary, dataframe, max_keywords=20):
    """Interactively pick keywords that greedily cover the dataset.

    At every step the candidate keyword contained in the largest number of
    still-uncovered ``top_keywords`` entries is proposed to the user on stdin.
    Answering ``y`` accepts it (the entries containing it are removed from the
    remaining set); ``n`` discards it; anything else re-prompts.

    Args:
        dictionary: dict whose keys are the candidate keywords. It is copied,
            the caller's object is never modified.
        dataframe: object whose ``['top_keywords']`` column supports
            ``to_list()``; each element is tested with ``keyword in element``.
        max_keywords: stop as soon as this many keywords have been accepted
            (default 20, the limit that used to be hard-coded).

    Returns:
        list of accepted keywords, in acceptance order.
    """
    output_list = []
    candidates = dictionary.copy()
    remaining = dataframe['top_keywords'].to_list()

    while remaining and candidates:
        # Greedy step: find the candidate covering the most remaining entries.
        chosen_keyword = ""
        max_coverage_count = 0
        for keyword in candidates:
            coverage_count = sum(1 for item in remaining if keyword in item)
            if coverage_count > max_coverage_count:
                max_coverage_count = coverage_count
                chosen_keyword = keyword

        if max_coverage_count == 0:
            # No candidate matches anything that is left. Previously the code
            # went on to pop the placeholder "" and died with a KeyError.
            break

        choice = input(f"La parola scelta è {chosen_keyword}. Ha una coverage di {max_coverage_count}. La lunghezza rimanente del dataset è {len(remaining)}. (y/n)")

        if choice == "y":
            candidates.pop(chosen_keyword)
            output_list.append(chosen_keyword)
            remaining = [item for item in remaining if chosen_keyword not in item]
            print("KEYWORD SCELTA")
            if not remaining:
                break
            if len(output_list) >= max_keywords:
                print(f"Limite di {max_keywords} keywords raggiunto. Ritorno la lista di keywords del genere.")
                return output_list
        elif choice == "n":
            candidates.pop(chosen_keyword)
            print("KEYWORD SCARTATA")
        else:
            # Invalid answer: nothing changes, the same keyword is proposed again.
            print("Scegliere una scelta possibile (y/n).")

    return output_list

def countOccurrencies(dictionary, reference_list):
    """Add the occurrences found in ``reference_list`` to ``dictionary``.

    Args:
        dictionary: dict mapping keyword -> count; updated in place.
        reference_list: iterable of keywords to count.

    Returns:
        The same ``dictionary`` object, after the update.
    """
    for term in reference_list:
        # Unseen keywords start from 0, so their first occurrence stores 1.
        dictionary[term] = dictionary.get(term, 0) + 1
    return dictionary

# Fragments removed with a plain, case-sensitive ``str.replace``: residues of
# HTML entities (e.g. "agrave" left over from "&agrave;"), HTML tags and a few
# punctuation marks. The order matters ("&quot" must go before the bare "&").
# NOTE(review): "ograve" was listed twice in the original code; the second one
# may have been meant to be "ugrave" - confirm before adding it.
_LITERAL_NOISE = (
    "agrave", "egrave", "igrave", "ograve",
    "<b>", ",", "'", '"',
    "<br />", "</b>", "<br>", "</br>",
    "<p>", "</p>", "<P>", "</P>",
    "<i>", "</i>", "&quot",
    "<strong>", "</strong>", "&",
)

# Frequent but uninformative Italian words, removed case-insensitively and IN
# THIS ORDER (each substitution sees the output of the previous one). The
# leading/trailing spaces are part of the match: an entry without spaces is
# removed even when it appears inside a longer word.
_NOISE_WORDS = (
    "leggere", "lettrice", "lettore", "lettori", "lettura",
    "libro", "libri",
    " mese ", " mesi ",
    "tempo ", "tempi ",
    "parte ", "parti ",
    " cosa ", " cose ",
    " personaggio ", " personaggi ",
    " volume ", " volumi",
    " protagonista", " protagonisti",
    " testo ", " testi ",
    " pagina ", " pagine ",
    " secolo ", " secoli ",
    " persona ", " persone ",
    "scrittore ", " scrittrice ", " scrittrici ", " scrittori ",
    " fine ",
    "nome ", "nomi ",
    "inizio ",
    " parola ", " parole ",
    "modo ",
    " numero ",
    " punto ",
    " centro ",
    " frattempo ",
    " mezzo ", " mezzi ",
    " corso ",
    " situazione ",
    " piano ",
    " via ", " vie ",
    " forma ", " forme ",
    " posto ", " posti ",
    " fronte ",
    # Applied twice on purpose, as in the original code: the second pass
    # removes the second of two back-to-back occurrences, which a single
    # non-overlapping pass leaves behind.
    " luogo ", " luogo ",
    "autore ", " autori ", " autrici ", "autrice ",
    " grazie ",
    " fatto ", " fatti ",
    "edizione ", " edizioni ",
    " opere ",
    " serie ",
    " tema ", " temi ",
    " grado ", " gradi ",
    " genere ", " generi ",
    " titolo ", " titoli ",
    "oggetto ", " oggetti ",
    " tempo ", " tempi ",
    " sfondo ", " sfondi ",
    " conto ", " conti ",
    " volta ", " volte ",
    " scena ", " scene ",
    "età ",
    " figura ", " figure ",
    "epoca ", " epoche",
    "none",
    "libreria", "librerie",
    "biblioteca", "biblioteche",
    "generazione", "generazioni",
    " mano ", " mani ",
    "aspetto ",
    "titolo",
    "produzione",
    " senso ",
    " trama ", " trame ",
    "racconto", "racconti",
    "capitolo", "capitoli",
    " fondo ",
    "domanda", "risposta", "domande", "risposte",
    "versione",
    "argomento", "argomenti",
    " tesi ",
    "italiano",
    "effetto",
    " carta ",
    "bisogno", "bisogni",
    "analisi",
    "momento",
    "milione",
    "disegno",
    " dono ",
    "successo",
    "traduzione",
    " penna ",
    "regola", "regole",
    "progetto",
    "esempio",
    "narrazione",
    " data ",
    "pratica",
    "frase",
    "ultimo",
    "base",
    "attenzione",
    "motivo",
    "essere",
    "linguaggio",
    " lingua ",
    "lavoro",
    "critica",
    "aspetto",
    "introduzione",
    "episodio",
    "livello",
    "pubblicazione",
    " pezzo ",
    "interesse",
    "consiglio",
)

# Compiled once at definition time instead of once per description.
_NOISE_PATTERNS = tuple(
    re.compile(re.escape(word), re.IGNORECASE) for word in _NOISE_WORDS
)


def clean_text_light(doc_collection):
    """Strip markup residues and notable meaningless words from descriptions.

    Every fragment of ``_LITERAL_NOISE`` (case-sensitive) and every entry of
    ``_NOISE_WORDS`` (case-insensitive) is replaced by a single space, one
    entry after the other. Runs of spaces are NOT collapsed.

    The repeated entries of the original implementation that could never match
    again (a space-free pattern cannot reappear once removed, because every
    replacement inserts a space) have been dropped; the result is unchanged.

    Args:
        doc_collection: iterable of description strings.

    Returns:
        A new list with the cleaned descriptions, in the same order.
    """
    new_corpus_doc = []
    for description in doc_collection:
        for fragment in _LITERAL_NOISE:
            description = description.replace(fragment, " ")
        for pattern in _NOISE_PATTERNS:
            description = pattern.sub(" ", description)
        new_corpus_doc.append(description)
    return new_corpus_doc

In [None]:
#NOTE: books_tocsv is the output of the merged dataset (it is not defined in this cell).

#FEATURES CREATION: keywords for each book, computed on the 'content' column
#(the original notes label this variant "with comments, 80 length").

DFcontent_pandas = books_tocsv.toPandas()
corpus_doc = DFcontent_pandas['content'].to_list() #Descriptions collection
cleaned_corpus = clean_text_light(corpus_doc) #Filtering meaningless and notable words
id_list = DFcontent_pandas['book_id'].to_list()
nlp = spacy.load("it_core_news_lg") #spaCy Italian model, used to keep just the nouns

#Reduce every description to the lemmas of its in-vocabulary nouns
new_corpus_doc = [
    " ".join(
        token.lemma_
        for token in nlp(document)
        if token.pos_ == "NOUN" and not token.is_oov
    )
    for document in cleaned_corpus
]

vectorizer = TfidfVectorizer(smooth_idf=True, use_idf=True)
vectorizer.fit(new_corpus_doc) #Vocabulary/IDF creation (the transformed matrix was never used)
feature_names = vectorizer.get_feature_names_out() #Terms list

#Keyword features for several values of k. The ranking is computed once per
#book with the largest k: every smaller list is a prefix of it, so slicing
#gives the same result as four separate TF-IDF transforms.
keyword_sizes = (5, 10, 15, 20)
result = []
for descrizione, identifier in zip(new_corpus_doc, id_list):
    ranked_keywords = get_keywords(vectorizer, feature_names, descrizione, max(keyword_sizes))
    row = {'book_id': identifier}
    for size in keyword_sizes:
        row[f'top_keywords_{size}_with_comment'] = ranked_keywords[:size]
    result.append(row)

DFkeywords = pd.DataFrame(result)

DFbooks_integrated = DFcontent_pandas.merge(DFkeywords, on='book_id')

In [None]:
#WITHOUT COMMENTS: same pipeline on the 'content2' column.
#Relies on `nlp` (spaCy model) and `DFbooks_integrated` created by a previous cell.

DFcontent_pandas = books_tocsv.toPandas()
corpus_doc = DFcontent_pandas['content2'].to_list()
cleaned_corpus = clean_text_light(corpus_doc)
id_list = DFcontent_pandas['book_id'].to_list()

#Reduce every description to the lemmas of its in-vocabulary nouns
new_corpus_doc = [
    " ".join(
        token.lemma_
        for token in nlp(document)
        if token.pos_ == "NOUN" and not token.is_oov
    )
    for document in cleaned_corpus
]

vectorizer = TfidfVectorizer(smooth_idf=True, use_idf=True)
vectorizer.fit(new_corpus_doc) #Vocabulary/IDF creation (the transformed matrix was never used)
feature_names = vectorizer.get_feature_names_out() #Terms list

#The ranking is computed once per book with the largest k: every smaller list
#is a prefix of it, so slicing replaces four separate TF-IDF transforms.
keyword_sizes = (5, 10, 15, 20)
result = []
for descrizione, identifier in zip(new_corpus_doc, id_list):
    ranked_keywords = get_keywords(vectorizer, feature_names, descrizione, max(keyword_sizes))
    row = {'book_id': identifier}
    for size in keyword_sizes:
        row[f'top_keywords_{size}_without_comm'] = ranked_keywords[:size]
    result.append(row)

DFkeywords = pd.DataFrame(result)

#Adds the "without_comm" columns next to the "with_comment" ones
DFbooks_integrated_without = DFbooks_integrated.merge(DFkeywords, on='book_id')

In [None]:
# Persist the merged frame (book columns plus both sets of keyword columns).
# No `index=False` is passed, so the DataFrame index is written to the CSV too.
DFbooks_integrated_without.to_csv('2_keywords_and_books.csv')

In [None]:
#GENRE KEYWORDS
from collections import Counter

import numpy as np

# Genre label -> integer id. The loop below writes one row per genre, in this order.
mapping = {'Comics&GraphicNovels': 0,
              'Family-Sex&Relationships': 1,
              'Humor': 2,
              'History': 3,
              'ScienceFiction&Fantasy': 4,
              'Romance': 5,
              'Travel': 6,
              'Mystery&Thrillers': 7,
              'FreeTime': 8,
              'Non-fiction': 9,
              'Biography': 10,
              'SocialScience': 11,
              'Political': 12,
              'Crime': 13,
              'Children&Teens': 14,
              'Philosophy': 15,
              'Horror': 16,
              'Health-Mind&Body': 17,
              'Professional&Technical': 18,
              'Science&Nature': 19}
              #'Fiction&Literature': 20}

# Maximum number of keywords selected for each genre.
keywords_per_genre = 20


def _greedy_genre_keywords(keyword_lists, limit):
    """Greedily choose up to ``limit`` keywords covering as many books as possible.

    At each round the keyword appearing in the most still-uncovered books is
    chosen (ties: the one encountered first) and the books containing it are
    removed. Stops early when no book, or no keyword, is left.

    Returns:
        (chosen keywords in selection order, number of books covered)
    """
    remaining = list(keyword_lists)
    chosen = []
    covered = 0
    for _ in range(limit):
        occurrences = Counter(kw for kws in remaining for kw in kws)
        if not occurrences:
            # Either every book is covered or the books left have no keywords
            # at all (the latter used to raise IndexError).
            break
        best_keyword, best_count = occurrences.most_common(1)[0]
        chosen.append(best_keyword)
        covered += best_count
        remaining = [kws for kws in remaining if best_keyword not in kws]
    return chosen, covered


DFchosen_keywords = pd.DataFrame()
DFcontent_pandas = books_tocsv.toPandas()
possible_keywords = [50]
# Average coverages measured in previous runs (see the values plotted below);
# the result of the current run is appended at the end.
average_percentages = [49.086402663093516, 65.79894419074137, 75.69834836556531, 82.83928946909826, 91.31414622247611]
chosen_rows = []
for number_keywords in possible_keywords:
    percentages = []
    for key in mapping:
        print(f'Total number of books is {DFcontent_pandas.shape[0]}')
        masking = DFcontent_pandas.genre.apply(lambda x: key in x.keys())
        print(f'The current genre is {key}')
        DFcontent_pandas_masked = DFcontent_pandas[masking]
        print(f'Number of books of that genre is {DFcontent_pandas_masked.shape[0]}')
        max_coverage = DFcontent_pandas_masked.shape[0]

        keyword_lists = []
        if max_coverage:
            corpus_doc = DFcontent_pandas_masked['content'].to_list()
            cleaned_corpus_doc = clean_text_light(corpus_doc) #Noise removal

            #Keep only the lemmas of nouns present in the Italian vocabulary
            new_corpus_doc = [
                " ".join(
                    token.lemma_
                    for token in nlp(document)
                    if token.pos_ == "NOUN" and not token.is_oov
                )
                for document in cleaned_corpus_doc
            ]
            vectorizer = TfidfVectorizer(smooth_idf=True, use_idf=True)
            vectorizer.fit(new_corpus_doc) #Vocabulary built from the genre descriptions
            feature_names = vectorizer.get_feature_names_out() #Terms list

            #Top keywords of every book of the genre
            keyword_lists = [
                get_keywords(vectorizer, feature_names, descrizione, number_keywords)
                for descrizione in new_corpus_doc
            ]

        chosen_genre_keywords, current_coverage = _greedy_genre_keywords(keyword_lists, keywords_per_genre)
        # A genre without books gets an explicit 0% instead of a stale/undefined value.
        coverage_percentage = (current_coverage/max_coverage) * 100 if max_coverage else 0.0

        print(f'You have covered {coverage_percentage}% of the {key} books')
        print(f'Chosen keywords for genre {key} are:')
        print(chosen_genre_keywords)
        # DataFrame.append was removed in pandas 2.0: rebuild the frame from the
        # accumulated rows instead. The CSV is rewritten after every genre.
        chosen_rows.append({'genre': key, 'keywords': chosen_genre_keywords, 'coverage': coverage_percentage})
        DFchosen_keywords = pd.DataFrame(chosen_rows)
        DFchosen_keywords.to_csv('50_prova_keywords.csv', index=False)
        print(DFchosen_keywords)
        percentages.append(coverage_percentage)
    percentages_np = np.asarray(percentages)
    percentage_mean = np.mean(percentages_np)
    average_percentages.append(percentage_mean)
    print(average_percentages)

# Disabled on purpose: flip to True to plot the coverage recorded in previous runs.
if False:
    #Keywords graph coverage
    possible_keywords = [5, 10, 15, 20, 30, 50]
    average_percentages = [49.086402663093516, 65.79894419074137, 75.69834836556531, 82.83928946909826, 91.31414622247611, 98.63530510583139]


    import matplotlib.pyplot as plt

    plt.figure()
    plt.plot(possible_keywords, average_percentages)
    plt.xlabel('Number of keywords per book')
    plt.ylabel('Coverage percentage')
    plt.show()

In [None]:
#KEYWORD MANUAL TUNING
# Interactive loop that lets the user swap one of the keywords stored for a
# genre with another keyword extracted from that genre's book descriptions,
# shows the resulting coverage, and saves the change when it is confirmed.
import ast
from collections import Counter

KEYWORDS_CSV = '50_prova_keywords.csv'


def _coverage_percentage(keyword_lists, chosen_keywords, total):
    """Return the percentage of books matched by at least one chosen keyword.

    keyword_lists   -- one list of keywords per book
    chosen_keywords -- the keywords selected for the genre
    total           -- number of books of the genre (must be > 0)

    Each book is counted once, however many chosen keywords it contains.
    """
    chosen = set(chosen_keywords)
    covered = sum(1 for book_keywords in keyword_lists if chosen.intersection(book_keywords))
    return (covered / total) * 100


while True:
    print('Welcome to the manual tuning interface. Choose a genre.')
    print('Write exit to close the interface.')
    # Re-read on every pass so the table always shows what is saved on disk.
    DFkeywords_50 = pd.read_csv(KEYWORDS_CSV)
    display(DFkeywords_50)
    genre = input()
    if genre == 'exit':
        break
    if genre not in mapping:
        print('Error, not existing genre')
        continue

    # NOTE(review): assumes row i of the CSV holds the genre whose
    # mapping value is i -- confirm against the code that writes the CSV.
    row = mapping[genre]
    coverage = DFkeywords_50['coverage'].to_list()[row]
    stored_keywords = DFkeywords_50['keywords'].to_list()[row]
    print('These are the chosen keywords for that genre.')
    print(stored_keywords)
    print('Choose a keyword to substitute or write exit to exit.')
    substitute = input()
    if substitute == 'exit':
        break

    # The keyword list comes back from the CSV as text; literal_eval parses
    # it without executing arbitrary code the way eval() would.
    list_words = ast.literal_eval(stored_keywords)
    if substitute not in list_words:
        print()
        print('Error, keyword not present.')
        continue

    print('Calculating possible keywords..')
    masking = DFcontent_pandas.genre.apply(lambda x: genre in x.keys())
    DFcontent_pandas_masked = DFcontent_pandas[masking]
    max_coverage = DFcontent_pandas_masked.shape[0]
    if max_coverage == 0:
        # Nothing to fit the vectorizer on, and the coverage ratio would
        # divide by zero.
        print('Error, no books found for this genre.')
        continue

    corpus_doc = DFcontent_pandas_masked['content'].to_list()
    cleaned_corpus_doc = clean_text_light(corpus_doc)  # noise removal

    # Reduce every description to the lemmas of its nouns, keeping only
    # tokens spaCy does not flag as out-of-vocabulary (the original note
    # says: only nouns present in the Italian vocabulary).
    new_corpus_doc = []
    for document in cleaned_corpus_doc:
        noun_tokens = [
            token.lemma_
            for token in nlp(document)
            if token.pos_ == "NOUN" and not token.is_oov
        ]
        new_corpus_doc.append(" ".join(noun_tokens))

    # Only the fitted vocabulary/idf weights are needed here, so fit() is
    # enough; the transformed matrix was never used.
    vectorizer = TfidfVectorizer(smooth_idf=True, use_idf=True)
    vectorizer.fit(new_corpus_doc)
    feature_names = vectorizer.get_feature_names_out()

    # Top 50 TF-IDF keywords for every book of the genre. The column keeps
    # its historical name 'top_keywords_20'.
    id_list = DFcontent_pandas_masked['book_id'].to_list()
    result = [
        {
            'book_id': identifier,
            'top_keywords_20': get_keywords(vectorizer, feature_names, descrizione, 50),
        }
        for descrizione, identifier in zip(new_corpus_doc, id_list)
    ]
    DFkeywords = pd.DataFrame(result)
    keywords_list = DFkeywords['top_keywords_20'].to_list()

    # Occurrences of each keyword across the books, most frequent first.
    dict_occurrences = dict(
        Counter(
            keyword for keywords_sublist in keywords_list for keyword in keywords_sublist
        ).most_common()
    )
    print(dict_occurrences)
    print('Choose a word from this dictionary or write exit to exit.')
    word_input = input()
    if word_input == 'exit':
        break
    if word_input not in dict_occurrences:
        # Go back to the menu: there is no valid replacement to confirm.
        print('Word not existing in the dictionary')
        continue

    chosen_keyword = word_input
    list_words.remove(substitute)
    chosen_genre_keywords = list_words + [chosen_keyword]
    print(chosen_genre_keywords)

    # Count every book once. Summing raw keyword occurrences would count a
    # book several times when it contains more than one chosen keyword.
    coverage_percentage = _coverage_percentage(keywords_list, chosen_genre_keywords, max_coverage)

    print(f'Coverage goes from {coverage} to {coverage_percentage}. Proceed? (y/n)')
    final_choice = input()
    if final_choice == 'y':
        # Written cell by cell; the keyword list is stored as its text form,
        # which is how the other rows look after being read from the CSV.
        DFkeywords_50.at[row, 'genre'] = genre
        DFkeywords_50.at[row, 'keywords'] = str(chosen_genre_keywords)
        DFkeywords_50.at[row, 'coverage'] = coverage_percentage
        # Save now: the table is reloaded from disk on the next pass, so an
        # unsaved change would be lost.
        DFkeywords_50.to_csv(KEYWORDS_CSV, index=False)
    elif final_choice == 'n':
        print('Returning...')
        break
    else:
        print('Impossible choice.')
    display(DFkeywords_50)
    