In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/amazon-books-reviews/books_data.csv
/kaggle/input/amazon-books-reviews/Books_rating.csv


In [2]:
data = pd.read_csv('/kaggle/input/amazon-books-reviews/Books_rating.csv')

In [3]:
import re
import nltk.corpus
nltk.download('stopwords')
import numpy as np
import pandas as pd
import statistics as st
from typing import Union, Text
from collections.abc import Iterable
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

[nltk_data] Error loading stopwords: <urlopen error [Errno -3]
[nltk_data]     Temporary failure in name resolution>


In [4]:
samples = data.sample(1000)
examples = samples['review/text'].values.tolist() 
ratings = samples['review/helpfulness'].values
ratings[ratings == '0/0'] = '0/1'
ratings = np.array([int(x.split('/')[0]) / int(x.split('/')[1]) for x in ratings])

In [5]:
class TfIdfRater:
    def __init__(self) -> None:
        self.vectorizer = TfidfVectorizer(use_idf = True, min_df=3)
        self.stemmer = PorterStemmer()
        
        self.stop = stopwords.words('english') + ['hey', 'hi'] # stopwords
        self.stemmed_stop = [self.stemmer.stem(x) for x in self.stop] # stemmed stopwords
        
        self.cleaned_texts = []
        self.result = []
        
        # regular expression for cleanup
        self.re_form = r"(@\[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?"
        
    def cleanup(self, text: Text) -> Text:
        """
        Perform text cleaning and preprocessing operations on the input text.

        Parameters:
        text (Text): The input text to be cleaned and preprocessed.

        Returns:
        Text: The preprocessed and cleaned text output.

        Method Overview:
        - Lowercasing the text to ensure uniformity in handling text data.
        - Stripping leading and trailing spaces from the text to eliminate unnecessary whitespace.
        - Removing punctuation and links using a predefined regular expression pattern.
        - Normalizing redundant whitespace by consolidating multiple spaces into single spaces.
        - Stemming the words using the provided stemmer instance.
        - Removing common stop words to refine the text for further analysis.

        Example:
        cleaned_text = cleanup("The quick brown fox jumps over the lazy dog.")
        # Output: 'quick brown fox jump over lazy dog'

        Notes:
        - This function is designed to prepare text data for natural language processing tasks.
        - The stemming and stop word removal steps may result in a loss of information based on specific application requirements.
        """

        text = text.lower() # to lowercase
        text = text.strip() # remove useless spaces
        text = re.sub(self.re_form, "", text) # remove punctuation and links
        text = re.sub(' +', ' ',text) # remove useless spaces again
        text = ' '.join([self.stemmer.stem(word) for word in text.split(' ')]) # stemming
        text = " ".join([word for word in text.split() if word not in self.stemmed_stop]) # remove stopwords
        return text
        
    def rate(self, texts: Iterable[Text]) -> np.array:
        """
        Cleans the input texts, fits a TF-IDF vectorizer, and calculates the mean TF-IDF score for each text, resulting in a rating.

        Parameters:
        texts (Iterable[Text]): A collection of input texts to be rated.

        Returns:
        np.array: An array containing the TF-IDF rating for each input text.

        Method Overview:
        - Cleans each input text using the cleanup method and stores the cleaned texts.
        - Fits a TF-IDF vectorizer to the cleaned texts to prepare for vectorization and TF-IDF score calculation.
        - Calculates the mean TF-IDF score for each cleaned text, resulting in a rating.
        - Scales the calculated ratings based on the maximum IDF value observed across all texts.

        Example:
        rated_texts = rate(["This is an example text.", "Another example for rating."])
        # Output: array([0.63, 1.00])

        Notes:
        - This function processes the input texts, calculates their TF-IDF ratings, and ensures that the ratings are scaled appropriately based on the maximum IDF value observed across all texts.
        """

        for text in texts:
            cleaned_text = self.cleanup(text)
            self.cleaned_texts.append(cleaned_text)

        self.vectorizer.fit(self.cleaned_texts)

        mx_len = max([len(x) for x in self.cleaned_texts])
        for idx, cleaned_text in enumerate(self.cleaned_texts):
            out = self.vectorizer.transform([cleaned_text]).mean()
            self.result.append(out)
        mx_idf = max(self.result)
        self.result = np.array(self.result) / mx_idf # scaling
        return self.result

In [9]:
rater = TfIdfRater()
out = rater.rate(examples)
st.correlation(out, ratings)

0.31754623604460247

In [12]:
examples = np.array(examples)

In [15]:
examples[out.argsort()][-3:]

array(["The Scarlet Letter is truly one of literature's greatest triumphs, its characters and themes reverberating in our collective consciousness more than 150 years after its initial publication. Few novels inspire as much contemplation and feeling on the part of the reader. Hester Prynne, American fiction's first and foremost female heroine continues to haunt this world, inspiring a never-ending stream of scholarly debate. Even in our less puritanical age, some doubtless see her as a villainously great temptress, but to me she is a remarkably brave hero indeed. Her sin is known to all, and she never runs away from it, bearing the scarlet letter on her bosom bravely for all to see; she realizes the true measure of that sin, fretting constantly over the effects it will have on young Pearl, remaining steadfast in her beliefs while at the same time envisioning a new society where women and men can exist on more equal terms, free of the stultifyingly harsh punishments meted out on even t

In [16]:
examples[out.argsort()][:3]

array(['He kills humanit',
       'i love this book more than any other. it is brilliant',
       'Vinte mil l&eacute;guas submarinas &eacute; um cl&aacute;ssico de fico cient&iacute;fica, escrito por Julio Verne, em 1870. A diferena &eacute; que trata-se de uma aventura submarina.O livro conta a est&oacute;ria &eacute;pica do Capito Nemo e de seu submarino Nautilus, do Polo Sul, at&eacute; a cidade submersa de Atlntida, pela perspectiva do Professor Pierre Aronnax, em direo &agrave; destruio total de sua tripulao.A forma de contar uma est&oacute;ria atrav&eacute;s de um observador intelectual, para melhor explicar as caracter&iacute;sticas psicol&oacute;gicas de um personagem, era muito popular na &eacute;poca (pensem no Watson de Sherlock Holmes e no Capito Hastings de Hercule Poirot).Essa premissa &eacute; importante, principalmente na cena em que, ap&oacute;s a destruio, ele faz uma homenagem a uma foto de sua esposa e filhos, sugerindo que sua fam&iacute;lia teria sido sequestrada