## Version using TF-IDF

In [None]:
# Import libraries
import nltk
from pyspark.ml import Pipeline

nltk.download('stopwords')

from pyspark.sql.functions import *
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType
from pyspark.ml.feature import Tokenizer, StopWordsRemover
from nltk.corpus import stopwords
stopwords = stopwords.words('english')

# Initialize spark
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, HashingTF, IDF
from pyspark.ml.classification import NaiveBayes
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Initialize SparkSession
spark = SparkSession.builder.appName("MultinomialNBC").getOrCreate()

# Define the schema
ratings_schema = StructType([
    StructField("Title", StringType(), True),
    StructField("Price", FloatType(), True),
    StructField("User_id", IntegerType(), True),
    StructField("profileName", StringType(), True),
    StructField("review/score", FloatType(), True),
    StructField("review/time", IntegerType(), True),
    StructField("review/summary", StringType(), True),
    StructField("review/text", StringType(), True),
    StructField("N_helpful", IntegerType(), True),
    StructField("Tot_votes", IntegerType(), True)
])

# Load your DataFrame (assuming you have it in a variable df)
# Load the data
df_ratings = spark.read.csv('hdfs://localhost:9900/user/book_reviews/books_rating_cleaned.csv', header=True, schema=ratings_schema, sep='\t')
df_ratings.show(5)

# Filter out the data
df_ratings_filtered = df_ratings.filter(df_ratings['review/text'].isNotNull())
df_ratings_filtered = df_ratings_filtered.filter(df_ratings_filtered['review/score'] != 3)
df_ratings_filtered = df_ratings_filtered.filter(df_ratings_filtered['Tot_votes'] != 0)

# Add the helpfulness ratio column
df_ratings_filtered = df_ratings_filtered.withColumn('helpfulness_ratio', df_ratings_filtered['N_helpful']/df_ratings_filtered['Tot_votes']*sqrt(df_ratings_filtered['Tot_votes']))

# Add the class column
df_ratings_filtered = df_ratings_filtered.withColumn('class', when(df_ratings_filtered['review/score'] >= 4, 1).otherwise(0))

# Retain only the required columns
df_ratings_selected = df_ratings_filtered.select('review/text', 'helpfulness_ratio', 'class')
df_ratings_selected.show(5)

# Select relevant columns and handle missing values
df = df_ratings_selected.select("class", "review/text").na.drop()

# Tokenize the 'review/text' column
tokenizer = Tokenizer(inputCol="review/text", outputCol="words")
wordsData = tokenizer.transform(df)

# Apply TF-IDF to convert text data to numerical features
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=2000)
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(hashingTF.transform(wordsData))
rescaledData = idfModel.transform(hashingTF.transform(wordsData))

# Split the data into training and testing sets (70% training, 30% testing)
(trainingData, testData) = rescaledData.randomSplit([0.7, 0.3], seed=123)

# Create and train a Multinomial Naive Bayes classifier
nb = NaiveBayes(labelCol="class", featuresCol="features", smoothing=1.0, modelType="multinomial")
model = nb.fit(trainingData)

# Make predictions on the testing data
predictions = model.transform(testData)

# Evaluate the model
evaluator = MulticlassClassificationEvaluator(labelCol="class", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print(f"Accuracy: {accuracy}")




## Version using BoW

In [None]:
# Import libraries
import nltk
from pyspark.ml import Pipeline

nltk.download('stopwords')

from pyspark.sql.functions import *
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType
from pyspark.ml.feature import Tokenizer, StopWordsRemover
from nltk.corpus import stopwords
stopwords = stopwords.words('english')

# Initialize spark
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, HashingTF, IDF
from pyspark.ml.classification import NaiveBayes
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import CountVectorizer

# Initialize SparkSession
spark = SparkSession.builder.appName("MultinomialNBC").getOrCreate()

# Define the schema
ratings_schema = StructType([
    StructField("Title", StringType(), True),
    StructField("Price", FloatType(), True),
    StructField("User_id", IntegerType(), True),
    StructField("profileName", StringType(), True),
    StructField("review/score", FloatType(), True),
    StructField("review/time", IntegerType(), True),
    StructField("review/summary", StringType(), True),
    StructField("review/text", StringType(), True),
    StructField("N_helpful", IntegerType(), True),
    StructField("Tot_votes", IntegerType(), True)
])

# Load your DataFrame (assuming you have it in a variable df)
# Load the data
df_ratings = spark.read.csv('hdfs://localhost:9900/user/book_reviews/books_rating_cleaned.csv', header=True, schema=ratings_schema, sep='\t')
df_ratings.show(5)

# Filter out the data
df_ratings_filtered = df_ratings.filter(df_ratings['review/text'].isNotNull())
df_ratings_filtered = df_ratings_filtered.filter(df_ratings_filtered['review/score'] != 3)
df_ratings_filtered = df_ratings_filtered.filter(df_ratings_filtered['Tot_votes'] != 0)

# Add the helpfulness ratio column
df_ratings_filtered = df_ratings_filtered.withColumn('helpfulness_ratio', df_ratings_filtered['N_helpful']/df_ratings_filtered['Tot_votes']*sqrt(df_ratings_filtered['Tot_votes']))

# Add the class column
df_ratings_filtered = df_ratings_filtered.withColumn('class', when(df_ratings_filtered['review/score'] >= 4, 1).otherwise(0))

# Retain only the required columns
df_ratings_selected = df_ratings_filtered.select('review/text', 'helpfulness_ratio', 'class')
df_ratings_selected.show(5)

# Select relevant columns and handle missing values
df = df_ratings_selected.select("class", "review/text").na.drop()


# Tokenize the 'review/text' column
tokenizer = Tokenizer(inputCol="review/text", outputCol="words")
wordsData = tokenizer.transform(df)

# Apply Bag of Words (BoW) to convert text data to numerical features
vectorizer = CountVectorizer(inputCol="words", outputCol="features", vocabSize=10000, minDF=5)
model_cv = vectorizer.fit(wordsData)
rescaledData = model_cv.transform(wordsData)

# Split the data into training and testing sets (70% training, 30% testing)
(trainingData, testData) = rescaledData.randomSplit([0.7, 0.3], seed=123)

# Create and train a Multinomial Naive Bayes classifier
nb = NaiveBayes(labelCol="class", featuresCol="features", smoothing=1.0, modelType="multinomial")
model_nb = nb.fit(trainingData)

# Make predictions on the testing data
predictions = model_nb.transform(testData)

# Evaluate the model
evaluator = MulticlassClassificationEvaluator(labelCol="class", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print(f"Accuracy: {accuracy}")

## Possible implementation to count the words (TO CHECK)

In [1]:
# Import libraries
import nltk
from pyspark.ml import Pipeline

nltk.download('stopwords')

from pyspark.sql.functions import *
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType
from pyspark.ml.feature import Tokenizer, StopWordsRemover
from nltk.corpus import stopwords
stopwords = stopwords.words('english')

# Initialize spark
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, HashingTF, IDF
from pyspark.ml.classification import NaiveBayes
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import CountVectorizer

# Import necessary libraries
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer
from pyspark.ml.classification import NaiveBayes
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession

# Initialize SparkSession
spark = SparkSession.builder.appName("MultinomialNBC").getOrCreate()

# Define the schema
ratings_schema = StructType([
    StructField("Title", StringType(), True),
    StructField("Price", FloatType(), True),
    StructField("User_id", IntegerType(), True),
    StructField("profileName", StringType(), True),
    StructField("review/score", FloatType(), True),
    StructField("review/time", IntegerType(), True),
    StructField("review/summary", StringType(), True),
    StructField("review/text", StringType(), True),
    StructField("N_helpful", IntegerType(), True),
    StructField("Tot_votes", IntegerType(), True)
])

# Load your DataFrame (assuming you have it in a variable df)
# Load the data
df_ratings = spark.read.csv('hdfs://localhost:9900/user/book_reviews/books_rating_cleaned.csv', header=True, schema=ratings_schema, sep='\t')
df_ratings.show(5)

# Filter out the data
df_ratings_filtered = df_ratings.filter(df_ratings['review/text'].isNotNull())
df_ratings_filtered = df_ratings_filtered.filter(df_ratings_filtered['review/score'] != 3)
df_ratings_filtered = df_ratings_filtered.filter(df_ratings_filtered['Tot_votes'] != 0)

# Add the helpfulness ratio column
df_ratings_filtered = df_ratings_filtered.withColumn('helpfulness_ratio', df_ratings_filtered['N_helpful']/df_ratings_filtered['Tot_votes']*sqrt(df_ratings_filtered['Tot_votes']))

# Add the class column
df_ratings_filtered = df_ratings_filtered.withColumn('class', when(df_ratings_filtered['review/score'] >= 4, 1).otherwise(0))

# Retain only the required columns
df_ratings_selected = df_ratings_filtered.select('review/text', 'helpfulness_ratio', 'class')
df_ratings_selected.show(5)

# Select relevant columns and handle missing values
df = df_ratings_selected.select("class", "review/text").na.drop()







# Tokenize the 'review/text' column
tokenizer = Tokenizer(inputCol="review/text", outputCol="words")

# Remove stopwords
stop_words_remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")

# Convert words to a BoW feature vector
vectorizer = CountVectorizer(inputCol="filtered_words", outputCol="features")

# Create a Naive Bayes model
nb = NaiveBayes(labelCol="class", featuresCol="features", predictionCol="prediction")

# Create a pipeline
pipeline = Pipeline(stages=[tokenizer, stop_words_remover, vectorizer, nb])


# Fit the pipeline on your data
model = pipeline.fit(df)

# Fit the Multinomial Naive Bayes model on the training data
nb_model = model.stages[-1]

# Get the vocabulary
vocabulary = model.stages[2].vocabulary

# Get the word probabilities for class 1
class_1_probs = nb_model.theta.toArray()[1]



import pandas as pd
results = pd.DataFrame({'word': vocabulary, 'prob': class_1_probs})

# Sort the DataFrame by descending word probabilities and take top 2000
results = results.sort_values(by='prob', ascending=False).head(2000)
results



[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/andreaalberti/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
23/09/21 18:00:53 WARN Utils: Your hostname, MacBook-Pro-di-Andrea.local resolves to a loopback address: 127.0.0.1; using 192.168.1.148 instead (on interface en0)
23/09/21 18:00:53 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/09/21 18:00:53 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


+--------------------+-----+-------+--------------------+------------+-----------+--------------------+--------------------+---------+---------+
|               Title|Price|User_id|         profileName|review/score|review/time|      review/summary|         review/text|N_helpful|Tot_votes|
+--------------------+-----+-------+--------------------+------------+-----------+--------------------+--------------------+---------+---------+
|'Its Only Art If ...| null|   null|Jim of Oz "jim-of...|         4.0|  940636800|Nice collection o...|This is only for ...|        7|        7|
|'Dr. Seuss: Ameri...| null|   null|       Kevin Killian|         5.0| 1095724800|   Really Enjoyed It|I don't care much...|       10|       10|
|'Dr. Seuss: Ameri...| null|   null|        John Granger|         5.0| 1078790400|Essential for eve...|If people become ...|       10|       11|
|'Dr. Seuss: Ameri...| null|   null|Roy E. Perry "ama...|         4.0| 1090713600|Phlip Nel gives s...|Theodore Seuss Ge...|      



CodeCache: size=131072Kb used=22902Kb max_used=24185Kb free=108169Kb
 bounds [0x000000010a1d8000, 0x000000010b998000, 0x00000001121d8000]
 total_blobs=9423 nmethods=8434 adapters=902
 compilation: disabled (not enough contiguous free space left)


23/09/21 18:02:07 WARN MemoryStore: Not enough space to cache rdd_17_11 in memory! (computed 15.3 MiB so far)
23/09/21 18:02:07 WARN BlockManager: Persisting block rdd_17_11 to disk instead.
23/09/21 18:02:07 WARN MemoryStore: Not enough space to cache rdd_17_12 in memory! (computed 13.5 MiB so far)
23/09/21 18:02:07 WARN BlockManager: Persisting block rdd_17_12 to disk instead.
23/09/21 18:02:18 WARN DAGScheduler: Broadcasting large task binary with size 2.9 MiB
23/09/21 18:03:07 WARN DAGScheduler: Broadcasting large task binary with size 2.9 MiB
                                                                                

Unnamed: 0,word,prob
0,book,-4.033910
1,read,-4.877841
2,one,-4.920655
3,like,-5.419392
4,story,-5.570671
...,...,...
2059,soldiers,-9.490721
2113,plans,-9.491529
1818,gotten,-9.491731
1995,mass,-9.491832
