## Version using TF-IDF

In [None]:
# Import libraries
import nltk
from pyspark.ml import Pipeline

nltk.download('stopwords')

from pyspark.sql.functions import *
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType
from pyspark.ml.feature import Tokenizer, StopWordsRemover
from nltk.corpus import stopwords
stopwords = stopwords.words('english')

# Initialize spark
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, HashingTF, IDF
from pyspark.ml.classification import NaiveBayes
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Initialize SparkSession
spark = SparkSession.builder.appName("MultinomialNBC").getOrCreate()

# Define the schema
ratings_schema = StructType([
    StructField("Title", StringType(), True),
    StructField("Price", FloatType(), True),
    StructField("User_id", IntegerType(), True),
    StructField("profileName", StringType(), True),
    StructField("review/score", FloatType(), True),
    StructField("review/time", IntegerType(), True),
    StructField("review/summary", StringType(), True),
    StructField("review/text", StringType(), True),
    StructField("N_helpful", IntegerType(), True),
    StructField("Tot_votes", IntegerType(), True)
])

# Load your DataFrame (assuming you have it in a variable df)
# Load the data
df_ratings = spark.read.csv('hdfs://localhost:9900/user/book_reviews/books_rating_cleaned.csv', header=True, schema=ratings_schema, sep='\t')
df_ratings.show(5)

# Filter out the data
df_ratings_filtered = df_ratings.filter(df_ratings['review/text'].isNotNull())
df_ratings_filtered = df_ratings_filtered.filter(df_ratings_filtered['review/score'] != 3)
df_ratings_filtered = df_ratings_filtered.filter(df_ratings_filtered['Tot_votes'] != 0)

# Add the helpfulness ratio column
df_ratings_filtered = df_ratings_filtered.withColumn('helpfulness_ratio', df_ratings_filtered['N_helpful']/df_ratings_filtered['Tot_votes']*sqrt(df_ratings_filtered['Tot_votes']))

# Add the class column
df_ratings_filtered = df_ratings_filtered.withColumn('class', when(df_ratings_filtered['review/score'] >= 4, 1).otherwise(0))

# Retain only the required columns
df_ratings_selected = df_ratings_filtered.select('review/text', 'helpfulness_ratio', 'class')
df_ratings_selected.show(5)

# Select relevant columns and handle missing values
df = df_ratings_selected.select("class", "review/text").na.drop()

# Tokenize the 'review/text' column
tokenizer = Tokenizer(inputCol="review/text", outputCol="words")
wordsData = tokenizer.transform(df)

# Apply TF-IDF to convert text data to numerical features
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=2000)
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(hashingTF.transform(wordsData))
rescaledData = idfModel.transform(hashingTF.transform(wordsData))

# Split the data into training and testing sets (70% training, 30% testing)
(trainingData, testData) = rescaledData.randomSplit([0.7, 0.3], seed=123)

# Create and train a Multinomial Naive Bayes classifier
nb = NaiveBayes(labelCol="class", featuresCol="features", smoothing=1.0, modelType="multinomial")
model = nb.fit(trainingData)

# Make predictions on the testing data
predictions = model.transform(testData)

# Evaluate the model
evaluator = MulticlassClassificationEvaluator(labelCol="class", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print(f"Accuracy: {accuracy}")




[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/andreaalberti/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


+--------------------+-----+-------+--------------------+------------+-----------+--------------------+--------------------+---------+---------+
|               Title|Price|User_id|         profileName|review/score|review/time|      review/summary|         review/text|N_helpful|Tot_votes|
+--------------------+-----+-------+--------------------+------------+-----------+--------------------+--------------------+---------+---------+
|'Its Only Art If ...| null|   null|Jim of Oz "jim-of...|         4.0|  940636800|Nice collection o...|This is only for ...|        7|        7|
|'Dr. Seuss: Ameri...| null|   null|       Kevin Killian|         5.0| 1095724800|   Really Enjoyed It|I don't care much...|       10|       10|
|'Dr. Seuss: Ameri...| null|   null|        John Granger|         5.0| 1078790400|Essential for eve...|If people become ...|       10|       11|
|'Dr. Seuss: Ameri...| null|   null|Roy E. Perry "ama...|         4.0| 1090713600|Phlip Nel gives s...|Theodore Seuss Ge...|      



CodeCache: size=131072Kb used=29464Kb max_used=29548Kb free=101607Kb
 bounds [0x000000010a9d8000, 0x000000010c708000, 0x00000001129d8000]
 total_blobs=11831 nmethods=10765 adapters=977
 compilation: disabled (not enough contiguous free space left)


23/09/21 11:56:49 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS

Accuracy: 0.796588805977923


                                                                                

## Version using BoW

In [5]:
# Import libraries
import nltk
from pyspark.ml import Pipeline

nltk.download('stopwords')

from pyspark.sql.functions import *
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType
from pyspark.ml.feature import Tokenizer, StopWordsRemover
from nltk.corpus import stopwords
stopwords = stopwords.words('english')

# Initialize spark
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, HashingTF, IDF
from pyspark.ml.classification import NaiveBayes
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import CountVectorizer

# Initialize SparkSession
spark = SparkSession.builder.appName("MultinomialNBC").getOrCreate()

# Define the schema
ratings_schema = StructType([
    StructField("Title", StringType(), True),
    StructField("Price", FloatType(), True),
    StructField("User_id", IntegerType(), True),
    StructField("profileName", StringType(), True),
    StructField("review/score", FloatType(), True),
    StructField("review/time", IntegerType(), True),
    StructField("review/summary", StringType(), True),
    StructField("review/text", StringType(), True),
    StructField("N_helpful", IntegerType(), True),
    StructField("Tot_votes", IntegerType(), True)
])

# Load your DataFrame (assuming you have it in a variable df)
# Load the data
df_ratings = spark.read.csv('hdfs://localhost:9900/user/book_reviews/books_rating_cleaned.csv', header=True, schema=ratings_schema, sep='\t')
df_ratings.show(5)

# Filter out the data
df_ratings_filtered = df_ratings.filter(df_ratings['review/text'].isNotNull())
df_ratings_filtered = df_ratings_filtered.filter(df_ratings_filtered['review/score'] != 3)
df_ratings_filtered = df_ratings_filtered.filter(df_ratings_filtered['Tot_votes'] != 0)

# Add the helpfulness ratio column
df_ratings_filtered = df_ratings_filtered.withColumn('helpfulness_ratio', df_ratings_filtered['N_helpful']/df_ratings_filtered['Tot_votes']*sqrt(df_ratings_filtered['Tot_votes']))

# Add the class column
df_ratings_filtered = df_ratings_filtered.withColumn('class', when(df_ratings_filtered['review/score'] >= 4, 1).otherwise(0))

# Retain only the required columns
df_ratings_selected = df_ratings_filtered.select('review/text', 'helpfulness_ratio', 'class')
df_ratings_selected.show(5)

# Select relevant columns and handle missing values
df = df_ratings_selected.select("class", "review/text").na.drop()


# Tokenize the 'review/text' column
tokenizer = Tokenizer(inputCol="review/text", outputCol="words")
wordsData = tokenizer.transform(df)

# Apply Bag of Words (BoW) to convert text data to numerical features
vectorizer = CountVectorizer(inputCol="words", outputCol="features", vocabSize=10000, minDF=5)
model = vectorizer.fit(wordsData)
rescaledData = model.transform(wordsData)

# Split the data into training and testing sets (70% training, 30% testing)
(trainingData, testData) = rescaledData.randomSplit([0.7, 0.3], seed=123)

# Create and train a Multinomial Naive Bayes classifier
nb = NaiveBayes(labelCol="class", featuresCol="features", smoothing=1.0, modelType="multinomial")
model = nb.fit(trainingData)

# Make predictions on the testing data
predictions = model.transform(testData)

# Evaluate the model
evaluator = MulticlassClassificationEvaluator(labelCol="class", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print(f"Accuracy: {accuracy}")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/andreaalberti/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


+--------------------+-----+-------+--------------------+------------+-----------+--------------------+--------------------+---------+---------+
|               Title|Price|User_id|         profileName|review/score|review/time|      review/summary|         review/text|N_helpful|Tot_votes|
+--------------------+-----+-------+--------------------+------------+-----------+--------------------+--------------------+---------+---------+
|'Its Only Art If ...| null|   null|Jim of Oz "jim-of...|         4.0|  940636800|Nice collection o...|This is only for ...|        7|        7|
|'Dr. Seuss: Ameri...| null|   null|       Kevin Killian|         5.0| 1095724800|   Really Enjoyed It|I don't care much...|       10|       10|
|'Dr. Seuss: Ameri...| null|   null|        John Granger|         5.0| 1078790400|Essential for eve...|If people become ...|       10|       11|
|'Dr. Seuss: Ameri...| null|   null|Roy E. Perry "ama...|         4.0| 1090713600|Phlip Nel gives s...|Theodore Seuss Ge...|      



Accuracy: 0.8365845697638484


                                                                                

## Possible implementation to count the words (TO CHECK)

In [None]:
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

# Assuming you've already trained the MNB model and have 'predictions' DataFrame

# Define a UDF to count positive words based on model predictions
def count_positive_words(predictions):
    positive_predictions = predictions.filter(predictions.prediction == 1)
    return len(positive_predictions)

# Register the UDF
count_positive_words_udf = udf(count_positive_words, IntegerType())

# Add a new column 'positive_word_count' to your DataFrame
df_with_positive_word_count = df.withColumn('positive_word_count', count_positive_words_udf(predictions))

# Show the DataFrame with the new column
df_with_positive_word_count.select("review/text", "positive_word_count").show()
