In [2]:
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, split, explode


# Build (or reuse) the notebook's Spark session with explicit memory,
# shuffle and serializer-buffer settings.
spark = (
    SparkSession.builder
    .appName("Model")
    # JVM heap for driver and executors, plus an off-heap region
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "4g")
    .config("spark.memory.offHeap.enabled", "true")
    .config("spark.memory.offHeap.size", "2g")
    # Partition counts for SQL shuffles and RDD operations
    .config("spark.sql.shuffle.partitions", "20")
    .config("spark.default.parallelism", "20")
    .config("spark.kryoserializer.buffer.max", "512m")
    .getOrCreate()
)

# Resolve column names case-sensitively in Spark SQL.
spark.conf.set('spark.sql.caseSensitive', True)

In [3]:
# Load the user reviews from the local Parquet dataset.
user_reviews = spark.read.parquet("Dataset/user_reviews.parquet")

In [4]:
# Preview the first five reviews without truncating long cell values.
user_reviews.show(n=5, truncate=False)

+------------+--------------------------------------------------------+------+------------+----------------------------+----------+-----------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------+------------+-----------------+-------------------+
|review_id   |ti

In [5]:
# Work on at most 10,000 reviews to keep the exploratory cells cheap.
user_reviews_demo = user_reviews.limit(10_000)

In [6]:
# Inspect the (column name, Spark type) pairs of the demo subset.
user_reviews_demo.dtypes

[('review_id', 'bigint'),
 ('title', 'string'),
 ('rating', 'double'),
 ('rating_label', 'string'),
 ('user_id', 'string'),
 ('asin', 'string'),
 ('parent_asin', 'string'),
 ('text', 'string'),
 ('text_length', 'int'),
 ('helpful_vote', 'bigint'),
 ('verified_purchase', 'boolean'),
 ('datetime', 'string')]

In [7]:
# Load the product metadata and preview five rows with full-width values.
metadata = spark.read.parquet("Dataset/metadata.parquet")
metadata.show(n=5, truncate=False)

+-----------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------+-------------+-------------------------+-------------------------------------------------------------------+-----+-----------+
|parent_asin|title                                                                                                                                                                                                   |average_rating|rating_number|main_category            |categories                                                         |price|store      |
+-----------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------+-------------+-------------------------+-------------------

In [8]:
# Attach product metadata to each demo review; reviews without a matching
# parent_asin in the metadata are dropped by the inner join.
complete_df = user_reviews_demo.join(
    metadata,
    on=user_reviews_demo.parent_asin == metadata.parent_asin,
    how="inner",
)

In [10]:
# Project the joined frame down to the modelling columns. Columns are picked
# through their source DataFrame because "title" and "parent_asin" exist on
# both sides of the join; the two titles are renamed to tell them apart.
hybrid_df = complete_df.select([
    user_reviews_demo["review_id"],
    # review headline vs. product name
    user_reviews_demo["title"].alias("review_title"),
    metadata["title"].alias("product_title"),
    # this review's rating vs. the product-level aggregates
    user_reviews_demo["rating"],
    metadata["average_rating"],
    metadata["rating_number"],
    # identifiers
    user_reviews_demo["asin"].alias("item_id"),
    metadata["parent_asin"],
    user_reviews_demo["user_id"],
    user_reviews_demo["helpful_vote"],
    metadata["categories"],
    # review body and its label
    user_reviews_demo["text"].alias("review_text"),
    user_reviews_demo["rating_label"],
])

In [11]:
# Print the column names, types and nullability of the projected frame.
hybrid_df.printSchema()

root
 |-- review_id: long (nullable = true)
 |-- review_title: string (nullable = true)
 |-- product_title: string (nullable = true)
 |-- rating: double (nullable = true)
 |-- average_rating: double (nullable = true)
 |-- rating_number: long (nullable = true)
 |-- item_id: string (nullable = true)
 |-- parent_asin: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- helpful_vote: long (nullable = true)
 |-- categories: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- review_text: string (nullable = true)
 |-- rating_label: string (nullable = true)



## Recommendation model


In [16]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, CountVectorizer, HashingTF, IDF, Tokenizer, StopWordsRemover, Word2Vec
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator, BinaryClassificationEvaluator
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier
from pyspark.ml.clustering import KMeans
from pyspark.sql.functions import col, udf, explode, array_join, concat, lit, when, regexp_replace, collect_list
from pyspark.sql.types import FloatType, ArrayType, StringType, DoubleType
from pyspark.ml.linalg import Vectors, VectorUDT, DenseVector
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder



In [13]:
# Simplified recommender with a lower memory footprint
def build_hybrid_recommender(user_reviews_df, metadata_df, sample_size=10000,
                             checkpoint_dir="/tmp/checkpoint", seed=42):
    """Train an ALS model on a sample of reviews and recommend 5 items per user.

    Only reviews whose ``parent_asin`` also appears in ``metadata_df`` are
    used for training. The metadata acts purely as a filter: no
    metadata-derived or sentiment-derived feature reaches the returned model
    or recommendations.

    Relies on the notebook-level ``spark`` session to configure the
    checkpoint directory.

    Args:
        user_reviews_df: Spark DataFrame with at least the columns
            ``user_id``, ``asin``, ``parent_asin`` and ``rating``.
        metadata_df: Spark DataFrame with at least a ``parent_asin`` column.
        sample_size: Maximum number of review rows taken via ``limit``.
            NOTE(review): without an ordering, which rows ``limit`` returns
            is presumably not guaranteed stable across runs.
        checkpoint_dir: Directory handed to ``SparkContext.setCheckpointDir``.
        seed: Seed passed to ALS so repeated runs on the same rows start
            from the same factor initialisation.

    Returns:
        Tuple ``(als_model, user_recs)``: the fitted ALS model and the
        DataFrame produced by ``recommendForAllUsers(5)``. Users and items
        are identified by the ``user_index`` / ``item_index`` values that the
        string indexers assign, not by the original ids.
    """
    from pyspark.sql.functions import broadcast

    # 1. Sample the reviews and keep only the columns the model consumes.
    reviews_sample = user_reviews_df.limit(sample_size).select(
        "user_id", "asin", "parent_asin", "rating"
    )

    # 2. Keep reviews of products present in the metadata. Only the join key
    #    is broadcast, so the broadcast table stays as small as possible. Row
    #    multiplicity matches an inner join against the full metadata rows.
    product_keys = metadata_df.select("parent_asin")
    rating_data = reviews_sample.join(
        broadcast(product_keys), on="parent_asin", how="inner"
    ).select("user_id", "asin", "rating")

    # 3. ALS needs numeric ids: map the string ids to indices.
    user_indexer = StringIndexer(inputCol="user_id", outputCol="user_index", handleInvalid="skip")
    item_indexer = StringIndexer(inputCol="asin", outputCol="item_index", handleInvalid="skip")
    rating_data = user_indexer.fit(rating_data).transform(rating_data)
    rating_data = item_indexer.fit(rating_data).transform(rating_data)

    # 4. Checkpoint the training data. checkpoint() returns a new DataFrame
    #    and leaves the receiver untouched, so the result must be rebound for
    #    ALS to train on the checkpointed frame.
    sc = spark.sparkContext
    sc.setCheckpointDir(checkpoint_dir)
    rating_data = rating_data.checkpoint()

    # 5. Low rank and few iterations keep the memory footprint down.
    als = ALS(
        maxIter=5,
        regParam=0.1,
        rank=10,
        userCol="user_index",
        itemCol="item_index",
        ratingCol="rating",
        coldStartStrategy="drop",
        nonnegative=True,
        implicitPrefs=False,
        seed=seed
    )
    als_model = als.fit(rating_data)

    # 6. Generate the top 5 recommendations for every user.
    user_recs = als_model.recommendForAllUsers(5)

    return als_model, user_recs

In [15]:
# Train the recommender and preview its output. Any failure is reported
# together with a few troubleshooting hints instead of a traceback.
try:
    als_model, user_recs = build_hybrid_recommender(
        user_reviews, metadata, sample_size=10000
    )

    # Flatten the per-user recommendation array into one row per
    # (user, recommended item) pair.
    user_recs_exploded = user_recs.withColumn(
        "recommendations", explode(col("recommendations"))
    )
    recommendations_df = user_recs_exploded.select(
        "user_index",
        col("recommendations.item_index").alias("item_index"),
        col("recommendations.rating").alias("predicted_rating"),
    )

    recommendations_df.show(5)

    print("Model completed successfully!")
except Exception as e:
    print(f"Error: {e}")

    # Troubleshooting hints, one per output line
    print(
        "\nDiagnostic Information:",
        "1. Check your system resources (try running 'free -h' in terminal)",
        "2. Spark UI should be available at http://localhost:4040 for debugging",
        "3. Try reducing sample_size further if still encountering issues",
        sep="\n",
    )

+----------+----------+----------------+
|user_index|item_index|predicted_rating|
+----------+----------+----------------+
|         1|       483|        6.103487|
|         1|       846|       5.8908634|
|         1|      5672|        5.872067|
|         1|      4635|        5.872067|
|         1|      4594|        5.872067|
+----------+----------+----------------+
only showing top 5 rows

Model completed successfully!
