In [1]:
!pip install pyspark

Defaulting to user installation because normal site-packages is not writeable


In [2]:
from pyspark.sql import SparkSession

# Obtain the active SparkSession, or build one with default settings.
spark = (
    SparkSession.builder
    .getOrCreate()
)

In [3]:
from pyspark import SparkContext

# Stop the existing SparkContext if any.
# `sc` is not defined anywhere earlier in this notebook, so referencing it
# here raised NameError on a fresh kernel. getOrCreate() returns the context
# that is currently active (or a throwaway one when none exists), which is
# then stopped so a new context can be created below.
# NOTE(review): this presumably also invalidates the `spark` session created
# above, since a session wraps the active context -- confirm that is intended.
SparkContext.getOrCreate().stop()

# Initialize a new SparkContext running locally under the name "New App"
sc = SparkContext(master="local", appName="New App")


In [4]:
from pyspark.sql import SparkSession

# Get (or create) the session, requesting the application name
# "Customer Reviews Analysis" on the builder.
spark = (
    SparkSession.builder
    .appName("Customer Reviews Analysis")
    .getOrCreate()
)

In [5]:
# Load the test and train review CSVs from HDFS, treating the first line of
# each file as a header and letting Spark infer the column types.
# NOTE(review): the resulting column names are the values of a review record
# (see the renames further down), so these files appear to have no header
# row -- confirm whether header=True is really wanted here.
test_data = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("hdfs://localhost:9000/user/hduser/CustomerReviews/test/test.csv")
)
train_data = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("hdfs://localhost:9000/user/hduser/CustomerReviews/train/train.csv")
)


                                                                                

In [6]:
# Preview the leading rows of each split: test first, then train
for frame in (test_data, train_data):
    frame.show()

# Then print the schema of each split, in the same order
for frame in (test_data, train_data):
    frame.printSchema()

+---+--------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|  1|     mens ultrasheer|This model may be ok for sedentary types, but I'm active and get around alot in my job - consistently found these stockings rolled up down by my ankles! Not Good!! Solution: go with the standard compression stocking, 20-30, stock #114622. Excellent support, stays up and gives me what I need. Both pair of these also tore as I struggled to pull them up all the time. Good riddance/bad investment!|
+---+--------------------+------------------------------------------------------------------------------------------------------------------------------

In [7]:
from pyspark.ml.feature import Tokenizer, HashingTF, IDF

# Feature stages, chained by column name:
#   review_text -> tokens -> rawFeatures -> features
tokenizer = Tokenizer(
    inputCol="review_text",
    outputCol="tokens",
)
hashingTF = HashingTF(
    inputCol="tokens",
    outputCol="rawFeatures",
)
idf = IDF(
    inputCol="rawFeatures",
    outputCol="features",
)

In [8]:
# Re-read the test split with explicit CSV parsing options.
#
# The file has no header row: with header=True the first review record
# (rating "1", title "mens ultrasheer", ...) was consumed as column names and
# dropped from the data. Read it as data instead and name the three columns
# directly. The later withColumnRenamed calls on test_data then simply find
# nothing to rename, and the final column names are unchanged.
#
# escape='"' makes the parser treat a doubled quote ("") inside a quoted
# field as a literal quote (the usual CSV convention) instead of ending the
# field early.
test_data = (
    spark.read.csv(
        "hdfs://localhost:9000/user/hduser/CustomerReviews/test/test.csv",
        header=False,
        inferSchema=True,
        sep=",",
        quote='"',
        escape='"',
    )
    .toDF("rating", "title", "review_text")
)


                                                                                

In [9]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression

# Classifier: reads the "features" vector column and the "label" column
lr = LogisticRegression(
    featuresCol="features",
    labelCol="label",
)

# Chain the text-feature stages and the classifier into one pipeline
pipeline = Pipeline(
    stages=[
        tokenizer,
        hashingTF,
        idf,
        lr,
    ]
)


In [10]:
# Give both DataFrames meaningful column names. The current names are the
# literal values of a review record, so each rename targets that exact text.
test_data = (
    test_data
    .withColumnRenamed("1", "rating")
    .withColumnRenamed("mens ultrasheer", "title")
    .withColumnRenamed("This model may be ok for sedentary types, but I'm active and get around alot in my job - consistently found these stockings rolled up down by my ankles! Not Good!! Solution: go with the standard compression stocking, 20-30, stock #114622. Excellent support, stays up and gives me what I need. Both pair of these also tore as I struggled to pull them up all the time. Good riddance/bad investment!", "review_text")
)

# In the train split the review text arrives as two columns, which are
# renamed to review_part1 / review_part2 here.
train_data = (
    train_data
    .withColumnRenamed('3', 'rating')
    .withColumnRenamed('more like funchuck', 'title')
    .withColumnRenamed('"Gave this to my dad for a gag gift after directing ""Nunsense', 'review_part1')
    .withColumnRenamed('"" he got a reall kick out of it!"', 'review_part2')
)



In [11]:
# List the train column names to confirm the renames took effect
print(train_data.schema.names)

['rating', 'title', 'review_part1', 'review_part2']


In [12]:
# List the test column names to confirm the renames took effect
print(test_data.schema.names)

['rating', 'title', 'review_text']


In [13]:
from pyspark.sql.functions import concat_ws

# Join the two review fragments into a single `review_text` column,
# separated by one space.
train_data = train_data.withColumn(
    "review_text",
    concat_ws(" ", train_data["review_part1"], train_data["review_part2"]),
)


In [14]:
# The classifier is configured with labelCol="label", so expose the rating
# under that name.
train_data = train_data.withColumnRenamed(existing="rating", new="label")

In [16]:
pip install textblob

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [18]:
from pyspark.sql.functions import udf
from pyspark.sql.types import FloatType
# Define a UDF to compute sentiment score
def sentiment_score(text):
    """Return the TextBlob sentiment polarity of ``text``.

    Args:
        text: The review text to score, or ``None``.

    Returns:
        The polarity reported by TextBlob as a float, or ``None`` when
        ``text`` is ``None``, so a null input yields a null score instead of
        raising inside the UDF.

    Raises:
        ImportError: If the ``textblob`` package is not installed.
    """
    if text is None:
        return None

    # Imported inside the function because the module-level import was
    # commented out, leaving `TextBlob` undefined when the UDF actually ran.
    # If TextBlob cannot be installed, replace the lines below with your own
    # sentiment analysis logic.
    from textblob import TextBlob

    return TextBlob(text).sentiment.polarity

# Wrap the Python function as a Spark UDF that yields a float column
sentiment_score_udf = udf(sentiment_score, returnType=FloatType())

# Derive a new DataFrame from `train_data` with a `sentiment_score` column
# computed from `review_text`
train_data_with_sentiment = train_data.withColumn(
    "sentiment_score",
    sentiment_score_udf(train_data.review_text),
)

# Print the DataFrame object itself: this emits its column names and types,
# not its rows
print(train_data_with_sentiment)


DataFrame[label: int, title: string, review_part1: string, review_part2: string, review_text: string, sentiment_score: float]


In [19]:
from pyspark.ml.feature import VectorAssembler

# Text stages, chained by column name:
#   review_text -> tokens -> rawFeatures -> tfidf_features
tokenizer = Tokenizer(
    inputCol="review_text",
    outputCol="tokens",
)
hashingTF = HashingTF(
    inputCol="tokens",
    outputCol="rawFeatures",
)
idf = IDF(
    inputCol="rawFeatures",
    outputCol="tfidf_features",
)

# Combine the TF-IDF vector with the sentiment score into one feature vector
assembler = VectorAssembler(inputCols=["tfidf_features", "sentiment_score"], outputCol="final_features")

# Classifier trained on the combined vector
lr = LogisticRegression(featuresCol="final_features", labelCol="label")

# Assemble every stage, in execution order, into a single pipeline
pipeline = Pipeline(
    stages=[
        tokenizer,
        hashingTF,
        idf,
        assembler,
        lr,
    ]
)




In [26]:
# Print the schema of the sentiment-augmented training DataFrame to check
# that `sentiment_score` is present alongside the label and review columns.
train_data_with_sentiment.printSchema()

root
 |-- label: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- review_part1: string (nullable = true)
 |-- review_part2: string (nullable = true)
 |-- review_text: string (nullable = false)
 |-- sentiment_score: float (nullable = true)

