In [1]:
!pip install pyspark

Defaulting to user installation because normal site-packages is not writeable


In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

In [3]:
from pyspark import SparkContext

# Stop the existing SparkContext if any
SparkContext.stop(sc)

# Initialize a new SparkContext
sc = SparkContext(master="local", appName="New App")


In [4]:
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("Customer Reviews Analysis") \
    .getOrCreate()

In [5]:
test_data = spark.read.csv("hdfs://localhost:9000/user/hduser/CustomerReviews/test/test.csv", header=True, inferSchema=True)
train_data = spark.read.csv("hdfs://localhost:9000/user/hduser/CustomerReviews/train/train.csv", header=True, inferSchema=True)


                                                                                

In [6]:
# Show first few rows
test_data.show()
train_data.show()

# Show schema
test_data.printSchema()
train_data.printSchema()

+---+--------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|  1|     mens ultrasheer|This model may be ok for sedentary types, but I'm active and get around alot in my job - consistently found these stockings rolled up down by my ankles! Not Good!! Solution: go with the standard compression stocking, 20-30, stock #114622. Excellent support, stays up and gives me what I need. Both pair of these also tore as I struggled to pull them up all the time. Good riddance/bad investment!|
+---+--------------------+------------------------------------------------------------------------------------------------------------------------------

In [7]:
from pyspark.ml.feature import Tokenizer, HashingTF, IDF

tokenizer = Tokenizer(inputCol="review_text", outputCol="tokens")
hashingTF = HashingTF(inputCol="tokens", outputCol="rawFeatures")
idf = IDF(inputCol="rawFeatures", outputCol="features")

In [8]:
test_data = spark.read.csv(
    "hdfs://localhost:9000/user/hduser/CustomerReviews/test/test.csv", 
    header=True, 
    inferSchema=True, 
    sep=",", 
    quote='"'
)


                                                                                

In [9]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression

# Initialize classifier
lr = LogisticRegression(labelCol="label", featuresCol="features")

# Create pipeline
pipeline = Pipeline(stages=[tokenizer, hashingTF, idf, lr])


In [10]:
# Rename columns in test_data and train_data
test_data = test_data.withColumnRenamed("1", "rating") \
                     .withColumnRenamed("mens ultrasheer", "title") \
                     .withColumnRenamed("This model may be ok for sedentary types, but I'm active and get around alot in my job - consistently found these stockings rolled up down by my ankles! Not Good!! Solution: go with the standard compression stocking, 20-30, stock #114622. Excellent support, stays up and gives me what I need. Both pair of these also tore as I struggled to pull them up all the time. Good riddance/bad investment!", "review_text")

train_data = train_data.withColumnRenamed('3', 'rating') \
                       .withColumnRenamed('more like funchuck', 'title') \
                       .withColumnRenamed('"Gave this to my dad for a gag gift after directing ""Nunsense', 'review_part1') \
                       .withColumnRenamed('"" he got a reall kick out of it!"', 'review_part2')



In [11]:
print(train_data.columns)

['rating', 'title', 'review_part1', 'review_part2']


In [12]:
print(test_data.columns)

['rating', 'title', 'review_text']


In [13]:
from pyspark.sql.functions import concat_ws

train_data = train_data.withColumn("review_text", concat_ws(" ", train_data.review_part1, train_data.review_part2))


In [14]:
train_data = train_data.withColumnRenamed("rating", "label")

In [15]:
pip install textblob

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [36]:
from pyspark.sql.functions import udf
from pyspark.sql.types import FloatType
# If you can install TextBlob, uncomment the next line
# from textblob import TextBlob

# Define a UDF to compute sentiment score
def sentiment_score(text):
    return TextBlob(text).sentiment.polarity  # If you can't install TextBlob, replace this line with your sentiment analysis logic

# Register the UDF
sentiment_score_udf = udf(sentiment_score, FloatType())

# Assuming your DataFrame is named 'train_data'
# Add sentiment score as a new column to the DataFrame
train_data_with_sentiment = train_data.withColumn("sentiment_score", sentiment_score_udf(train_data['review_text']))

# Show the DataFrame with the new column
print(train_data_with_sentiment)


DataFrame[label: int, title: string, review_part1: string, review_part2: string, review_text: string, sentiment_score: float]


In [17]:
from pyspark.ml.feature import VectorAssembler

# Define stages of the pipeline
tokenizer = Tokenizer(inputCol="review_text", outputCol="tokens")
hashingTF = HashingTF(inputCol="tokens", outputCol="rawFeatures")
idf = IDF(inputCol="rawFeatures", outputCol="tfidf_features")
assembler = VectorAssembler(
    inputCols=["tfidf_features", "sentiment_score"],
    outputCol="final_features"
)
lr = LogisticRegression(labelCol="label", featuresCol="final_features")

# Create pipeline
pipeline = Pipeline(stages=[tokenizer, hashingTF, idf, assembler, lr])




In [35]:
train_data_with_sentiment.printSchema()

root
 |-- label: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- review_part1: string (nullable = true)
 |-- review_part2: string (nullable = true)
 |-- review_text: string (nullable = false)
 |-- sentiment_score: float (nullable = true)



In [19]:
pip install textblob

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [20]:
!pip install textblob

Defaulting to user installation because normal site-packages is not writeable


In [21]:
from textblob import TextBlob

In [22]:
from pyspark.sql.functions import udf
from pyspark.sql.types import FloatType

def sentiment_score(text):
    from textblob import TextBlob  # Importing inside the UDF
    return TextBlob(text).sentiment.polarity

sentiment_score_udf = udf(sentiment_score, FloatType())

In [24]:
import site
import textblob

# Get the list of all site-packages directories
site_packages = site.getsitepackages()

# Find the path where textblob is installed
textblob_path = textblob.__path__[0]

# Loop over all site-packages to find where textblob is installed
for site_package in site_packages:
    if textblob_path.startswith(site_package):
        print(f"TextBlob is installed in: {site_package}")


In [25]:
from pyspark import SparkConf, SparkContext

conf = SparkConf().setAppName("MyApp").set('spark.executorEnv.PYTHONPATH', '/path/to/python/interpreter')
sc = SparkContext.getOrCreate(conf)

In [26]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('basic_spark').getOrCreate()
sc = spark.sparkContext

data = [1, 2, 3, 4, 5]
distData = sc.parallelize(data)

result = distData.reduce(lambda a, b: a + b)
print("Result:", result)


Result: 15


In [27]:
import os
os.environ['PYSPARK_PYTHON'] = '/usr/bin/python3.10' 


In [28]:
from pyspark.sql import SparkSession

def python_version(x):
    import sys
    return sys.version

spark = SparkSession.builder.master("local[1]").appName('SparkApp').getOrCreate()
print(spark.sparkContext.runJob(spark.sparkContext.parallelize([1]), python_version))


['3', '.', '1', '0', '.', '1', '2', ' ', '(', 'm', 'a', 'i', 'n', ',', ' ', 'J', 'u', 'n', ' ', '1', '1', ' ', '2', '0', '2', '3', ',', ' ', '0', '5', ':', '2', '6', ':', '2', '8', ')', ' ', '[', 'G', 'C', 'C', ' ', '1', '1', '.', '4', '.', '0', ']']


In [29]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("Sentiment Analysis") \
    .getOrCreate()

df = spark.createDataFrame([(1, "I love learning"), (2, "I hate bugs")], ["id", "text"])

df_with_sentiment = df.withColumn("sentiment", sentiment_score_udf(df["text"]))
df_with_sentiment.show()


[Stage 10:>                                                         (0 + 1) / 1]

+---+---------------+---------+
| id|           text|sentiment|
+---+---------------+---------+
|  1|I love learning|      0.5|
|  2|    I hate bugs|     -0.8|
+---+---------------+---------+



                                                                                

In [30]:
def sentiment_score(text):
    try:
        from textblob import TextBlob  # Importing inside the UDF
        return TextBlob(text).sentiment.polarity
    except Exception as e:
        return str(e)  # Return the exception for debugging

sentiment_score_udf = udf(sentiment_score, FloatType())

In [31]:
from pyspark.sql.functions import udf
from pyspark.sql.types import FloatType

def sentiment_score(text):
    from textblob import TextBlob  # Explicit import
    return TextBlob(text).sentiment.polarity

sentiment_score_udf = udf(sentiment_score, FloatType())

In [38]:
from pyspark.sql.functions import udf
from pyspark.sql.types import FloatType
# If you can install TextBlob, uncomment the next line
# from textblob import TextBlob

# Define a UDF to compute sentiment score
def sentiment_score(text):
    return TextBlob(text).sentiment.polarity  # If you can't install TextBlob, replace this line with your sentiment analysis logic

# Register the UDF
sentiment_score_udf = udf(sentiment_score, FloatType())

# Assuming your DataFrame is named 'train_data'
# Add sentiment score as a new column to the DataFrame
train_data_with_sentiment = train_data.withColumn("sentiment_score", sentiment_score_udf(train_data['review_text']))

# Show the DataFrame with the new column
print(train_data_with_sentiment)

DataFrame[label: int, title: string, review_part1: string, review_part2: string, review_text: string, sentiment_score: float]


In [41]:
spark = SparkSession.builder \
    .config("spark.network.timeout", "10800s") \
    .config("spark.executor.heartbeatInterval", "3600s") \
    .master("local[1]") \
    .appName('SparkApp') \
    .getOrCreate()

ConnectionRefusedError: [Errno 111] Connection refused

In [39]:
from pyspark.ml.feature import VectorAssembler

# Define stages of the pipeline
tokenizer = Tokenizer(inputCol="review_text", outputCol="tokens")
hashingTF = HashingTF(inputCol="tokens", outputCol="rawFeatures")
idf = IDF(inputCol="rawFeatures", outputCol="tfidf_features")
assembler = VectorAssembler(
    inputCols=["tfidf_features", "sentiment_score"],
    outputCol="final_features"
)
lr = LogisticRegression(labelCol="label", featuresCol="final_features")

# Create pipeline
pipeline = Pipeline(stages=[tokenizer, hashingTF, idf, assembler, lr])




In [40]:
model = pipeline.fit(train_data_with_sentiment)

2023-09-20 21:14:50,897 WARN scheduler.DAGScheduler: Broadcasting large task binary with size 20.2 MiB
2023-09-20 21:27:53,692 WARN scheduler.DAGScheduler: Broadcasting large task binary with size 20.2 MiB
2023-09-20 21:27:55,993 WARN scheduler.DAGScheduler: Broadcasting large task binary with size 20.2 MiB
2023-09-20 21:30:06,520 WARN memory.MemoryStore: Not enough space to cache rdd_119_1 in memory! (computed 113.1 MiB so far)
2023-09-20 21:30:06,529 WARN storage.BlockManager: Persisting block rdd_119_1 to disk instead.
2023-09-20 21:30:37,945 WARN memory.MemoryStore: Not enough space to cache rdd_119_1 in memory! (computed 113.1 MiB so far)
2023-09-20 21:31:29,852 WARN memory.MemoryStore: Not enough space to cache rdd_119_2 in memory! (computed 113.1 MiB so far)
2023-09-20 21:31:29,853 WARN storage.BlockManager: Persisting block rdd_119_2 to disk instead.
2023-09-20 21:32:00,430 WARN memory.MemoryStore: Not enough space to cache rdd_119_2 in memory! (computed 113.1 MiB so far)
2023-

2023-09-20 21:47:27,951 WARN memory.MemoryStore: Not enough space to cache rdd_119_4 in memory! (computed 113.1 MiB so far)
2023-09-20 21:47:48,193 WARN memory.MemoryStore: Not enough space to cache rdd_119_5 in memory! (computed 113.0 MiB so far)
2023-09-20 21:48:04,975 WARN memory.MemoryStore: Not enough space to cache rdd_119_6 in memory! (computed 113.1 MiB so far)
2023-09-20 21:48:26,540 WARN memory.MemoryStore: Not enough space to cache rdd_119_7 in memory! (computed 113.1 MiB so far)
2023-09-20 21:48:47,370 WARN memory.MemoryStore: Not enough space to cache rdd_119_8 in memory! (computed 113.1 MiB so far)
2023-09-20 21:49:08,534 WARN memory.MemoryStore: Not enough space to cache rdd_119_9 in memory! (computed 113.1 MiB so far)
2023-09-20 21:49:30,408 WARN scheduler.DAGScheduler: Broadcasting large task binary with size 20.2 MiB
2023-09-20 21:49:33,315 WARN storage.BlockManager: Asked to remove block broadcast_47_piece3, which does not exist
2023-09-20 21:49:33,315 WARN storage.B

	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.foldLeft(TraversableOnce.scala:199)
	at scala.collection.TraversableOnce.foldLeft$(TraversableOnce.scala:192)
	at org.apache.spark.InterruptibleIterator.foldLeft(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.aggregate(TraversableOnce.scala:260)
	at scala.collection.TraversableOnce.aggregate$(TraversableOnce.scala:260)
	at org.apache.spark.InterruptibleIterator.aggregate(InterruptibleIterator.scala:28)
	at org.apache.spark.rdd.RDD.$anonfun$treeAggregate$3(RDD.scala:1230)
	at org.apache.spark.rdd.RDD$$Lambda$3228/69611501.apply(Unknown Source)
	at org.apache.spark.rdd.RDD.$anonfun$treeAggregate$5(RDD.scala:1231)
	at org.apache.spark.rdd.RDD$$Lambda$3229/154231387.apply(Unknown Source)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartiti

ConnectionRefusedError: [Errno 111] Connection refused