In [1]:
import pip
pip.main(['install', 'findspark'])

Please see https://github.com/pypa/pip/issues/5599 for advice on fixing the underlying issue.
To avoid this problem you can invoke Python with '-m pip' instead of running pip directly.


0

In [2]:
import findspark
findspark.init()
findspark.find()

'c:\\Users\\ammar\\anaconda3\\Lib\\site-packages\\pyspark'

In [3]:
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
sc=SparkContext.getOrCreate()
spark=SparkSession(sc)

In [4]:
#Task1
my_list = list(range(1, 11))

print("MyList:", my_list)

rdd = spark.sparkContext.parallelize(my_list)

squared_rdd = rdd.map(lambda x: x**2)

squared_list = squared_rdd.collect()
print("Squared List:", squared_list)



MyList: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
Squared List: [1, 4, 9, 16, 25, 36, 49, 64, 81, 100]


In [5]:
#Task2
import random
my_list_2 = [random.randint(1, 100) for _ in range(20)]
print("MyList 2:", my_list_2)

rdd_2 = spark.sparkContext.parallelize(my_list_2)

divisible_by_5_rdd = rdd_2.filter(lambda x: x % 5 == 0)


divisible_by_5_list = divisible_by_5_rdd.collect()
print("Numbers Divisible by 5:", divisible_by_5_list)


MyList 2: [53, 35, 33, 30, 55, 11, 90, 3, 40, 84, 78, 35, 84, 43, 11, 16, 69, 47, 17, 38]
Numbers Divisible by 5: [35, 30, 55, 90, 40, 35]


In [6]:
# Task3
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.sql.functions import *

# Create a SparkSession
spark = SparkContext.getOrCreate()
spark = SparkSession(spark)

# Load the user comments file
comments = spark.read.csv("user_comment.csv", header=True)

## CSV file is made from provided manual

In [7]:
# find comments given by each user
from pyspark.sql.functions import collect_list
user_comments = comments.groupBy("UserName").agg(collect_list("Comment").alias("Comments"))
user_comments.show(truncate=False)

+--------+-----------------------------------------+
|UserName|Comments                                 |
+--------+-----------------------------------------+
|Ali45   |[Good !!!, I will definitely visit again]|
|Aliya153|[Your website is superb]                 |
|Sara2   |[You need to work on your website design]|
+--------+-----------------------------------------+



In [8]:
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

# Define a UDF to count the number of long comments
def count_long_comments(comments):
    return len([char for char in comments if len(char) > 20])

# make the UDF available in Spark
count_long_comments_udf = udf(count_long_comments, IntegerType())

# Apply the UDF to the DataFrame
long_comments = user_comments.withColumn("NumLongComments", count_long_comments_udf(user_comments["Comments"]))

# Collect the data and print each row
long_comments.show(truncate=False)

+--------+-----------------------------------------+---------------+
|UserName|Comments                                 |NumLongComments|
+--------+-----------------------------------------+---------------+
|Ali45   |[Good !!!, I will definitely visit again]|1              |
|Aliya153|[Your website is superb]                 |1              |
|Sara2   |[You need to work on your website design]|1              |
+--------+-----------------------------------------+---------------+



In [9]:
# Count the number of UserNames starting with each English alphabet
user_names_by_first_letter = comments.groupBy(substring(col("UserName"), 1, 1).alias("FirstLetter")).count()
user_names_by_first_letter.show(truncate=False)

+-----------+-----+
|FirstLetter|count|
+-----------+-----+
|A          |3    |
|S          |1    |
+-----------+-----+



In [10]:
# Find the user who has given the maximum number of comments
most_comments_user = user_comments.orderBy(desc(size("Comments"))).first()
username=most_comments_user.UserName
print ("User with most comments:", username)

User with most comments: Ali45


In [11]:
# Task4
# Remove stop words from the comments of the users

# get stopwords list from nltk and store in 'stopwords.txt'
from nltk.corpus import stopwords

stopwords_list = stopwords.words('english')

with open('stop_words.txt', 'w') as filehandle:
    filehandle.writelines("%s\n" % word for word in stopwords_list)
    
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col
from pyspark.sql.types import StringType

# Create a Spark session
spark = SparkSession.builder.appName("RemoveStopWords").getOrCreate()

# Read the CSV file into a DataFrame (replace with your actual CSV file path)
csv_file = "user_comment.csv"
user_comments = spark.read.csv(csv_file, header=True)

# Read the text file containing stop words
stop_words_file = "stop_words.txt"
with open(stop_words_file, "r") as f:
    stop_words = f.read().splitlines()

# Broadcast the stop words
broadcast_stop_words = spark.sparkContext.broadcast(stop_words)

# Define a function to remove stop words
def remove_stop_words(comment):
    stop_words = broadcast_stop_words.value
    return " ".join([word for word in comment.split() if word.lower() not in stop_words])

# Register the UDF
remove_stop_words_udf = udf(remove_stop_words, StringType())

# Apply the UDF to the 'Comment' column
user_comments_without_stop_words = user_comments.withColumn("CleanedComment", remove_stop_words_udf(col("Comment")))

# Show the result
user_comments_without_stop_words.show(truncate=False)

# Stop the Spark session
spark.stop()


+--------+---------------------------------------+------------------------+
|UserName|Comment                                |CleanedComment          |
+--------+---------------------------------------+------------------------+
|Aliya153|Your website is superb                 |website superb          |
|Sara2   |You need to work on your website design|need work website design|
|Ali45   |Good !!!                               |Good !!!                |
|Ali45   |I will definitely visit again          |definitely visit        |
+--------+---------------------------------------+------------------------+

