In [83]:
!pip install pyspark



In [84]:
# Import SparkSession
from pyspark.sql import SparkSession

In [85]:
# Initialize SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()

In [86]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [87]:
df = spark.read.text('/content/drive/MyDrive/dataset_project/2010 Movies Plot.txt')
df.show()

+--------------------+
|               value|
+--------------------+
|Michael and Madis...|
|While on the phon...|
|When a teenager l...|
|When a high-stake...|
|In March 2016, af...|
|The film closely ...|
|Expert bank robbe...|
|The film begins w...|
|On the day of the...|
|As a series of mi...|
|Dr. Rebecca Churc...|
|Paranormal invest...|
|Cory Weissman was...|
|The plot revolves...|
|Joseph Crone wake...|
|Bree decides to g...|
|Paramedic Nick Ma...|
|Detective Tyler B...|
|Mitch Nelson, a U...|
|left|thumb|The Ed...|
+--------------------+
only showing top 20 rows



In [89]:
# Convert DataFrame to RDD of strings
lines = df.rdd.map(lambda row: row.value)

In [79]:
# Tokenize the text into words
words = lines.flatMap(lambda line: line.split())

In [90]:
# Create a word frequency vector
word_freq = words.map(lambda word: (word.lower(), 1)).reduceByKey(lambda a, b: a + b)

In [91]:
# Filter out common words
common_words = set(["us","has","all", "they", "from",
"who","what","on","by","more","as","not","their","can","new","it","but","be","are","--",  "i","have","this","will","for","with","is","that","in","our","we","a","of",  "to","and","the","that's","or","make","do","you","at","it's","than","if",
"know","last","about","no","just","now","an","because","<p>we","why","we'll", "how","two","also","every","come","we've","year","over","get","take","one", "them","we're","need","want","when","like","most","-", "been","first","where","so","these","they're","good","would","there","should","-->",
"<!--","up","i'm","his","their","which","may","were","such","some","those","was", "here","she","he","its","her","his","don't","i've","what's","didn't","shouldn't", "(applause.)","let's","doesn't"])

In [82]:
filtered_word_freq = word_freq.filter(lambda pair: pair[0] not in common_words)

In [72]:
# Sort the word frequency vector by count
sorted_word_freq = filtered_word_freq.sortBy(lambda pair: pair[1], ascending=False)

In [73]:
sorted_word_freq.collect()

[('him', 2892),
 ('after', 1917),
 ('into', 1544),
 ('out', 1402),
 ('while', 1238),
 ('tells', 946),
 ('then', 887),
 ('before', 859),
 ('back', 812),
 ('had', 723),
 ('being', 659),
 ('finds', 656),
 ('him.', 629),
 ('find', 624),
 ('only', 549),
 ('other', 543),
 ('off', 540),
 ('go', 531),
 ('during', 522),
 ('takes', 522),
 ('goes', 501),
 ('her.', 498),
 ('house', 463),
 ('next', 461),
 ('police', 459),
 ('time', 451),
 ('having', 449),
 ('father', 448),
 ('reveals', 445),
 ('family', 444),
 ('through', 444),
 ('tries', 441),
 ('home', 441),
 ('help', 429),
 ('mother', 428),
 ('man', 428),
 ('gets', 409),
 ('kill', 402),
 ('down', 401),
 ('group', 400),
 ('however,', 398),
 ('asks', 391),
 ('begins', 388),
 ('another', 381),
 ('later,', 374),
 ('meets', 370),
 ('him,', 366),
 ('both', 366),
 ('her,', 356),
 ('becomes', 353),
 ('leaves', 353),
 ('car', 347),
 ('life', 346),
 ('later', 343),
 ('himself', 331),
 ('killed', 322),
 ('attempts', 320),
 ('says', 318),
 ('kills', 316),
 

In [None]:
# Stop SparkSession
spark.stop()