In [0]:
!pip install nltk

Collecting nltk
  Using cached nltk-3.9.1-py3-none-any.whl (1.5 MB)
Collecting regex>=2021.8.3
  Using cached regex-2024.9.11-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (781 kB)
Collecting tqdm
  Using cached tqdm-4.66.5-py3-none-any.whl (78 kB)
Installing collected packages: tqdm, regex, nltk
Successfully installed nltk-3.9.1 regex-2024.9.11 tqdm-4.66.5
You should consider upgrading via the '/local_disk0/.ephemeral_nfs/envs/pythonEnv-21dfce04-674b-4e48-9702-e9dffec05cc4/bin/python -m pip install --upgrade pip' command.[0m


In [0]:
import numpy as np

In [0]:
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Out[3]: True

In [0]:
# reading data 
movies = spark.read.format('csv').option('delimiter', '\t').option('header', 'false').load(r'dbfs:/FileStore/tables/movie_metadata.tsv')
data = movies.collect()

In [0]:
# converting the list to data frame
movie_df = movies.select('_c0','_c2').toDF('ID', 'Name')

In [0]:
# displaying the movie IDs and names
display(movie_df)

ID,Name
975900,Ghosts of Mars
3196793,Getting Away with Murder: The JonBenét Ramsey Mystery
28463795,Brun bitter
9363483,White Of The Eye
261236,A Woman in Flames
13696889,The Gangsters
18998739,The Sorcerer's Apprentice
10408933,Alexander's Ragtime Band
9997961,Contigo y aquí
2345652,City of the Dead


In [0]:
# getting the plot summaries
summaries = sc.textFile("/FileStore/tables/plot_summaries.txt")

In [0]:
# printing the first 10 lines of summary
summaries.take(10)

Out[8]: ["23890098\tShlykov, a hard-working taxi driver and Lyosha, a saxophonist, develop a bizarre love-hate relationship, and despite their prejudices, realize they aren't so different after all.",
 '31186339\tThe nation of Panem consists of a wealthy Capitol and twelve poorer districts. As punishment for a past rebellion, each district must provide a boy and girl  between the ages of 12 and 18 selected by lottery  for the annual Hunger Games. The tributes must fight to the death in an arena; the sole survivor is rewarded with fame and wealth. In her first Reaping, 12-year-old Primrose Everdeen is chosen from District 12. Her older sister Katniss volunteers to take her place. Peeta Mellark, a baker\'s son who once gave Katniss bread when she was starving, is the other District 12 tribute. Katniss and Peeta are taken to the Capitol, accompanied by their frequently drunk mentor, past victor Haymitch Abernathy. He warns them about the "Career" tributes who train intensively at special 

In [0]:
# define functions and stopwords for text cleaning
stop_words = set(stopwords.words('english'))

# function for text preprocessing
def preprocess_text(text):
    text = str(text).lower()
    if text:
        words = text.split()
        preporcessed_wordlist = [word for word in words if word.isalpha() and word not in stop_words]
        return ' '.join(preporcessed_wordlist)
    else:
        return ''

In [0]:
# getting a touple of ID and summary for the movie
summaries_preprocessed = summaries.map(lambda x : x.split('\t')).map(lambda x : (x[0], preprocess_text(x[1])))
summaries_preprocessed.collect()

Out[10]: [('23890098', 'taxi driver develop bizarre despite realize different'),
 ('31186339',
  'nation panem consists wealthy capitol twelve poorer punishment past district must provide boy girl ages selected lottery annual hunger tributes must fight death sole survivor rewarded fame first primrose everdeen chosen district older sister katniss volunteers take peeta son gave katniss bread district katniss peeta taken accompanied frequently drunk past victor haymitch warns tributes train intensively special academies almost always tv interview caesar peeta unexpectedly reveals love believing ploy gain audience may provide gifts discovers peeta meant televised games begin half tributes killed first katniss barely survives ignoring advice run away melee tempting supplies weapons strewn front structure called peeta forms uneasy alliance four later find katniss corner hiding nearby draws attention poisonous tracker jacker nest hanging katniss drops sleeping except killed hallucinating due 

In [0]:
summaries_df = summaries_preprocessed.toDF(["ID","Summary"])

In [0]:
display(summaries_df)

ID,Summary
23890098,taxi driver develop bizarre despite realize different
31186339,nation panem consists wealthy capitol twelve poorer punishment past district must provide boy girl ages selected lottery annual hunger tributes must fight death sole survivor rewarded fame first primrose everdeen chosen district older sister katniss volunteers take peeta son gave katniss bread district katniss peeta taken accompanied frequently drunk past victor haymitch warns tributes train intensively special academies almost always tv interview caesar peeta unexpectedly reveals love believing ploy gain audience may provide gifts discovers peeta meant televised games begin half tributes killed first katniss barely survives ignoring advice run away melee tempting supplies weapons strewn front structure called peeta forms uneasy alliance four later find katniss corner hiding nearby draws attention poisonous tracker jacker nest hanging katniss drops sleeping except killed hallucinating due tracker jacker katniss warned run away rue cares katniss couple days alliance gathered supplies katniss rue draw destroys stockpile setting mines planted around cato kills boy assigned guard katniss runs hears rue calling finds rue trapped releases tribute district throws spear dodges causing stab rue stomach katniss shoots dead comforts dying rue gathers arranges flowers around sparks riot district president snow summons seneca express displeasure way games turning since katniss peeta presented public haymitch able convince crane make rule change avoid inciting announced tributes district win upon hearing katniss searches peeta finds infected sword wound portrays deeply love gains gift announcer proclaims thing survivor needs peeta begs risk getting katniss promises falls heads clove ambushes pins clove district kills clove overhearing tormenting katniss killing spares katniss medicine keeping peeta girl district dies eating nightlock berries stole neither knew highly crane changes time day arena late night unleashes pack creatures speed things kill thresh force katniss peeta flee roof encounter katniss wounds cato arrow peeta hurls creatures katniss shoots cato spare prolonged peeta katniss apparently rule change allowing two winners suddenly peeta tells katniss shoot gives half commit hastily proclaimed victors hunger haymitch warns katniss made powerful enemies display peeta return district crane locked room bowl nightlock president snow considers
20663735,poovalli induchoodan sentenced six years prison life murdering son justice maranchery karunakara menon framed case manapally madhavan nambiar crony dysp sankaranarayanan take revenge idealist judge menon earlier given jail sentence manapally corruption achieved top rank indian civil service loses post manapally sudheeran enters list civil service learn flashback ramakrishnan son moopil nair actually killed six years passes manapally madhavan former state dead rage gross injustice meted thus destroying promising released induchoodan thwarts manapally pavithran performing funeral rituals nambiar many confrontations induchoodan henchmen induchoodan also falls love anuradha daughter mooppil justice menon wife returns back kerala stay appearance girl named indulekha claims daughter justice menon flatly refuses claim banishes forced circumstances instigation help manapally reluctantly come open induchoodan first thrashes upon knowing truth chandrabhanu accepts task protection capacity elder induchoodan decides marry indulekha good friend jayakrishnan induchoodan confrontation father prods accept mistake acknowledge parentage menon ultimately regrets goes confess next induchoodan returns indulekha found dead menon accused murdering whole act planned killing forces raman nair testify menon nandagopal maarar close friend induchoodan famous supreme court appears menon manages lay bare murder plot hidden intentions party menon judged innocent crime confronting pavithran promising retribution crime killing induchoodan returns shows remorse actions including believing innocence speaking menon suffers heart stroke passes manapally pavithran arrives poke fun induchoodan also tries carry postponed last rituals induchoodan interrupts ritual avenges death sister father severely injuring way back peaceful induchoodan accepts anuradha life
2231378,lemon drop kid new york city illegally touting horses florida several successful kid comes across woman intending bet lot kid convinces switch employing prefabricated unfortunately woman notorious gangster moose moran choice finishes dead last furious moran demands kid provide christmas kid make new kid decides return new york try come first tries girlfriend brainy baxter talk commitment kid quickly makes next visits local crime boss charley past falls charley serious tax trouble particularly care kid leaves establishment give kid notices cornerside santa claus thinking kid fashions santa suit begins collecting fails recognized passing remembers previous underhanded activity kid lands convicted collecting charity without license sentenced ten days jail kid learns scheme went short brainy arrives bail sets restarting santa time legitimate needs charity represent city kid receives key inspiration remembers nellie thursday kindly neighborhood denied entry retirement home jailed criminal past organizing new york swindlers surprised charmed apparent kid converts abandoned casino thursday home old small group elderly women makeshift amenities complete kid able receive city free kid compatriots dress santa claus position throughout others unaware kid plans keep money pay scheme huge netting overjoyed brainy decides leave job dancer look employer none brainy cheerfully informs seeing potential gold charley decides muscle reasoning nellie thursday home nellie thursday charley crew kidnap inhabitants move mansion kid learns returns home late night find home deserted money clued oversized oxford footprints kid friends pay charley charley reveals true nature scheme phone conversation moose accomplices angry move confront kid manages slip brainy tracks outside voices disgust days stewing kid surprised meet escaped decides recover sneaking home guise elderly finds charley crew moving time secure using heightened activity kid enters office confronts brief kid overpowers charley makes narrowly avoiding thugs charley sent ensuing chaos allows brainy others later kid returns original nellie thursday home meet moose moran deal appears jeopardy moran arrives charley demands kid reimburse would leave little kid turns tables hitting revealing hidden casino mainly escaped old kid friends hold gangsters police initiate moran charley arrested judge sentenced kid earlier warns eye kid assures necessary attention lie going become main event begins husband free joyously reunites
595909,adventist church pastor michael wife two daughter azaria camping holiday baby sleeping family enjoying barbecue fellow campers cry lindy returns tent check azaria certain sees dingo something mouth running discovers infant everyone joins forces search without assumed lindy saw animal carrying subsequent inquest rules account events tide public opinion soon turns lindy seems accepting disaster befallen gossip begins swell soon accepted statements beliefs widely practised media report rumour name azaria means public quick believe decapitated baby pair scissors part bizarre religious officials find new forensics lot circumstantial small wooden coffin michael uses receptacle packs reopen eventually lindy charged seven months ignores advice play sympathy appears emotionless convincing onlookers guilty crime trial faith religion belief wife stumbles suggesting concealing october lindy found guilty sentenced life imprisonment hard michael found guilty accessory given suspended three years searching body english tourist fell police discover small item clothing identified jacket lindy insisted azaria wearing recovered early immediately released case reopened convictions chamberlains
5272176,president way give traveling man shows reporter tries ask member secret service president enters shot man main alex thomas grazed bullet hits shooter gunned alex secret service president dies kate crawford investigative starts asking questions anyone questions goes alex house tell head thomas sees men hiding throws kate water dives thomas jumps water kill two hitmen third hitman drives inform boss able link hitmen man called jack baldwin agent thomas secret service members attack location jack baldwin escapes later caught thomas crawford suspicious vaughan stevens agent previous links reviewing film assassination thomas discovers stevens handed assassin gun thomas leaves find stevens kate stays thomas arrives home finds thomas sees car leaving receives call baldwin kate attacked baldwin thomas arrives kills thomas arrives first home see car left house pulling thomas discovers first lady wanted husband killed due fact president unfaithful weeks later kate thomas thomas says still know wanted president
1952976,film opens young stands outside school waiting flash forward see dahlia midst bitter mediation kyle custody cecilia kyle wants cecilia live closer apartment jersey dahlia wants move roosevelt found good kyle threatens sue full custody feels distance also claims dahlia dahlia cecilia see apartment complex roosevelt blocks new superintendent dilapidated building veeck manager murray cecilia sneaks roof finds hello kitty backpack near large water leave bag murray promises cecilia one claims disliked wants desperately live dahlia agrees move shortly bedroom ceiling begins leak dark source apartment rimsky family lived month dahlia enters finds dark water flowing every walls finds family portrait former girl dahlia complains veeck murray former little despite insistence dahlia soon dreams little girl appears cecilia returning visit appearance changes every time dahlia looks looks like girl portrait cecilia started according new teacher fitting spending much time imaginary named psychologist dahlia declines tells cecilia ignore although veeck said dahlia discovers hello kitty backpack laundry cecilia later finds name backpack reads shoddily patched leaks cecilia appears get fight appears control hand taken bathroom passes dark water gushes toilets meeting kyle picks takes dahlia breaks find daughter strange lead roof ladder water finds police discover father thought mother thought girl girl left alone abandoned apartment fell water veeck left aware refused fix water problem plaguing veeck arrested murray dahlia agrees move closer kyle shared custody go dahlia cecilia taking girl hooded bathrobe comes wanting dahlia read hears voices realizes girl natasha begs dahlia leave dahlia rushes bathroom save natasha locks cecilia shower compartment holds dahlia pleads promising mother natasha lets cecilia go floods causing dahlia die spirits shown walking kyle picks cecilia police weeks two go back pick rest cecilia flashback mother looking pictures ghost braids hair comforts always momentarily horrified malfunction weird behavior perhaps noticing hair finally takes apartment jersey
24225279,story begins young jewish completing senior year high small neighborhood brooklyn falling apart one traditions keeping neighborhood newly arrived miss lombardo grew neighborhood returned sing one cold christmas miss lombardo leaving neighborhood party young man hails attempts mug self bites hand release grip screams pain quickly making cab driver jokes starting meter first day miss lombardo runs difficulty students uninterested one student named dominic gets scolded bringing stolen watches school grounds putting feet day sing leader miss lombardo recognizes dominic mugger bandage hand decides blackmail leader senior class along rightfully school kids work hard plan sing hannah dominic clash along way hannah uses traditional sing planning strategies dominic wants introduce flavor youth order put dominic hannah miss lombardo suggests hannah accompany dominic local two equally hesitant hannah agrees terms end hannah uses dominic make ex due hannah dominic start seeing different dominic accompanies hannah walk home two share romantic two finally uniting getting education informs school close doors forever end semester enough resources complete fuels kids work even harder productions neighborhood comes together even help finance despite school things starting look dominic reluctantly accompanies brother robbery sole source income already risk failure due upcoming classmate saw dominic standing outside diner time crime informed hannah hannah confronts dominic promises get money back steals money back brother returns restoring faith recent events discouraged dominic fulfilling leader duties skipping moment great need main performer falls dominic steps save sheds demeanor exceeds underclassmen seniors perform sold end hannah makes moving speech motivating community rejoice always remember despite compromising completed successful sing proved
2462689,infuriated told write one final column laid newspaper ann mitchell prints letter fictional unemployed threatening suicide christmas eve protest note causes sensation competition suspects fraud starts newspaper editor rehires mitchell comes scheme hiding fictional nature exploiting sensation caused fake letter boost demands bonus equal reviewing number derelicts shown paper claiming penned original suicide mitchell editor henry connell hire john willoughby former baseball player tramp need money repair injured play john mitchell starts pen article series elaborating ideas disregard people willoughby gets new suit plush hotel suite tramp friend launches extended diatribe lots heels incessantly focus getting money willoughby hired give radio guided mitchell promised week write paid norton arnold willoughby turns bribe admit whole thing publicity gives dashes countryside ride playing harmonica ocarina show john doe recognized brought city met gives monologue inspired start local john doe john doe philosophy spreads across developing broad grassroots movement whose simple slogan better far altruistic norton plans channel support doe support national political culmination norton instructed mitchell write speech willoughby announces foundation new political party endorses norton presidential come believe john doe philosophy realizes tries expose first stymied attempts talk mind nationwide radio audience rally instead reading prepared exposed fake claims like everyone staff frustrated willoughby intends commit suicide jumping roof city hall christmas indicated original john doe intervention mitchell followers john doe clubs persuades renege threat kill point reference jesus christ historical already died sake film ends connell turning norton try lick
20532852,line people drool window shop market butcher buzz short series gags ensues buzz dishonestly since woody broke sneaks gets thrown way woody collides bottle invisible ink turns partially buzz see parts somewhat gruesome thinks sweeps trap door get rid woody realizes douses rest ink order pose


In [0]:
n_total = summaries_df.count()
print(n_total)

42306


In [0]:
from pyspark.sql.functions import explode, split
word_wise_summaryRDD = summaries_df.select(summaries_df.ID, explode(split(summaries_df["Summary"], " "))).rdd

In [0]:
wc_summary_pair = word_wise_summaryRDD.map(lambda x: ((x[0], x[1]),1)).reduceByKey(lambda x,y: x+y)
wc_summary_pair.collect()

Out[15]: [(('18459346', 'politicians'), 1),
 (('4295195', 'take'), 2),
 (('2617036', 'constantly'), 1),
 (('12149364', 'summoned'), 1),
 (('11922721', 'want'), 1),
 (('5429460', 'working'), 2),
 (('225535', 'says'), 1),
 (('27863441', 'attempt'), 1),
 (('17215446', 'areas'), 1),
 (('25342113', 'kidnapped'), 1),
 (('4744487', 'headquarters'), 1),
 (('780873', 'sufficient'), 1),
 (('258732', 'got'), 1),
 (('32496091', 'ends'), 1),
 (('2164332', 'koala'), 1),
 (('11058226', 'simultaneously'), 1),
 (('11216972', 'accused'), 1),
 (('6563473', 'agonized'), 1),
 (('852742', 'tensions'), 1),
 (('12108808', 'killing'), 1),
 (('5387833', 'sing'), 1),
 (('35954378', 'convinces'), 1),
 (('1055198', 'deceased'), 1),
 (('13149824', 'lets'), 1),
 (('3303044', 'finds'), 1),
 (('27371888', 'plotting'), 1),
 (('10980875', 'attention'), 1),
 (('13040946', 'chinese'), 2),
 (('10644072', 'audience'), 1),
 (('10644072', 'bless'), 2),
 (('17124781', 'kills'), 3),
 (('10016997', 'viper'), 2),
 (('8153846', 'p

In [0]:
tf_word = wc_summary_pair.map(lambda x : (x[0][1], (x[0][0], x[1])))
tf_word.collect()

Out[16]: [('david', ('1335380', 2)),
 ('black', ('8471210', 1)),
 ('high', ('10799612', 1)),
 ('comes', ('2408359', 1)),
 ('help', ('20631892', 2)),
 ('ignores', ('1591973', 1)),
 ('part', ('17371197', 1)),
 ('ambulance', ('17060199', 1)),
 ('see', ('9031450', 1)),
 ('advantage', ('1520023', 1)),
 ('word', ('14582951', 2)),
 ('except', ('33942920', 1)),
 ('leader', ('2322506', 1)),
 ('actually', ('19332675', 1)),
 ('front', ('27368886', 1)),
 ('heard', ('5925279', 1)),
 ('captors', ('26499399', 1)),
 ('behind', ('7429667', 1)),
 ('journalist', ('2311219', 1)),
 ('family', ('2311219', 1)),
 ('class', ('20120996', 2)),
 ('makes', ('20533021', 1)),
 ('comedy', ('8262659', 1)),
 ('small', ('5239234', 1)),
 ('realizes', ('6593182', 1)),
 ('link', ('6593182', 1)),
 ('works', ('3257958', 1)),
 ('passionate', ('14983036', 1)),
 ('murderer', ('15052685', 1)),
 ('shoot', ('8204853', 1)),
 ('jumping', ('8204853', 1)),
 ('said', ('204774', 1)),
 ('decide', ('26283768', 1)),
 ('unable', ('265033', 

In [0]:
n_word_in_docs = wc_summary_pair.map(lambda x: (x[0][1], 1)).reduceByKey(lambda x, y: x+y)
n_word_in_docs.collect()

Out[17]: [('october', 201),
 ('completing', 216),
 ('two', 12863),
 ('eliminate', 319),
 ('notable', 133),
 ('continues', 2271),
 ('hearing', 1153),
 ('settles', 209),
 ('minutes', 568),
 ('boy', 2724),
 ('named', 6303),
 ('telephones', 99),
 ('commit', 775),
 ('give', 3989),
 ('starts', 4774),
 ('young', 8966),
 ('trying', 4225),
 ('story', 6791),
 ('shell', 144),
 ('hours', 796),
 ('crashes', 817),
 ('returns', 5958),
 ('officer', 1998),
 ('forces', 2251),
 ('whose', 3112),
 ('chance', 1728),
 ('offers', 2890),
 ('scene', 2882),
 ('away', 6562),
 ('eventually', 5620),
 ('lodges', 20),
 ('branding', 17),
 ('current', 590),
 ('way', 6634),
 ('collapse', 194),
 ('dressed', 759),
 ('heavily', 438),
 ('unknown', 1100),
 ('mob', 484),
 ('term', 177),
 ('son', 4934),
 ('student', 1511),
 ('rage', 312),
 ('intentions', 277),
 ('salesgirl', 13),
 ('meets', 6454),
 ('eyes', 954),
 ('efforts', 779),
 ('shape', 191),
 ('long', 3339),
 ('able', 3305),
 ('filled', 756),
 ('horrible', 222),
 ('ston

In [0]:
idf = n_word_in_docs.map(lambda x : (x[0], np.log(n_total/x[1])))
idf.collect()

Out[18]: [('october', 5.34937929088802),
 ('completing', 5.277405791262931),
 ('two', 1.19057394686838),
 ('eliminate', 4.887493096162251),
 ('notable', 5.762335070725342),
 ('continues', 2.924708656841537),
 ('hearing', 3.6025616786780366),
 ('settles', 5.310349946982285),
 ('minutes', 4.310562780225944),
 ('boy', 2.7428275316776927),
 ('named', 1.9039032094339856),
 ('telephones', 6.057564348812505),
 ('commit', 3.9998211695937487),
 ('give', 2.361388347041689),
 ('starts', 2.18174439204832),
 ('young', 1.5514892742317108),
 ('trying', 2.3039096591558215),
 ('story', 1.8293307138333041),
 ('shell', 5.682870899371095),
 ('hours', 3.9730850131027124),
 ('crashes', 3.947045104087093),
 ('returns', 1.9601940656738681),
 ('officer', 3.0527822397385966),
 ('forces', 2.9335543580403636),
 ('whose', 2.6096633136488134),
 ('chance', 3.197964249583095),
 ('offers', 2.6836724178406177),
 ('scene', 2.6864444223876283),
 ('away', 1.8636334854260486),
 ('eventually', 2.018597256059359),
 ('lodges'

In [0]:
tf_idf = tf_word.join(idf)
tf_idf_w = tf_idf.map(lambda x: (x[0], (x[1][0][0], x[1][0][1], x[1][1],(x[1][0][1]*x[1][1]))))
tf_idf_w.collect()

Out[19]: [('appears', ('2940516', 1, 2.602937907994904, 2.602937907994904)),
 ('appears', ('22541211', 1, 2.602937907994904, 2.602937907994904)),
 ('appears', ('645071', 1, 2.602937907994904, 2.602937907994904)),
 ('appears', ('3610464', 1, 2.602937907994904, 2.602937907994904)),
 ('appears', ('557036', 1, 2.602937907994904, 2.602937907994904)),
 ('appears', ('17432044', 1, 2.602937907994904, 2.602937907994904)),
 ('appears', ('6465014', 1, 2.602937907994904, 2.602937907994904)),
 ('appears', ('3210534', 1, 2.602937907994904, 2.602937907994904)),
 ('appears', ('577618', 3, 2.602937907994904, 7.808813723984712)),
 ('appears', ('442675', 1, 2.602937907994904, 2.602937907994904)),
 ('appears', ('167857', 2, 2.602937907994904, 5.205875815989808)),
 ('appears', ('142435', 2, 2.602937907994904, 5.205875815989808)),
 ('appears', ('4292601', 1, 2.602937907994904, 2.602937907994904)),
 ('appears', ('1908968', 1, 2.602937907994904, 2.602937907994904)),
 ('appears', ('4921024', 1, 2.6029379079949

In [0]:
single_term_queries = sc.textFile("dbfs:/FileStore/tables/single_queries.txt").collect()
single_term_queries

Out[20]: ['romance', 'comedy', 'action', 'thriller', 'horror']

In [0]:
for term_query in single_term_queries:
    print(f"{term_query}:")
    top_term_movies = tf_idf_w.filter(lambda x : x[0] == term_query)
    sorted_top_term_movies = top_term_movies.sortBy(lambda x : -x[1][3])
    sorted_top_term_movies_mapped = sorted_top_term_movies.map(lambda x : (x[1][0], float(x[1][3])))
    sorted_top_term_movies_mapped_top_10 = sorted_top_term_movies_mapped.take(10)

    top_movies_rdd = sc.parallelize(sorted_top_term_movies_mapped_top_10)
    top_movies_df = top_movies_rdd.toDF(['ID', 'tfidf_Weight'])

    result_df = movie_df.join(top_movies_df, movie_df.ID == top_movies_df.ID, "inner")

    result_df.select('Name', 'tfidf_Weight').show(truncate=False)

romance:
+------------------------------+------------------+
|Name                          |tfidf_Weight      |
+------------------------------+------------------+
|4 Romance                     |20.21667477889857 |
|The manor of Araucaima        |12.130004867339144|
|The English Patient           |12.130004867339144|
|Future Cops                   |12.130004867339144|
|Second Fiddle                 |12.130004867339144|
|The Great Outdoors            |12.130004867339144|
|Two Much                      |8.08666991155943  |
|Beyond the Valley of the Dolls|8.08666991155943  |
|Restless                      |8.08666991155943  |
|Rebecca of Sunnybrook Farm    |8.08666991155943  |
+------------------------------+------------------+

comedy:
+------------------------------------+------------------+
|Name                                |tfidf_Weight      |
+------------------------------------+------------------+
|Where the Truth Lies                |18.383600742873885|
|General Motors 50th A

In [0]:
query_rdd = sc.textFile("/FileStore/tables/multi_queries.txt")

In [0]:
multi_term_queries = query_rdd.collect()

In [0]:
movie_tf_idf_rdd = tf_idf_w.map(lambda x: (x[0], (x[1][0], x[1][3]))).cache()

for multi_query in multi_term_queries:
    print(f"{multi_query}:")

    query_terms_rdd = sc.parallelize(multi_query.split(' '))

    term_frequencies = (query_terms_rdd.map(lambda term: (term, 1))            
                    .reduceByKey(lambda x, y: x + y))

    term_movie_joined_rdd = term_frequencies.join(movie_tf_idf_rdd)

    cosine_magnitude_dot_prod = (term_movie_joined_rdd
                 .map(lambda x: (x[1][1][0], (x[1][0] * x[1][1][1], x[1][0] ** 2, x[1][1][1] ** 2)))
                 .reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1], x[2] + y[2])))
    
    cosine_similarity_rdd = cosine_magnitude_dot_prod.map(lambda x: (x[0], float(1 - x[1][0] / (np.sqrt(x[1][1]) * np.sqrt(x[1][2])))))

    sorted_cosine_similarity_rdd = cosine_similarity_rdd.sortBy(lambda x: -x[1])

    cosine_similarity_df = sorted_cosine_similarity_rdd.toDF(["ID", "cosineSimilarity"])

    final_movie_similarity_df = movie_df.join(cosine_similarity_df, movie_df.ID == cosine_similarity_df.ID, "inner")

    final_movie_similarity_df.select('Name', 'cosineSimilarity').show(10, False)

romance love drama:
+----------------------+-------------------+
|Name                  |cosineSimilarity   |
+----------------------+-------------------+
|Clannad               |0.25514990715762176|
|Mister Ten Per Cent   |0.21985788517879812|
|Mannar Mathai Speaking|0.21985788517879812|
|The manor of Araucaima|0.2091440084358399 |
|Lust, Caution         |0.18726614492582905|
|The Drama Kids        |0.18726614492582905|
|Leader                |0.18726614492582905|
|Grandmaster           |0.18726614492582905|
|Hannah and Her Sisters|0.172660116704492  |
|The Perfect Man       |0.172660116704492  |
+----------------------+-------------------+
only showing top 10 rows

action adventure thriller:
+-------------------------------------+--------------------+
|Name                                 |cosineSimilarity    |
+-------------------------------------+--------------------+
|Kranti Kshetra                       |0.026480520144794406|
|Jeeva                                |0.026480520144