Task is to find all pairs of words that co-occur within paragraphs, but count each pair at most once per paragraph.

In [18]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.window import Window as W

spark=SparkSession.builder.getOrCreate()

cartoon_with_filename = spark.read.csv("/home/jovyan/host/Desktop/All-seasons.csv",inferSchema=True,header=True)
cartoon_df = cartoon_with_filename.dropna()

cartoon_df.show()

+------+-------+---------------+--------------------+
|Season|Episode|      Character|                Line|
+------+-------+---------------+--------------------+
|    10|      1|           Stan|You guys, you guy...|
|    10|      1|           Kyle|Going away? For h...|
|    10|      1|           Stan|            Forever.|
|    10|      1|           Chef|     I'm sorry boys.|
|    10|      1|           Stan|Chef said he's be...|
|    10|      1|           Chef|                Wow!|
|    10|      1|  Mrs. Garrison|Chef?? What kind ...|
|    10|      1|           Chef|What's the meanin...|
|    10|      1|  Mrs. Garrison|I hope you're mak...|
|    10|      1|        Cartman|I'm gonna miss hi...|
|    10|      1|           Stan|Dude, how are we ...|
|    10|      1|Mayor McDaniels|And we will all m...|
|    10|      1|          Jimbo|            Bye-bye!|
|    10|      1|         Gerald|           Good-bye!|
|    10|      1|     Mr. Mackey|            So long!|
|    10|      1|          A 

In [19]:
filename=(
    cartoon_df
    .withColumn("Filename",
F.lit("Dialogue.csv")))
filename.show()
                           

+------+-------+---------------+--------------------+------------+
|Season|Episode|      Character|                Line|    Filename|
+------+-------+---------------+--------------------+------------+
|    10|      1|           Stan|You guys, you guy...|Dialogue.csv|
|    10|      1|           Kyle|Going away? For h...|Dialogue.csv|
|    10|      1|           Stan|            Forever.|Dialogue.csv|
|    10|      1|           Chef|     I'm sorry boys.|Dialogue.csv|
|    10|      1|           Stan|Chef said he's be...|Dialogue.csv|
|    10|      1|           Chef|                Wow!|Dialogue.csv|
|    10|      1|  Mrs. Garrison|Chef?? What kind ...|Dialogue.csv|
|    10|      1|           Chef|What's the meanin...|Dialogue.csv|
|    10|      1|  Mrs. Garrison|I hope you're mak...|Dialogue.csv|
|    10|      1|        Cartman|I'm gonna miss hi...|Dialogue.csv|
|    10|      1|           Stan|Dude, how are we ...|Dialogue.csv|
|    10|      1|Mayor McDaniels|And we will all m...|Dialogue.

In [20]:
file_df=filename.select("Filename","Line")
file_df.show(truncate=False)

+------------+-----------------------------------------------------------------------------------------+
|Filename    |Line                                                                                     |
+------------+-----------------------------------------------------------------------------------------+
|Dialogue.csv|You guys, you guys! Chef is going away.                                                  |
|Dialogue.csv|Going away? For how long?                                                                |
|Dialogue.csv|Forever.                                                                                 |
|Dialogue.csv|I'm sorry boys.                                                                          |
|Dialogue.csv|Chef said he's been bored, so he joining a group called the Super Adventure Club.        |
|Dialogue.csv|Wow!                                                                                     |
|Dialogue.csv|Chef?? What kind of questions do you thin

In [21]:
section=(
    W
    .orderBy("Filename")
)

southpark_df=(
    file_df
    .withColumn("Section",F.row_number()
                .over(section))
)
southpark_df.show()
                            

+------------+--------------------+-------+
|    Filename|                Line|Section|
+------------+--------------------+-------+
|Dialogue.csv|You guys, you guy...|      1|
|Dialogue.csv|Going away? For h...|      2|
|Dialogue.csv|            Forever.|      3|
|Dialogue.csv|     I'm sorry boys.|      4|
|Dialogue.csv|Chef said he's be...|      5|
|Dialogue.csv|                Wow!|      6|
|Dialogue.csv|Chef?? What kind ...|      7|
|Dialogue.csv|What's the meanin...|      8|
|Dialogue.csv|I hope you're mak...|      9|
|Dialogue.csv|I'm gonna miss hi...|     10|
|Dialogue.csv|Dude, how are we ...|     11|
|Dialogue.csv|And we will all m...|     12|
|Dialogue.csv|            Bye-bye!|     13|
|Dialogue.csv|           Good-bye!|     14|
|Dialogue.csv|            So long!|     15|
|Dialogue.csv|      So long, Chef!|     16|
|Dialogue.csv|     Good-bye, Chef!|     17|
|Dialogue.csv|Good-bye, Chef! H...|     18|
|Dialogue.csv|        Good-bye! ..|     19|
|Dialogue.csv|Draw two card, fa.

In [33]:
separated=southpark_df.withColumn("Separated words", F.explode(F.split(F.col("Line"),r'\W+')))
separated.show()

+------------+--------------------+-------+---------------+
|    Filename|                Line|Section|Separated words|
+------------+--------------------+-------+---------------+
|Dialogue.csv|You guys, you guy...|      1|            You|
|Dialogue.csv|You guys, you guy...|      1|           guys|
|Dialogue.csv|You guys, you guy...|      1|            you|
|Dialogue.csv|You guys, you guy...|      1|           guys|
|Dialogue.csv|You guys, you guy...|      1|           Chef|
|Dialogue.csv|You guys, you guy...|      1|             is|
|Dialogue.csv|You guys, you guy...|      1|          going|
|Dialogue.csv|You guys, you guy...|      1|           away|
|Dialogue.csv|You guys, you guy...|      1|               |
|Dialogue.csv|Going away? For h...|      2|          Going|
|Dialogue.csv|Going away? For h...|      2|           away|
|Dialogue.csv|Going away? For h...|      2|            For|
|Dialogue.csv|Going away? For h...|      2|            how|
|Dialogue.csv|Going away? For h...|     

In [41]:
cartoon=(
    separated
    .filter(F.col("Separated words")!="")
)
cartoon.show()

+------------+--------------------+-------+---------------+
|    Filename|                Line|Section|Separated words|
+------------+--------------------+-------+---------------+
|Dialogue.csv|You guys, you guy...|      1|            You|
|Dialogue.csv|You guys, you guy...|      1|           guys|
|Dialogue.csv|You guys, you guy...|      1|            you|
|Dialogue.csv|You guys, you guy...|      1|           guys|
|Dialogue.csv|You guys, you guy...|      1|           Chef|
|Dialogue.csv|You guys, you guy...|      1|             is|
|Dialogue.csv|You guys, you guy...|      1|          going|
|Dialogue.csv|You guys, you guy...|      1|           away|
|Dialogue.csv|Going away? For h...|      2|          Going|
|Dialogue.csv|Going away? For h...|      2|           away|
|Dialogue.csv|Going away? For h...|      2|            For|
|Dialogue.csv|Going away? For h...|      2|            how|
|Dialogue.csv|Going away? For h...|      2|           long|
|Dialogue.csv|            Forever.|     

In [53]:
orders=(
    W
    .partitionBy("Section")
    .orderBy("Section")
)


In [54]:
cartoon=(
    cartoon_df
    .withColumn("Prev_word",F.lag("Separated words")
                .over(orders))
    .filter(F.col("Prev_word").isNotNull())
    .withColumn("Pairs",F.concat_ws(" ",F.col("Prev_word"),F.col("Separated words")))


)
cartoon.select("Section","Separated words","Prev_word","Pairs")
cartoon.show()

+------------+--------------------+-------+---------------+-----------+------------------+
|    Filename|                Line|Section|Separated words|  Prev_word|             Pairs|
+------------+--------------------+-------+---------------+-----------+------------------+
|Dialogue.csv|You guys, you guy...|      1|             is|       Chef|           Chef is|
|Dialogue.csv|You guys, you guy...|      1|          going|         is|          is going|
|Dialogue.csv|You guys, you guy...|      1|           away|      going|        going away|
|Dialogue.csv|Chef said he's be...|      5|          bored|       been|        been bored|
|Dialogue.csv|Chef said he's be...|      5|             so|      bored|          bored so|
|Dialogue.csv|Chef said he's be...|      5|             he|         so|             so he|
|Dialogue.csv|Chef said he's be...|      5|        joining|         he|        he joining|
|Dialogue.csv|Chef said he's be...|      5|              a|    joining|         joining a|

In [55]:
final=cartoon.select("Filename","Section","Pairs")
final.show()

+------------+-------+------------------+
|    Filename|Section|             Pairs|
+------------+-------+------------------+
|Dialogue.csv|      1|           Chef is|
|Dialogue.csv|      1|          is going|
|Dialogue.csv|      1|        going away|
|Dialogue.csv|      5|        been bored|
|Dialogue.csv|      5|          bored so|
|Dialogue.csv|      5|             so he|
|Dialogue.csv|      5|        he joining|
|Dialogue.csv|      5|         joining a|
|Dialogue.csv|      5|           a group|
|Dialogue.csv|      5|      group called|
|Dialogue.csv|      5|        called the|
|Dialogue.csv|      5|         the Super|
|Dialogue.csv|      5|   Super Adventure|
|Dialogue.csv|      5|    Adventure Club|
|Dialogue.csv|      7|      questions do|
|Dialogue.csv|      7|            do you|
|Dialogue.csv|      7|         you think|
|Dialogue.csv|      7| think adventuring|
|Dialogue.csv|      7|adventuring around|
|Dialogue.csv|      7|        around the|
+------------+-------+------------

In [56]:
data_conc=(
    ("1","Mrs Garrison"),
    ("2","Kyle"),
    ("3","Mr Mackey"),
    ("4","Randy"),
    ("5","Stan"),
)
columns=["ConceptID","ConceptValue"]
df_koncepti=spark.createDataFrame(data=data_conc,schema=columns)
df_koncepti.show()

+---------+------------+
|ConceptID|ConceptValue|
+---------+------------+
|        1|Mrs Garrison|
|        2|        Kyle|
|        3|   Mr Mackey|
|        4|       Randy|
|        5|        Stan|
+---------+------------+



In [59]:
df_filter=(
    final
    .join(df_koncepti,F.col("Pairs")
          .contains(F.col("Conceptvalue")),"Inner")
)
df_filter.orderBy("Section").show(truncate=False)

+------------+-------+---------------+---------+------------+
|Filename    |Section|Pairs          |ConceptID|ConceptValue|
+------------+-------+---------------+---------+------------+
|Dialogue.csv|74     |rectum Kyle    |2        |Kyle        |
|Dialogue.csv|101    |asshole Kyle   |2        |Kyle        |
|Dialogue.csv|292    |ticket Kyle    |2        |Kyle        |
|Dialogue.csv|317    |on Kyle        |2        |Kyle        |
|Dialogue.csv|317    |Kyle I         |2        |Kyle        |
|Dialogue.csv|334    |for Kyle       |2        |Kyle        |
|Dialogue.csv|336    |know Kyle      |2        |Kyle        |
|Dialogue.csv|336    |Kyle s         |2        |Kyle        |
|Dialogue.csv|345    |invite Kyle    |2        |Kyle        |
|Dialogue.csv|350    |Goodbye Kyle   |2        |Kyle        |
|Dialogue.csv|358    |difference Stan|5        |Stan        |
|Dialogue.csv|358    |Stan Maybe     |5        |Stan        |
|Dialogue.csv|383    |getting Kyle   |2        |Kyle        |
|Dialogu

In [58]:
df_counts=df_filter.groupBy("ConceptValue").agg(F.count("*").alias("Freq"))
df_counts.show()

+------------+----+
|ConceptValue|Freq|
+------------+----+
|Mrs Garrison|   6|
|        Kyle|1692|
|   Mr Mackey|  32|
|       Randy| 152|
|        Stan|1225|
+------------+----+

